From 5617c6cd6f844eaa2f4d61f165b7e6664a658865 Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Mon, 9 May 2016 18:33:58 +0200 Subject: nl80211: Allow privileged operations from user namespaces While a wiphy can be transferred to network namespaces, a process having CAP_NET_ADMIN in a non-initial user namespace can not administrate such devices due to the genetlink GENL_ADMIN_PERM restrictions. For openvswitch having the same issue, a new GENL_UNS_ADMIN_PERM flag has been introduced, commit 4a92602aa1cd ("openvswitch: allow management from inside user namespaces"). This patch changes all privileged operations operating on a wiphy, dev or wdev to allow their administration using the same mechanism. All operations use either NEED_WIPHY, NEED_WDEV or NEED_NETDEV, which implies a namespace aware lookup of the device. The only exception is NL80211_CMD_SET_WIPHY, which explicitly uses a namespace aware phy lookup. Signed-off-by: Martin Willi [also allow cancel scan, for completeness] Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 152 ++++++++++++++++++++++++------------------------- 1 file changed, 76 insertions(+), 76 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d7599014055d..bf75afa18699 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10945,7 +10945,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_WIPHY, .doit = nl80211_set_wiphy, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_RTNL, }, { @@ -10961,7 +10961,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_INTERFACE, .doit = nl80211_set_interface, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, @@ -10969,7 +10969,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_NEW_INTERFACE, .doit = nl80211_new_interface, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, }, @@ -10977,7 +10977,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_DEL_INTERFACE, .doit = nl80211_del_interface, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV | NL80211_FLAG_NEED_RTNL, }, @@ -10985,7 +10985,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_GET_KEY, .doit = nl80211_get_key, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -10993,7 +10993,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_KEY, .doit = nl80211_set_key, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL | NL80211_FLAG_CLEAR_SKB, @@ -11002,7 +11002,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_NEW_KEY, .doit = nl80211_new_key, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL | NL80211_FLAG_CLEAR_SKB, @@ -11011,14 +11011,14 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_DEL_KEY, .doit = nl80211_del_key, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_BEACON, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .doit = nl80211_set_beacon, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, @@ -11026,7 +11026,7 @@ static const struct genl_ops nl80211_ops[] = { { .cmd = NL80211_CMD_START_AP, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .doit = nl80211_start_ap, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, @@ -11034,7 +11034,7 @@ static const struct genl_ops nl80211_ops[] = { { .cmd = NL80211_CMD_STOP_AP, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .doit = nl80211_stop_ap, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, @@ -11051,7 +11051,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_STATION, .doit = nl80211_set_station, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11059,7 +11059,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_NEW_STATION, .doit = nl80211_new_station, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11067,7 +11067,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_DEL_STATION, .doit = nl80211_del_station, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11076,7 +11076,7 @@ static const struct genl_ops nl80211_ops[] = { .doit = nl80211_get_mpath, .dumpit = nl80211_dump_mpath, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11085,7 +11085,7 @@ static const struct genl_ops nl80211_ops[] = { .doit = nl80211_get_mpp, .dumpit = nl80211_dump_mpp, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11093,7 +11093,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_MPATH, .doit = nl80211_set_mpath, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11101,7 +11101,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_NEW_MPATH, .doit = nl80211_new_mpath, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11109,7 +11109,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_DEL_MPATH, .doit = nl80211_del_mpath, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11117,7 +11117,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_BSS, .doit = nl80211_set_bss, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11156,7 +11156,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_MESH_CONFIG, .doit = nl80211_update_mesh_config, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11164,7 +11164,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_TRIGGER_SCAN, .doit = nl80211_trigger_scan, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11172,7 +11172,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_ABORT_SCAN, .doit = nl80211_abort_scan, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11185,7 +11185,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_START_SCHED_SCAN, .doit = nl80211_start_sched_scan, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11193,7 +11193,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_STOP_SCHED_SCAN, .doit = nl80211_stop_sched_scan, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11201,7 +11201,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_AUTHENTICATE, .doit = nl80211_authenticate, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL | NL80211_FLAG_CLEAR_SKB, @@ -11210,7 +11210,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_ASSOCIATE, .doit = nl80211_associate, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11218,7 +11218,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_DEAUTHENTICATE, .doit = nl80211_deauthenticate, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11226,7 +11226,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_DISASSOCIATE, .doit = nl80211_disassociate, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11234,7 +11234,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_JOIN_IBSS, .doit = nl80211_join_ibss, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11242,7 +11242,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_LEAVE_IBSS, .doit = nl80211_leave_ibss, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11252,7 +11252,7 @@ static const struct genl_ops nl80211_ops[] = { .doit = nl80211_testmode_do, .dumpit = nl80211_testmode_dump, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, }, @@ -11261,7 +11261,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_CONNECT, .doit = nl80211_connect, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11269,7 +11269,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_DISCONNECT, .doit = nl80211_disconnect, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11277,7 +11277,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_WIPHY_NETNS, .doit = nl80211_wiphy_netns, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, }, @@ -11290,7 +11290,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_PMKSA, .doit = nl80211_setdel_pmksa, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11298,7 +11298,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_DEL_PMKSA, .doit = nl80211_setdel_pmksa, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11306,7 +11306,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_FLUSH_PMKSA, .doit = nl80211_flush_pmksa, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11314,7 +11314,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_REMAIN_ON_CHANNEL, .doit = nl80211_remain_on_channel, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11322,7 +11322,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL, .doit = nl80211_cancel_remain_on_channel, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11330,7 +11330,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_TX_BITRATE_MASK, .doit = nl80211_set_tx_bitrate_mask, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, @@ -11338,7 +11338,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_REGISTER_FRAME, .doit = nl80211_register_mgmt, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV | NL80211_FLAG_NEED_RTNL, }, @@ -11346,7 +11346,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_FRAME, .doit = nl80211_tx_mgmt, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11354,7 +11354,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_FRAME_WAIT_CANCEL, .doit = nl80211_tx_mgmt_cancel_wait, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11362,7 +11362,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_POWER_SAVE, .doit = nl80211_set_power_save, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, @@ -11378,7 +11378,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_CQM, .doit = nl80211_set_cqm, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, @@ -11386,7 +11386,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_CHANNEL, .doit = nl80211_set_channel, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, @@ -11394,7 +11394,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_WDS_PEER, .doit = nl80211_set_wds_peer, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, @@ -11402,7 +11402,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_JOIN_MESH, .doit = nl80211_join_mesh, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11410,7 +11410,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_LEAVE_MESH, .doit = nl80211_leave_mesh, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11418,7 +11418,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_JOIN_OCB, .doit = nl80211_join_ocb, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11426,7 +11426,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_LEAVE_OCB, .doit = nl80211_leave_ocb, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11443,7 +11443,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_WOWLAN, .doit = nl80211_set_wowlan, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, }, @@ -11452,7 +11452,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_REKEY_OFFLOAD, .doit = nl80211_set_rekey_data, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL | NL80211_FLAG_CLEAR_SKB, @@ -11461,7 +11461,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_TDLS_MGMT, .doit = nl80211_tdls_mgmt, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11469,7 +11469,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_TDLS_OPER, .doit = nl80211_tdls_oper, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11477,7 +11477,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_UNEXPECTED_FRAME, .doit = nl80211_register_unexpected_frame, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, @@ -11485,7 +11485,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_PROBE_CLIENT, .doit = nl80211_probe_client, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11493,7 +11493,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_REGISTER_BEACONS, .doit = nl80211_register_beacons, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, }, @@ -11501,7 +11501,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_NOACK_MAP, .doit = nl80211_set_noack_map, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, @@ -11509,7 +11509,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_START_P2P_DEVICE, .doit = nl80211_start_p2p_device, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV | NL80211_FLAG_NEED_RTNL, }, @@ -11517,7 +11517,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_STOP_P2P_DEVICE, .doit = nl80211_stop_p2p_device, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11525,7 +11525,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_MCAST_RATE, .doit = nl80211_set_mcast_rate, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, @@ -11533,7 +11533,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_MAC_ACL, .doit = nl80211_set_mac_acl, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, @@ -11541,7 +11541,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_RADAR_DETECT, .doit = nl80211_start_radar_detection, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11554,7 +11554,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_UPDATE_FT_IES, .doit = nl80211_update_ft_ies, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11562,7 +11562,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_CRIT_PROTOCOL_START, .doit = nl80211_crit_protocol_start, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11570,7 +11570,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_CRIT_PROTOCOL_STOP, .doit = nl80211_crit_protocol_stop, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11585,7 +11585,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_COALESCE, .doit = nl80211_set_coalesce, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, }, @@ -11593,7 +11593,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_CHANNEL_SWITCH, .doit = nl80211_channel_switch, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11602,7 +11602,7 @@ static const struct genl_ops nl80211_ops[] = { .doit = nl80211_vendor_cmd, .dumpit = nl80211_vendor_cmd_dump, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, }, @@ -11610,7 +11610,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_SET_QOS_MAP, .doit = nl80211_set_qos_map, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11618,7 +11618,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_ADD_TX_TS, .doit = nl80211_add_tx_ts, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11626,7 +11626,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_DEL_TX_TS, .doit = nl80211_del_tx_ts, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11634,7 +11634,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_TDLS_CHANNEL_SWITCH, .doit = nl80211_tdls_channel_switch, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, @@ -11642,7 +11642,7 @@ static const struct genl_ops nl80211_ops[] = { .cmd = NL80211_CMD_TDLS_CANCEL_CHANNEL_SWITCH, .doit = nl80211_tdls_cancel_channel_switch, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, -- cgit v1.2.3 From 0bb7ed426373cebf406ff3ae7b6d2344f2f4364c Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Fri, 13 May 2016 11:29:33 -0700 Subject: mac80211: add vht cap decode to debugfs This makes it a lot easier to understand the capabilities used by the station: VHT supported cap: 0x300819b2 MAX-MPDU-11454 80Mhz RXLDPC SHORT-GI-80 TXSTBC RXSTBC_1 SU-BEAMFORMER-CAPABLE SU-BEAMFORMEE-CAPABLE BEAMFORMEE-STS: 0x0 SOUNDING-DIMENSIONS: 0x0 MU-BEAMFORMER-CAPABLE MPDU-LENGTH-EXPONENT: 0x0 LINK-ADAPTATION-VHT-MRQ-MFB: 0x0 RX-ANTENNA-PATTERN TX-ANTENNA-PATTERN RX MCS: fffe TX MCS: fffe Signed-off-by: Ben Greear Signed-off-by: Johannes Berg --- net/mac80211/debugfs_sta.c | 78 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 76 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 33dfcbc2bf9c..fd334133ff45 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -328,14 +328,88 @@ STA_OPS(ht_capa); static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { - char buf[128], *p = buf; + char buf[512], *p = buf; struct sta_info *sta = file->private_data; struct ieee80211_sta_vht_cap *vhtc = &sta->sta.vht_cap; p += scnprintf(p, sizeof(buf) + buf - p, "VHT %ssupported\n", vhtc->vht_supported ? "" : "not "); if (vhtc->vht_supported) { - p += scnprintf(p, sizeof(buf)+buf-p, "cap: %#.8x\n", vhtc->cap); + p += scnprintf(p, sizeof(buf) + buf - p, "cap: %#.8x\n", + vhtc->cap); +#define PFLAG(a, b) \ + do { \ + if (vhtc->cap & IEEE80211_VHT_CAP_ ## a) \ + p += scnprintf(p, sizeof(buf) + buf - p, \ + "\t\t%s\n", b); \ + } while (0) + + switch (vhtc->cap & 0x3) { + case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895: + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\tMAX-MPDU-3895\n"); + break; + case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991: + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\tMAX-MPDU-7991\n"); + break; + case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454: + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\tMAX-MPDU-11454\n"); + break; + default: + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\tMAX-MPDU-UNKNOWN\n"); + }; + switch (vhtc->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) { + case 0: + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\t80Mhz\n"); + break; + case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ: + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\t160Mhz\n"); + break; + case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ: + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\t80+80Mhz\n"); + break; + default: + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\tUNKNOWN-MHZ: 0x%x\n", + (vhtc->cap >> 2) & 0x3); + }; + PFLAG(RXLDPC, "RXLDPC"); + PFLAG(SHORT_GI_80, "SHORT-GI-80"); + PFLAG(SHORT_GI_160, "SHORT-GI-160"); + PFLAG(TXSTBC, "TXSTBC"); + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\tRXSTBC_%d\n", (vhtc->cap >> 8) & 0x7); + PFLAG(SU_BEAMFORMER_CAPABLE, "SU-BEAMFORMER-CAPABLE"); + PFLAG(SU_BEAMFORMEE_CAPABLE, "SU-BEAMFORMEE-CAPABLE"); + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\tBEAMFORMEE-STS: 0x%x\n", + (vhtc->cap & IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK) >> + IEEE80211_VHT_CAP_BEAMFORMEE_STS_SHIFT); + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\tSOUNDING-DIMENSIONS: 0x%x\n", + (vhtc->cap & IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_MASK) + >> IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_SHIFT); + PFLAG(MU_BEAMFORMER_CAPABLE, "MU-BEAMFORMER-CAPABLE"); + PFLAG(MU_BEAMFORMEE_CAPABLE, "MU-BEAMFORMEE-CAPABLE"); + PFLAG(VHT_TXOP_PS, "TXOP-PS"); + PFLAG(HTC_VHT, "HTC-VHT"); + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\tMPDU-LENGTH-EXPONENT: 0x%x\n", + (vhtc->cap & IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK) >> + IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT); + PFLAG(VHT_LINK_ADAPTATION_VHT_UNSOL_MFB, + "LINK-ADAPTATION-VHT-UNSOL-MFB"); + p += scnprintf(p, sizeof(buf) + buf - p, + "\t\tLINK-ADAPTATION-VHT-MRQ-MFB: 0x%x\n", + (vhtc->cap & IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_MRQ_MFB) >> 26); + PFLAG(RX_ANTENNA_PATTERN, "RX-ANTENNA-PATTERN"); + PFLAG(TX_ANTENNA_PATTERN, "TX-ANTENNA-PATTERN"); p += scnprintf(p, sizeof(buf)+buf-p, "RX MCS: %.4x\n", le16_to_cpu(vhtc->vht_mcs.rx_mcs_map)); -- cgit v1.2.3 From bf1ecd210541ef5f3a110e88e8ca5d33b4aa5c23 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Tue, 31 May 2016 00:16:50 +0300 Subject: cfg80211: Allow cfg80211_connect_result() errors to be distinguished Previously, the status parameter to cfg80211_connect_result() was documented as using WLAN_STATUS_UNSPECIFIED_FAILURE (1) when the real status code for the failure is not known. This value can be used by an AP (and often is) and as such, user space cannot distinguish between explicitly rejected authentication/association and not being able to even try to associate or not receiving a response from the AP. Add a new inline function, cfg80211_connect_timeout(), to be used when the driver knows that the connection attempt failed due to a reason where connection could not be attempt or no response was received from the AP. The internal functions now allow a negative status value (-1) to be used as an indication of this special case. This results in the NL80211_ATTR_TIMED_OUT to be added to the NL80211_CMD_CONNECT event to allow user space to determine this case was hit. For backwards compatibility, NL80211_STATUS_CODE with the value WLAN_STATUS_UNSPECIFIED_FAILURE is still indicated in the event in such a case. Signed-off-by: Jouni Malinen [johannes: fix cfg80211_connect_bss() prototype to use int for status, add cfg80211_connect_timeout() to docbook, fix docbook] Signed-off-by: Johannes Berg --- net/wireless/core.h | 4 ++-- net/wireless/nl80211.c | 7 +++++-- net/wireless/nl80211.h | 2 +- net/wireless/sme.c | 8 +++----- 4 files changed, 11 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/wireless/core.h b/net/wireless/core.h index 025b7a5d508b..a4d547f99f8d 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -214,7 +214,7 @@ struct cfg80211_event { size_t req_ie_len; size_t resp_ie_len; struct cfg80211_bss *bss; - u16 status; + int status; /* -1 = failed; 0..65535 = status code */ } cr; struct { const u8 *req_ie; @@ -374,7 +374,7 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev, void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, size_t resp_ie_len, - u16 status, bool wextev, + int status, bool wextev, struct cfg80211_bss *bss); void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, size_t ie_len, u16 reason, bool from_ap); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index bf75afa18699..03ac2ba8b174 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -12092,7 +12092,7 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, size_t resp_ie_len, - u16 status, gfp_t gfp) + int status, gfp_t gfp) { struct sk_buff *msg; void *hdr; @@ -12110,7 +12110,10 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) || (bssid && nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid)) || - nla_put_u16(msg, NL80211_ATTR_STATUS_CODE, status) || + nla_put_u16(msg, NL80211_ATTR_STATUS_CODE, + status < 0 ? WLAN_STATUS_UNSPECIFIED_FAILURE : + status) || + (status < 0 && nla_put_flag(msg, NL80211_ATTR_TIMED_OUT)) || (req_ie && nla_put(msg, NL80211_ATTR_REQ_IE, req_ie_len, req_ie)) || (resp_ie && diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 84d4edf1d545..a63f402b10b7 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -55,7 +55,7 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, size_t resp_ie_len, - u16 status, gfp_t gfp); + int status, gfp_t gfp); void nl80211_send_roamed(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 584fdc347221..add6824c44fd 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -244,9 +244,7 @@ void cfg80211_conn_work(struct work_struct *work) if (cfg80211_conn_do_work(wdev)) { __cfg80211_connect_result( wdev->netdev, bssid, - NULL, 0, NULL, 0, - WLAN_STATUS_UNSPECIFIED_FAILURE, - false, NULL); + NULL, 0, NULL, 0, -1, false, NULL); } wdev_unlock(wdev); } @@ -648,7 +646,7 @@ static DECLARE_WORK(cfg80211_disconnect_work, disconnect_work); void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, size_t resp_ie_len, - u16 status, bool wextev, + int status, bool wextev, struct cfg80211_bss *bss) { struct wireless_dev *wdev = dev->ieee80211_ptr; @@ -757,7 +755,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid, struct cfg80211_bss *bss, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, - size_t resp_ie_len, u16 status, gfp_t gfp) + size_t resp_ie_len, int status, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); -- cgit v1.2.3 From 019ae3a918811715192b22c400ac78d54acc26a9 Mon Sep 17 00:00:00 2001 From: "Kanchanapally, Vidyullatha" Date: Mon, 16 May 2016 10:41:04 +0530 Subject: cfg80211: Advertise extended capabilities per interface type to userspace The driver extended capabilities may differ for different interface types which the userspace needs to know (for example the fine timing measurement initiator and responder bits might differ for a station and AP). Add a new nl80211 attribute to provide extended capabilities per interface type to userspace. Signed-off-by: Vidyullatha Kanchanapally Reviewed-by: Jouni Malinen Signed-off-by: Johannes Berg --- net/wireless/core.c | 30 ++++++++++++++++++++++++++++++ net/wireless/nl80211.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 72 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index d25c82bc1bbe..b8e10a952111 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -750,6 +750,36 @@ int wiphy_register(struct wiphy *wiphy) nl80211_send_reg_change_event(&request); } + /* Check that nobody globally advertises any capabilities they do not + * advertise on all possible interface types. + */ + if (wiphy->extended_capabilities_len && + wiphy->num_iftype_ext_capab && + wiphy->iftype_ext_capab) { + u8 supported_on_all, j; + const struct wiphy_iftype_ext_capab *capab; + + capab = wiphy->iftype_ext_capab; + for (j = 0; j < wiphy->extended_capabilities_len; j++) { + if (capab[0].extended_capabilities_len > j) + supported_on_all = + capab[0].extended_capabilities[j]; + else + supported_on_all = 0x00; + for (i = 1; i < wiphy->num_iftype_ext_capab; i++) { + if (j >= capab[i].extended_capabilities_len) { + supported_on_all = 0x00; + break; + } + supported_on_all &= + capab[i].extended_capabilities[j]; + } + if (WARN_ON(wiphy->extended_capabilities[j] & + ~supported_on_all)) + break; + } + } + rdev->wiphy.registered = true; rtnl_unlock(); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 03ac2ba8b174..d12044996a0e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1264,7 +1264,7 @@ nl80211_send_mgmt_stypes(struct sk_buff *msg, struct nl80211_dump_wiphy_state { s64 filter_wiphy; long start; - long split_start, band_start, chan_start; + long split_start, band_start, chan_start, capa_start; bool split; }; @@ -1761,6 +1761,47 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, nla_nest_end(msg, nested); } + state->split_start++; + break; + case 13: + if (rdev->wiphy.num_iftype_ext_capab && + rdev->wiphy.iftype_ext_capab) { + struct nlattr *nested_ext_capab, *nested; + + nested = nla_nest_start(msg, + NL80211_ATTR_IFTYPE_EXT_CAPA); + if (!nested) + goto nla_put_failure; + + for (i = state->capa_start; + i < rdev->wiphy.num_iftype_ext_capab; i++) { + const struct wiphy_iftype_ext_capab *capab; + + capab = &rdev->wiphy.iftype_ext_capab[i]; + + nested_ext_capab = nla_nest_start(msg, i); + if (!nested_ext_capab || + nla_put_u32(msg, NL80211_ATTR_IFTYPE, + capab->iftype) || + nla_put(msg, NL80211_ATTR_EXT_CAPA, + capab->extended_capabilities_len, + capab->extended_capabilities) || + nla_put(msg, NL80211_ATTR_EXT_CAPA_MASK, + capab->extended_capabilities_len, + capab->extended_capabilities_mask)) + goto nla_put_failure; + + nla_nest_end(msg, nested_ext_capab); + if (state->split) + break; + } + nla_nest_end(msg, nested); + if (i < rdev->wiphy.num_iftype_ext_capab) { + state->capa_start = i + 1; + break; + } + } + /* done */ state->split_start = 0; break; -- cgit v1.2.3 From 684ff4ef5edd758c47929b852b4ea79be56f8bc0 Mon Sep 17 00:00:00 2001 From: Zhang Shengju Date: Tue, 31 May 2016 13:41:02 +0000 Subject: ovs: set name assign type of internal port Set name_assign_type of internal port to NET_NAME_USER. Signed-off-by: Zhang Shengju Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/vport-internal_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 2ee48e447b72..434e04c3a189 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -195,7 +195,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) } vport->dev = alloc_netdev(sizeof(struct internal_dev), - parms->name, NET_NAME_UNKNOWN, do_setup); + parms->name, NET_NAME_USER, do_setup); if (!vport->dev) { err = -ENOMEM; goto error_free_vport; -- cgit v1.2.3 From 57c05650394b384605f5183747991d19ee543759 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 2 Jun 2016 15:05:39 -0300 Subject: skbuff: export skb_gro_receive sctp GSO requires it and sctp can be compiled as a module, so we need to export this function. Signed-off-by: Marcelo Ricardo Leitner Tested-by: Xin Long Signed-off-by: David S. Miller --- net/core/skbuff.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f2b77e549c03..4724bcf9b0ca 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3438,6 +3438,7 @@ done: NAPI_GRO_CB(skb)->same_flow = 1; return 0; } +EXPORT_SYMBOL_GPL(skb_gro_receive); void __init skb_init(void) { -- cgit v1.2.3 From 3953c46c3ac7eef31a9935427371c6f54a22f1ba Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 2 Jun 2016 15:05:40 -0300 Subject: sk_buff: allow segmenting based on frag sizes This patch allows segmenting a skb based on its frags sizes instead of based on a fixed value. Signed-off-by: Marcelo Ricardo Leitner Tested-by: Xin Long Signed-off-by: David S. Miller --- net/core/skbuff.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4724bcf9b0ca..97c32c75e704 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3116,9 +3116,13 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, int hsize; int size; - len = head_skb->len - offset; - if (len > mss) - len = mss; + if (unlikely(mss == GSO_BY_FRAGS)) { + len = list_skb->len; + } else { + len = head_skb->len - offset; + if (len > mss) + len = mss; + } hsize = skb_headlen(head_skb) - offset; if (hsize < 0) -- cgit v1.2.3 From ae7ef81ef000adeee7a87585b9135ff8a8064acc Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 2 Jun 2016 15:05:41 -0300 Subject: skbuff: introduce skb_gso_validate_mtu skb_gso_network_seglen is not enough for checking fragment sizes if skb is using GSO_BY_FRAGS as we have to check frag per frag. This patch introduces skb_gso_validate_mtu, based on the former, which will wrap the use case inside it as all calls to skb_gso_network_seglen were to validate if it fits on a given TMU, and improve the check. Signed-off-by: Marcelo Ricardo Leitner Tested-by: Xin Long Signed-off-by: David S. Miller --- net/core/skbuff.c | 31 +++++++++++++++++++++++++++++++ net/ipv4/ip_forward.c | 2 +- net/ipv4/ip_output.c | 2 +- net/ipv6/ip6_output.c | 2 +- net/mpls/af_mpls.c | 2 +- 5 files changed, 35 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 97c32c75e704..5ca562b56ec3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4392,6 +4392,37 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) } EXPORT_SYMBOL_GPL(skb_gso_transport_seglen); +/** + * skb_gso_validate_mtu - Return in case such skb fits a given MTU + * + * @skb: GSO skb + * + * skb_gso_validate_mtu validates if a given skb will fit a wanted MTU + * once split. + */ +bool skb_gso_validate_mtu(const struct sk_buff *skb, unsigned int mtu) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + const struct sk_buff *iter; + unsigned int hlen; + + hlen = skb_gso_network_seglen(skb); + + if (shinfo->gso_size != GSO_BY_FRAGS) + return hlen <= mtu; + + /* Undo this so we can re-use header sizes */ + hlen -= GSO_BY_FRAGS; + + skb_walk_frags(skb, iter) { + if (hlen + skb_headlen(iter) > mtu) + return false; + } + + return true; +} +EXPORT_SYMBOL_GPL(skb_gso_validate_mtu); + static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) { if (skb_cow(skb, skb_headroom(skb)) < 0) { diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index cbfb1808fcc4..9f0a7b96646f 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -54,7 +54,7 @@ static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) if (skb->ignore_df) return false; - if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) return false; return true; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 124bf0a66328..cbac493c913a 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -225,7 +225,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, /* common case: locally created skb or seglen is <= mtu */ if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || - skb_gso_network_seglen(skb) <= mtu) + skb_gso_validate_mtu(skb, mtu)) return ip_finish_output2(net, sk, skb); /* Slowpath - GSO segment length is exceeding the dst MTU. diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index cbf127ae7c67..6b2f60a5c1de 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -368,7 +368,7 @@ static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) if (skb->ignore_df) return false; - if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) return false; return true; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 0b80a7140cc4..7a4aa3450dd7 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -91,7 +91,7 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) if (skb->len <= mtu) return false; - if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) return false; return true; -- cgit v1.2.3 From 3acb50c18d8d6650f10919464ade4dcdaf41d62f Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 2 Jun 2016 15:05:42 -0300 Subject: sctp: delay as much as possible skb_linearize This patch is a preparation for the GSO one. In order to successfully handle GSO packets on rx path we must not call skb_linearize, otherwise it defeats any gain GSO may have had. This patch thus delays as much as possible the call to skb_linearize, leaving it to sctp_inq_pop() moment. For that the sanity checks performed now know how to deal with fragments. One positive side-effect of this is that if the socket is backlogged it will have the chance of doing it on backlog processing instead of during softirq. With this move, it's evident that a check for non-linearity in sctp_inq_pop was ineffective and is now removed. Note that a similar check is performed a bit below this one. Signed-off-by: Marcelo Ricardo Leitner Tested-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/input.c | 45 +++++++++++++++++++++++++-------------------- net/sctp/inqueue.c | 29 ++++++++++++++++++----------- 2 files changed, 43 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index a701527a9480..5cff2546c3dd 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -112,7 +112,6 @@ int sctp_rcv(struct sk_buff *skb) struct sctp_ep_common *rcvr; struct sctp_transport *transport = NULL; struct sctp_chunk *chunk; - struct sctphdr *sh; union sctp_addr src; union sctp_addr dest; int family; @@ -124,15 +123,18 @@ int sctp_rcv(struct sk_buff *skb) __SCTP_INC_STATS(net, SCTP_MIB_INSCTPPACKS); - if (skb_linearize(skb)) + /* If packet is too small to contain a single chunk, let's not + * waste time on it anymore. + */ + if (skb->len < sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + + skb_transport_offset(skb)) goto discard_it; - sh = sctp_hdr(skb); + if (!pskb_may_pull(skb, sizeof(struct sctphdr))) + goto discard_it; - /* Pull up the IP and SCTP headers. */ + /* Pull up the IP header. */ __skb_pull(skb, skb_transport_offset(skb)); - if (skb->len < sizeof(struct sctphdr)) - goto discard_it; skb->csum_valid = 0; /* Previous value not applicable */ if (skb_csum_unnecessary(skb)) @@ -141,11 +143,7 @@ int sctp_rcv(struct sk_buff *skb) goto discard_it; skb->csum_valid = 1; - skb_pull(skb, sizeof(struct sctphdr)); - - /* Make sure we at least have chunk headers worth of data left. */ - if (skb->len < sizeof(struct sctp_chunkhdr)) - goto discard_it; + __skb_pull(skb, sizeof(struct sctphdr)); family = ipver2af(ip_hdr(skb)->version); af = sctp_get_af_specific(family); @@ -230,7 +228,7 @@ int sctp_rcv(struct sk_buff *skb) chunk->rcvr = rcvr; /* Remember the SCTP header. */ - chunk->sctp_hdr = sh; + chunk->sctp_hdr = sctp_hdr(skb); /* Set the source and destination addresses of the incoming chunk. */ sctp_init_addrs(chunk, &src, &dest); @@ -660,19 +658,23 @@ out_unlock: */ static int sctp_rcv_ootb(struct sk_buff *skb) { - sctp_chunkhdr_t *ch; - __u8 *ch_end; - - ch = (sctp_chunkhdr_t *) skb->data; + sctp_chunkhdr_t *ch, _ch; + int ch_end, offset = 0; /* Scan through all the chunks in the packet. */ do { + /* Make sure we have at least the header there */ + if (offset + sizeof(sctp_chunkhdr_t) > skb->len) + break; + + ch = skb_header_pointer(skb, offset, sizeof(*ch), &_ch); + /* Break out if chunk length is less then minimal. */ if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t)) break; - ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); - if (ch_end > skb_tail_pointer(skb)) + ch_end = offset + WORD_ROUND(ntohs(ch->length)); + if (ch_end > skb->len) break; /* RFC 8.4, 2) If the OOTB packet contains an ABORT chunk, the @@ -697,8 +699,8 @@ static int sctp_rcv_ootb(struct sk_buff *skb) if (SCTP_CID_INIT == ch->type && (void *)ch != skb->data) goto discard; - ch = (sctp_chunkhdr_t *) ch_end; - } while (ch_end < skb_tail_pointer(skb)); + offset = ch_end; + } while (ch_end < skb->len); return 0; @@ -1173,6 +1175,9 @@ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net, { sctp_chunkhdr_t *ch; + if (skb_linearize(skb)) + return NULL; + ch = (sctp_chunkhdr_t *) skb->data; /* The code below will attempt to walk the chunk and extract diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 9d87bba0ff1d..5ba08ceda3ab 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -130,7 +130,8 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) * at this time. */ - if ((chunk = queue->in_progress)) { + chunk = queue->in_progress; + if (chunk) { /* There is a packet that we have been working on. * Any post processing work to do before we move on? */ @@ -152,15 +153,29 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) if (!chunk) { struct list_head *entry; +next_chunk: /* Is the queue empty? */ if (list_empty(&queue->in_chunk_list)) return NULL; entry = queue->in_chunk_list.next; - chunk = queue->in_progress = - list_entry(entry, struct sctp_chunk, list); + chunk = list_entry(entry, struct sctp_chunk, list); list_del_init(entry); + /* Linearize if it's not GSO */ + if (skb_is_nonlinear(chunk->skb)) { + if (skb_linearize(chunk->skb)) { + __SCTP_INC_STATS(dev_net(chunk->skb->dev), SCTP_MIB_IN_PKT_DISCARDS); + sctp_chunk_free(chunk); + goto next_chunk; + } + + /* Update sctp_hdr as it probably changed */ + chunk->sctp_hdr = sctp_hdr(chunk->skb); + } + + queue->in_progress = chunk; + /* This is the first chunk in the packet. */ chunk->singleton = 1; ch = (sctp_chunkhdr_t *) chunk->skb->data; @@ -172,14 +187,6 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) chunk->chunk_hdr = ch; chunk->chunk_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); - /* In the unlikely case of an IP reassembly, the skb could be - * non-linear. If so, update chunk_end so that it doesn't go past - * the skb->tail. - */ - if (unlikely(skb_is_nonlinear(chunk->skb))) { - if (chunk->chunk_end > skb_tail_pointer(chunk->skb)) - chunk->chunk_end = skb_tail_pointer(chunk->skb); - } skb_pull(chunk->skb, sizeof(sctp_chunkhdr_t)); chunk->subh.v = NULL; /* Subheader is no longer valid. */ -- cgit v1.2.3 From 90017accff61ae89283ad9a51f9ac46ca01633fb Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 2 Jun 2016 15:05:43 -0300 Subject: sctp: Add GSO support SCTP has this pecualiarity that its packets cannot be just segmented to (P)MTU. Its chunks must be contained in IP segments, padding respected. So we can't just generate a big skb, set gso_size to the fragmentation point and deliver it to IP layer. This patch takes a different approach. SCTP will now build a skb as it would be if it was received using GRO. That is, there will be a cover skb with protocol headers and children ones containing the actual segments, already segmented to a way that respects SCTP RFCs. With that, we can tell skb_segment() to just split based on frag_list, trusting its sizes are already in accordance. This way SCTP can benefit from GSO and instead of passing several packets through the stack, it can pass a single large packet. v2: - Added support for receiving GSO frames, as requested by Dave Miller. - Clear skb->cb if packet is GSO (otherwise it's not used by SCTP) - Added heuristics similar to what we have in TCP for not generating single GSO packets that fills cwnd. v3: - consider sctphdr size in skb_gso_transport_seglen() - rebased due to 5c7cdf339af5 ("gso: Remove arbitrary checks for unsupported GSO") Signed-off-by: Marcelo Ricardo Leitner Tested-by: Xin Long Signed-off-by: David S. Miller --- net/core/ethtool.c | 1 + net/core/skbuff.c | 3 + net/sctp/Makefile | 3 +- net/sctp/input.c | 12 +- net/sctp/inqueue.c | 51 ++++++-- net/sctp/offload.c | 98 ++++++++++++++ net/sctp/output.c | 363 +++++++++++++++++++++++++++++++++++----------------- net/sctp/protocol.c | 3 + net/sctp/socket.c | 2 + 9 files changed, 412 insertions(+), 124 deletions(-) create mode 100644 net/sctp/offload.c (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index f4034817d255..977489820eb9 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -89,6 +89,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation", [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial", + [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5ca562b56ec3..b6e0f95bef36 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #ifdef CONFIG_NET_CLS_ACT #include @@ -4383,6 +4384,8 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) thlen += inner_tcp_hdrlen(skb); } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { thlen = tcp_hdrlen(skb); + } else if (unlikely(shinfo->gso_type & SKB_GSO_SCTP)) { + thlen = sizeof(struct sctphdr); } /* UFO sets gso_size to the size of the fragmentation * payload, i.e. the size of the L4 (UDP) header is already diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 0fca5824ad0e..6c4f7496cec6 100644 --- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -11,7 +11,8 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ transport.o chunk.o sm_make_chunk.o ulpevent.o \ inqueue.o outqueue.o ulpqueue.o \ tsnmap.o bind_addr.o socket.o primitive.o \ - output.o input.o debug.o ssnmap.o auth.o + output.o input.o debug.o ssnmap.o auth.o \ + offload.o sctp_probe-y := probe.o diff --git a/net/sctp/input.c b/net/sctp/input.c index 5cff2546c3dd..6f8e676d285e 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -139,7 +139,9 @@ int sctp_rcv(struct sk_buff *skb) skb->csum_valid = 0; /* Previous value not applicable */ if (skb_csum_unnecessary(skb)) __skb_decr_checksum_unnecessary(skb); - else if (!sctp_checksum_disable && sctp_rcv_checksum(net, skb) < 0) + else if (!sctp_checksum_disable && + !(skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) && + sctp_rcv_checksum(net, skb) < 0) goto discard_it; skb->csum_valid = 1; @@ -1175,6 +1177,14 @@ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net, { sctp_chunkhdr_t *ch; + /* We do not allow GSO frames here as we need to linearize and + * then cannot guarantee frame boundaries. This shouldn't be an + * issue as packets hitting this are mostly INIT or INIT-ACK and + * those cannot be on GSO-style anyway. + */ + if ((skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) == SKB_GSO_SCTP) + return NULL; + if (skb_linearize(skb)) return NULL; diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 5ba08ceda3ab..edabbbdfca54 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -138,6 +138,17 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) if (chunk->singleton || chunk->end_of_packet || chunk->pdiscard) { + if (chunk->head_skb == chunk->skb) { + chunk->skb = skb_shinfo(chunk->skb)->frag_list; + goto new_skb; + } + if (chunk->skb->next) { + chunk->skb = chunk->skb->next; + goto new_skb; + } + + if (chunk->head_skb) + chunk->skb = chunk->head_skb; sctp_chunk_free(chunk); chunk = queue->in_progress = NULL; } else { @@ -155,15 +166,15 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) next_chunk: /* Is the queue empty? */ - if (list_empty(&queue->in_chunk_list)) + entry = sctp_list_dequeue(&queue->in_chunk_list); + if (!entry) return NULL; - entry = queue->in_chunk_list.next; chunk = list_entry(entry, struct sctp_chunk, list); - list_del_init(entry); /* Linearize if it's not GSO */ - if (skb_is_nonlinear(chunk->skb)) { + if ((skb_shinfo(chunk->skb)->gso_type & SKB_GSO_SCTP) != SKB_GSO_SCTP && + skb_is_nonlinear(chunk->skb)) { if (skb_linearize(chunk->skb)) { __SCTP_INC_STATS(dev_net(chunk->skb->dev), SCTP_MIB_IN_PKT_DISCARDS); sctp_chunk_free(chunk); @@ -174,15 +185,39 @@ next_chunk: chunk->sctp_hdr = sctp_hdr(chunk->skb); } + if ((skb_shinfo(chunk->skb)->gso_type & SKB_GSO_SCTP) == SKB_GSO_SCTP) { + /* GSO-marked skbs but without frags, handle + * them normally + */ + if (skb_shinfo(chunk->skb)->frag_list) + chunk->head_skb = chunk->skb; + + /* skbs with "cover letter" */ + if (chunk->head_skb && chunk->skb->data_len == chunk->skb->len) + chunk->skb = skb_shinfo(chunk->skb)->frag_list; + + if (WARN_ON(!chunk->skb)) { + __SCTP_INC_STATS(dev_net(chunk->skb->dev), SCTP_MIB_IN_PKT_DISCARDS); + sctp_chunk_free(chunk); + goto next_chunk; + } + } + + if (chunk->asoc) + sock_rps_save_rxhash(chunk->asoc->base.sk, chunk->skb); + queue->in_progress = chunk; +new_skb: /* This is the first chunk in the packet. */ - chunk->singleton = 1; ch = (sctp_chunkhdr_t *) chunk->skb->data; + chunk->singleton = 1; chunk->data_accepted = 0; - - if (chunk->asoc) - sock_rps_save_rxhash(chunk->asoc->base.sk, chunk->skb); + chunk->pdiscard = 0; + chunk->auth = 0; + chunk->has_asconf = 0; + chunk->end_of_packet = 0; + chunk->ecn_ce_done = 0; } chunk->chunk_hdr = ch; diff --git a/net/sctp/offload.c b/net/sctp/offload.c new file mode 100644 index 000000000000..a37887b373a7 --- /dev/null +++ b/net/sctp/offload.c @@ -0,0 +1,98 @@ +/* + * sctp_offload - GRO/GSO Offloading for SCTP + * + * Copyright (C) 2015, Marcelo Ricardo Leitner + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static __le32 sctp_gso_make_checksum(struct sk_buff *skb) +{ + skb->ip_summed = CHECKSUM_NONE; + return sctp_compute_cksum(skb, skb_transport_offset(skb)); +} + +static struct sk_buff *sctp_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct sctphdr *sh; + + sh = sctp_hdr(skb); + if (!pskb_may_pull(skb, sizeof(*sh))) + goto out; + + __skb_pull(skb, sizeof(*sh)); + + if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { + /* Packet is from an untrusted source, reset gso_segs. */ + struct skb_shared_info *pinfo = skb_shinfo(skb); + struct sk_buff *frag_iter; + + pinfo->gso_segs = 0; + if (skb->len != skb->data_len) { + /* Means we have chunks in here too */ + pinfo->gso_segs++; + } + + skb_walk_frags(skb, frag_iter) + pinfo->gso_segs++; + + segs = NULL; + goto out; + } + + segs = skb_segment(skb, features | NETIF_F_HW_CSUM); + if (IS_ERR(segs)) + goto out; + + /* All that is left is update SCTP CRC if necessary */ + if (!(features & NETIF_F_SCTP_CRC)) { + for (skb = segs; skb; skb = skb->next) { + if (skb->ip_summed == CHECKSUM_PARTIAL) { + sh = sctp_hdr(skb); + sh->checksum = sctp_gso_make_checksum(skb); + } + } + } + +out: + return segs; +} + +static const struct net_offload sctp_offload = { + .callbacks = { + .gso_segment = sctp_gso_segment, + }, +}; + +int __init sctp_offload_init(void) +{ + return inet_add_offload(&sctp_offload, IPPROTO_SCTP); +} diff --git a/net/sctp/output.c b/net/sctp/output.c index 9844fe573029..60499a69179d 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -84,18 +84,42 @@ static void sctp_packet_reset(struct sctp_packet *packet) struct sctp_packet *sctp_packet_config(struct sctp_packet *packet, __u32 vtag, int ecn_capable) { - struct sctp_chunk *chunk = NULL; + struct sctp_transport *tp = packet->transport; + struct sctp_association *asoc = tp->asoc; pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag); packet->vtag = vtag; + if (asoc && tp->dst) { + struct sock *sk = asoc->base.sk; + + rcu_read_lock(); + if (__sk_dst_get(sk) != tp->dst) { + dst_hold(tp->dst); + sk_setup_caps(sk, tp->dst); + } + + if (sk_can_gso(sk)) { + struct net_device *dev = tp->dst->dev; + + packet->max_size = dev->gso_max_size; + } else { + packet->max_size = asoc->pathmtu; + } + rcu_read_unlock(); + + } else { + packet->max_size = tp->pathmtu; + } + if (ecn_capable && sctp_packet_empty(packet)) { - chunk = sctp_get_ecne_prepend(packet->transport->asoc); + struct sctp_chunk *chunk; /* If there a is a prepend chunk stick it on the list before * any other chunks get appended. */ + chunk = sctp_get_ecne_prepend(asoc); if (chunk) sctp_packet_append_chunk(packet, chunk); } @@ -381,12 +405,15 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) struct sctp_transport *tp = packet->transport; struct sctp_association *asoc = tp->asoc; struct sctphdr *sh; - struct sk_buff *nskb; + struct sk_buff *nskb = NULL, *head = NULL; struct sctp_chunk *chunk, *tmp; struct sock *sk; int err = 0; int padding; /* How much padding do we need? */ + int pkt_size; __u8 has_data = 0; + int gso = 0; + int pktcount = 0; struct dst_entry *dst; unsigned char *auth = NULL; /* pointer to auth in skb data */ @@ -400,18 +427,37 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list); sk = chunk->skb->sk; - /* Allocate the new skb. */ - nskb = alloc_skb(packet->size + MAX_HEADER, gfp); - if (!nskb) + /* Allocate the head skb, or main one if not in GSO */ + if (packet->size > tp->pathmtu && !packet->ipfragok) { + if (sk_can_gso(sk)) { + gso = 1; + pkt_size = packet->overhead; + } else { + /* If this happens, we trash this packet and try + * to build a new one, hopefully correct this + * time. Application may notice this error. + */ + pr_err_once("Trying to GSO but underlying device doesn't support it."); + goto nomem; + } + } else { + pkt_size = packet->size; + } + head = alloc_skb(pkt_size + MAX_HEADER, gfp); + if (!head) goto nomem; + if (gso) { + NAPI_GRO_CB(head)->last = head; + skb_shinfo(head)->gso_type = sk->sk_gso_type; + } /* Make sure the outbound skb has enough header room reserved. */ - skb_reserve(nskb, packet->overhead + MAX_HEADER); + skb_reserve(head, packet->overhead + MAX_HEADER); /* Set the owning socket so that we know where to get the * destination IP address. */ - sctp_packet_set_owner_w(nskb, sk); + sctp_packet_set_owner_w(head, sk); if (!sctp_transport_dst_check(tp)) { sctp_transport_route(tp, NULL, sctp_sk(sk)); @@ -422,11 +468,11 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) dst = dst_clone(tp->dst); if (!dst) goto no_route; - skb_dst_set(nskb, dst); + skb_dst_set(head, dst); /* Build the SCTP header. */ - sh = (struct sctphdr *)skb_push(nskb, sizeof(struct sctphdr)); - skb_reset_transport_header(nskb); + sh = (struct sctphdr *)skb_push(head, sizeof(struct sctphdr)); + skb_reset_transport_header(head); sh->source = htons(packet->source_port); sh->dest = htons(packet->destination_port); @@ -441,90 +487,133 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) sh->vtag = htonl(packet->vtag); sh->checksum = 0; - /** - * 6.10 Bundling - * - * An endpoint bundles chunks by simply including multiple - * chunks in one outbound SCTP packet. ... - */ - - /** - * 3.2 Chunk Field Descriptions - * - * The total length of a chunk (including Type, Length and - * Value fields) MUST be a multiple of 4 bytes. If the length - * of the chunk is not a multiple of 4 bytes, the sender MUST - * pad the chunk with all zero bytes and this padding is not - * included in the chunk length field. The sender should - * never pad with more than 3 bytes. - * - * [This whole comment explains WORD_ROUND() below.] - */ - pr_debug("***sctp_transmit_packet***\n"); - list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { - list_del_init(&chunk->list); - if (sctp_chunk_is_data(chunk)) { - /* 6.3.1 C4) When data is in flight and when allowed - * by rule C5, a new RTT measurement MUST be made each - * round trip. Furthermore, new RTT measurements - * SHOULD be made no more than once per round-trip - * for a given destination transport address. - */ + do { + /* Set up convenience variables... */ + chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list); + pktcount++; - if (!chunk->resent && !tp->rto_pending) { - chunk->rtt_in_progress = 1; - tp->rto_pending = 1; + /* Calculate packet size, so it fits in PMTU. Leave + * other chunks for the next packets. + */ + if (gso) { + pkt_size = packet->overhead; + list_for_each_entry(chunk, &packet->chunk_list, list) { + int padded = WORD_ROUND(chunk->skb->len); + + if (pkt_size + padded > tp->pathmtu) + break; + pkt_size += padded; } - has_data = 1; - } + /* Allocate a new skb. */ + nskb = alloc_skb(pkt_size + MAX_HEADER, gfp); + if (!nskb) + goto nomem; - padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len; - if (padding) - memset(skb_put(chunk->skb, padding), 0, padding); + /* Make sure the outbound skb has enough header + * room reserved. + */ + skb_reserve(nskb, packet->overhead + MAX_HEADER); + } else { + nskb = head; + } - /* if this is the auth chunk that we are adding, - * store pointer where it will be added and put - * the auth into the packet. + /** + * 3.2 Chunk Field Descriptions + * + * The total length of a chunk (including Type, Length and + * Value fields) MUST be a multiple of 4 bytes. If the length + * of the chunk is not a multiple of 4 bytes, the sender MUST + * pad the chunk with all zero bytes and this padding is not + * included in the chunk length field. The sender should + * never pad with more than 3 bytes. + * + * [This whole comment explains WORD_ROUND() below.] */ - if (chunk == packet->auth) - auth = skb_tail_pointer(nskb); - memcpy(skb_put(nskb, chunk->skb->len), + pkt_size -= packet->overhead; + list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { + list_del_init(&chunk->list); + if (sctp_chunk_is_data(chunk)) { + /* 6.3.1 C4) When data is in flight and when allowed + * by rule C5, a new RTT measurement MUST be made each + * round trip. Furthermore, new RTT measurements + * SHOULD be made no more than once per round-trip + * for a given destination transport address. + */ + + if (!chunk->resent && !tp->rto_pending) { + chunk->rtt_in_progress = 1; + tp->rto_pending = 1; + } + + has_data = 1; + } + + padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len; + if (padding) + memset(skb_put(chunk->skb, padding), 0, padding); + + /* if this is the auth chunk that we are adding, + * store pointer where it will be added and put + * the auth into the packet. + */ + if (chunk == packet->auth) + auth = skb_tail_pointer(nskb); + + memcpy(skb_put(nskb, chunk->skb->len), chunk->skb->data, chunk->skb->len); - pr_debug("*** Chunk:%p[%s] %s 0x%x, length:%d, chunk->skb->len:%d, " - "rtt_in_progress:%d\n", chunk, - sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)), - chunk->has_tsn ? "TSN" : "No TSN", - chunk->has_tsn ? ntohl(chunk->subh.data_hdr->tsn) : 0, - ntohs(chunk->chunk_hdr->length), chunk->skb->len, - chunk->rtt_in_progress); - - /* - * If this is a control chunk, this is our last - * reference. Free data chunks after they've been - * acknowledged or have failed. - */ - if (!sctp_chunk_is_data(chunk)) - sctp_chunk_free(chunk); - } + pr_debug("*** Chunk:%p[%s] %s 0x%x, length:%d, chunk->skb->len:%d, rtt_in_progress:%d\n", + chunk, + sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)), + chunk->has_tsn ? "TSN" : "No TSN", + chunk->has_tsn ? ntohl(chunk->subh.data_hdr->tsn) : 0, + ntohs(chunk->chunk_hdr->length), chunk->skb->len, + chunk->rtt_in_progress); + + /* If this is a control chunk, this is our last + * reference. Free data chunks after they've been + * acknowledged or have failed. + * Re-queue auth chunks if needed. + */ + pkt_size -= WORD_ROUND(chunk->skb->len); - /* SCTP-AUTH, Section 6.2 - * The sender MUST calculate the MAC as described in RFC2104 [2] - * using the hash function H as described by the MAC Identifier and - * the shared association key K based on the endpoint pair shared key - * described by the shared key identifier. The 'data' used for the - * computation of the AUTH-chunk is given by the AUTH chunk with its - * HMAC field set to zero (as shown in Figure 6) followed by all - * chunks that are placed after the AUTH chunk in the SCTP packet. - */ - if (auth) - sctp_auth_calculate_hmac(asoc, nskb, - (struct sctp_auth_chunk *)auth, - gfp); + if (chunk == packet->auth && !list_empty(&packet->chunk_list)) + list_add(&chunk->list, &packet->chunk_list); + else if (!sctp_chunk_is_data(chunk)) + sctp_chunk_free(chunk); + + if (!pkt_size) + break; + } + + /* SCTP-AUTH, Section 6.2 + * The sender MUST calculate the MAC as described in RFC2104 [2] + * using the hash function H as described by the MAC Identifier and + * the shared association key K based on the endpoint pair shared key + * described by the shared key identifier. The 'data' used for the + * computation of the AUTH-chunk is given by the AUTH chunk with its + * HMAC field set to zero (as shown in Figure 6) followed by all + * chunks that are placed after the AUTH chunk in the SCTP packet. + */ + if (auth) + sctp_auth_calculate_hmac(asoc, nskb, + (struct sctp_auth_chunk *)auth, + gfp); + + if (!gso) + break; + + if (skb_gro_receive(&head, nskb)) + goto nomem; + nskb = NULL; + if (WARN_ON_ONCE(skb_shinfo(head)->gso_segs >= + sk->sk_gso_max_segs)) + goto nomem; + } while (!list_empty(&packet->chunk_list)); /* 2) Calculate the Adler-32 checksum of the whole packet, * including the SCTP common header and all the @@ -532,16 +621,18 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) * * Note: Adler-32 is no longer applicable, as has been replaced * by CRC32-C as described in . + * + * If it's a GSO packet, it's postponed to sctp_skb_segment. */ - if (!sctp_checksum_disable) { - if (!(dst->dev->features & NETIF_F_SCTP_CRC) || - (dst_xfrm(dst) != NULL) || packet->ipfragok) { - sh->checksum = sctp_compute_cksum(nskb, 0); + if (!sctp_checksum_disable || gso) { + if (!gso && (!(dst->dev->features & NETIF_F_SCTP_CRC) || + dst_xfrm(dst) || packet->ipfragok)) { + sh->checksum = sctp_compute_cksum(head, 0); } else { /* no need to seed pseudo checksum for SCTP */ - nskb->ip_summed = CHECKSUM_PARTIAL; - nskb->csum_start = skb_transport_header(nskb) - nskb->head; - nskb->csum_offset = offsetof(struct sctphdr, checksum); + head->ip_summed = CHECKSUM_PARTIAL; + head->csum_start = skb_transport_header(head) - head->head; + head->csum_offset = offsetof(struct sctphdr, checksum); } } @@ -557,7 +648,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) * Note: The works for IPv6 layer checks this bit too later * in transmission. See IP6_ECN_flow_xmit(). */ - tp->af_specific->ecn_capable(nskb->sk); + tp->af_specific->ecn_capable(sk); /* Set up the IP options. */ /* BUG: not implemented @@ -566,7 +657,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) /* Dump that on IP! */ if (asoc) { - asoc->stats.opackets++; + asoc->stats.opackets += pktcount; if (asoc->peer.last_sent_to != tp) /* Considering the multiple CPU scenario, this is a * "correcter" place for last_sent_to. --xguo @@ -589,16 +680,36 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) } } - pr_debug("***sctp_transmit_packet*** skb->len:%d\n", nskb->len); + pr_debug("***sctp_transmit_packet*** skb->len:%d\n", head->len); + + if (gso) { + /* Cleanup our debris for IP stacks */ + memset(head->cb, 0, max(sizeof(struct inet_skb_parm), + sizeof(struct inet6_skb_parm))); - nskb->ignore_df = packet->ipfragok; - tp->af_specific->sctp_xmit(nskb, tp); + skb_shinfo(head)->gso_segs = pktcount; + skb_shinfo(head)->gso_size = GSO_BY_FRAGS; + + /* We have to refresh this in case we are xmiting to + * more than one transport at a time + */ + rcu_read_lock(); + if (__sk_dst_get(sk) != tp->dst) { + dst_hold(tp->dst); + sk_setup_caps(sk, tp->dst); + } + rcu_read_unlock(); + } + head->ignore_df = packet->ipfragok; + tp->af_specific->sctp_xmit(head, tp); out: sctp_packet_reset(packet); return err; no_route: - kfree_skb(nskb); + kfree_skb(head); + if (nskb != head) + kfree_skb(nskb); if (asoc) IP_INC_STATS(sock_net(asoc->base.sk), IPSTATS_MIB_OUTNOROUTES); @@ -751,39 +862,63 @@ static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet, struct sctp_chunk *chunk, u16 chunk_len) { - size_t psize; - size_t pmtu; - int too_big; + size_t psize, pmtu; sctp_xmit_t retval = SCTP_XMIT_OK; psize = packet->size; - pmtu = ((packet->transport->asoc) ? - (packet->transport->asoc->pathmtu) : - (packet->transport->pathmtu)); - - too_big = (psize + chunk_len > pmtu); + if (packet->transport->asoc) + pmtu = packet->transport->asoc->pathmtu; + else + pmtu = packet->transport->pathmtu; /* Decide if we need to fragment or resubmit later. */ - if (too_big) { - /* It's OK to fragmet at IP level if any one of the following + if (psize + chunk_len > pmtu) { + /* It's OK to fragment at IP level if any one of the following * is true: - * 1. The packet is empty (meaning this chunk is greater - * the MTU) - * 2. The chunk we are adding is a control chunk - * 3. The packet doesn't have any data in it yet and data - * requires authentication. + * 1. The packet is empty (meaning this chunk is greater + * the MTU) + * 2. The packet doesn't have any data in it yet and data + * requires authentication. */ - if (sctp_packet_empty(packet) || !sctp_chunk_is_data(chunk) || + if (sctp_packet_empty(packet) || (!packet->has_data && chunk->auth)) { /* We no longer do re-fragmentation. * Just fragment at the IP layer, if we * actually hit this condition */ packet->ipfragok = 1; - } else { - retval = SCTP_XMIT_PMTU_FULL; + goto out; } + + /* It is also okay to fragment if the chunk we are + * adding is a control chunk, but only if current packet + * is not a GSO one otherwise it causes fragmentation of + * a large frame. So in this case we allow the + * fragmentation by forcing it to be in a new packet. + */ + if (!sctp_chunk_is_data(chunk) && packet->has_data) + retval = SCTP_XMIT_PMTU_FULL; + + if (psize + chunk_len > packet->max_size) + /* Hit GSO/PMTU limit, gotta flush */ + retval = SCTP_XMIT_PMTU_FULL; + + if (!packet->transport->burst_limited && + psize + chunk_len > (packet->transport->cwnd >> 1)) + /* Do not allow a single GSO packet to use more + * than half of cwnd. + */ + retval = SCTP_XMIT_PMTU_FULL; + + if (packet->transport->burst_limited && + psize + chunk_len > (packet->transport->burst_limited >> 1)) + /* Do not allow a single GSO packet to use more + * than half of original cwnd. + */ + retval = SCTP_XMIT_PMTU_FULL; + /* Otherwise it will fit in the GSO packet */ } +out: return retval; } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index d3d50daa248b..40022ee885d7 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1516,6 +1516,9 @@ static __init int sctp_init(void) if (status) goto err_v6_add_protocol; + if (sctp_offload_init() < 0) + pr_crit("%s: Cannot add SCTP protocol offload\n", __func__); + out: return status; err_v6_add_protocol: diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 67154b848aa9..712fb2339baa 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4003,6 +4003,8 @@ static int sctp_init_sock(struct sock *sk) return -ESOCKTNOSUPPORT; } + sk->sk_gso_type = SKB_GSO_SCTP; + /* Initialize default send parameters. These parameters can be * modified with the SCTP_DEFAULT_SEND_PARAM socket option. */ -- cgit v1.2.3 From 942b3235bf77e5600a05d6e85f0415bdeb8068bb Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 2 Jun 2016 15:05:44 -0300 Subject: sctp: improve debug message to also log curr pkt and new chunk size This is useful for debugging packet sizes. Signed-off-by: Marcelo Ricardo Leitner Tested-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/output.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/output.c b/net/sctp/output.c index 60499a69179d..90d2e125c2f5 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -182,7 +182,8 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet, sctp_xmit_t retval; int error = 0; - pr_debug("%s: packet:%p chunk:%p\n", __func__, packet, chunk); + pr_debug("%s: packet:%p size:%lu chunk:%p size:%d\n", __func__, + packet, packet->size, chunk, chunk->skb ? chunk->skb->len : -1); switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) { case SCTP_XMIT_PMTU_FULL: -- cgit v1.2.3 From 9b6d53985fd130c24ad2260c2edb0df50449f020 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 2 Jun 2016 12:08:52 -0700 Subject: rxrpc: Use pr_ and pr_fmt, reduce object size a few KB Use the more common kernel logging style and reduce object size. The logging message prefix changes from a mixture of "RxRPC:" and "RXRPC:" to "af_rxrpc: ". $ size net/rxrpc/built-in.o* text data bss dec hex filename 64172 1972 8304 74448 122d0 net/rxrpc/built-in.o.new 67512 1972 8304 77788 12fdc net/rxrpc/built-in.o.old Miscellanea: o Consolidate the ASSERT macros to use a single pr_err call with decimal and hexadecimal output and a stringified #OP argument Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/rxrpc/af_rxrpc.c | 18 ++++++++++-------- net/rxrpc/ar-accept.c | 2 ++ net/rxrpc/ar-ack.c | 2 ++ net/rxrpc/ar-call.c | 12 ++++++------ net/rxrpc/ar-connection.c | 2 ++ net/rxrpc/ar-connevent.c | 2 ++ net/rxrpc/ar-input.c | 2 ++ net/rxrpc/ar-internal.h | 30 ++++++++++++------------------ net/rxrpc/ar-key.c | 4 +++- net/rxrpc/ar-local.c | 2 ++ net/rxrpc/ar-output.c | 2 ++ net/rxrpc/ar-peer.c | 2 ++ net/rxrpc/ar-recvmsg.c | 4 +++- net/rxrpc/ar-skbuff.c | 2 ++ net/rxrpc/ar-transport.c | 2 ++ net/rxrpc/rxkad.c | 2 ++ 16 files changed, 56 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index e45e94ca030f..7840b8e7da80 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -9,6 +9,8 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -796,49 +798,49 @@ static int __init af_rxrpc_init(void) "rxrpc_call_jar", sizeof(struct rxrpc_call), 0, SLAB_HWCACHE_ALIGN, NULL); if (!rxrpc_call_jar) { - printk(KERN_NOTICE "RxRPC: Failed to allocate call jar\n"); + pr_notice("Failed to allocate call jar\n"); goto error_call_jar; } rxrpc_workqueue = alloc_workqueue("krxrpcd", 0, 1); if (!rxrpc_workqueue) { - printk(KERN_NOTICE "RxRPC: Failed to allocate work queue\n"); + pr_notice("Failed to allocate work queue\n"); goto error_work_queue; } ret = rxrpc_init_security(); if (ret < 0) { - printk(KERN_CRIT "RxRPC: Cannot initialise security\n"); + pr_crit("Cannot initialise security\n"); goto error_security; } ret = proto_register(&rxrpc_proto, 1); if (ret < 0) { - printk(KERN_CRIT "RxRPC: Cannot register protocol\n"); + pr_crit("Cannot register protocol\n"); goto error_proto; } ret = sock_register(&rxrpc_family_ops); if (ret < 0) { - printk(KERN_CRIT "RxRPC: Cannot register socket family\n"); + pr_crit("Cannot register socket family\n"); goto error_sock; } ret = register_key_type(&key_type_rxrpc); if (ret < 0) { - printk(KERN_CRIT "RxRPC: Cannot register client key type\n"); + pr_crit("Cannot register client key type\n"); goto error_key_type; } ret = register_key_type(&key_type_rxrpc_s); if (ret < 0) { - printk(KERN_CRIT "RxRPC: Cannot register server key type\n"); + pr_crit("Cannot register server key type\n"); goto error_key_type_s; } ret = rxrpc_sysctl_init(); if (ret < 0) { - printk(KERN_CRIT "RxRPC: Cannot register sysctls\n"); + pr_crit("Cannot register sysctls\n"); goto error_sysctls; } diff --git a/net/rxrpc/ar-accept.c b/net/rxrpc/ar-accept.c index e7a7f05f13e2..eea5f4a5d8b1 100644 --- a/net/rxrpc/ar-accept.c +++ b/net/rxrpc/ar-accept.c @@ -9,6 +9,8 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/ar-ack.c index 374478e006e7..18381783c2b1 100644 --- a/net/rxrpc/ar-ack.c +++ b/net/rxrpc/ar-ack.c @@ -9,6 +9,8 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include diff --git a/net/rxrpc/ar-call.c b/net/rxrpc/ar-call.c index 571a41fd5a32..1fbaae1cba5f 100644 --- a/net/rxrpc/ar-call.c +++ b/net/rxrpc/ar-call.c @@ -9,6 +9,8 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -669,8 +671,7 @@ void rxrpc_release_call(struct rxrpc_call *call) conn->channels[3] == NULL); break; default: - printk(KERN_ERR "RxRPC: conn->avail_calls=%d\n", - conn->avail_calls); + pr_err("conn->avail_calls=%d\n", conn->avail_calls); BUG(); } } @@ -935,16 +936,15 @@ void __exit rxrpc_destroy_all_calls(void) if (call->state != RXRPC_CALL_DEAD) break; default: - printk(KERN_ERR "RXRPC:" - " Call %p still in use (%d,%d,%s,%lx,%lx)!\n", + pr_err("Call %p still in use (%d,%d,%s,%lx,%lx)!\n", call, atomic_read(&call->usage), atomic_read(&call->ackr_not_idle), rxrpc_call_states[call->state], call->flags, call->events); if (!skb_queue_empty(&call->rx_queue)) - printk(KERN_ERR"RXRPC: Rx queue occupied\n"); + pr_err("Rx queue occupied\n"); if (!skb_queue_empty(&call->rx_oos_queue)) - printk(KERN_ERR"RXRPC: OOS queue occupied\n"); + pr_err("OOS queue occupied\n"); break; } diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c index 97f4fae74bca..d67b1f1b5001 100644 --- a/net/rxrpc/ar-connection.c +++ b/net/rxrpc/ar-connection.c @@ -9,6 +9,8 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include diff --git a/net/rxrpc/ar-connevent.c b/net/rxrpc/ar-connevent.c index 5f9563968a5b..8bdd692d4862 100644 --- a/net/rxrpc/ar-connevent.c +++ b/net/rxrpc/ar-connevent.c @@ -9,6 +9,8 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c index 6ff97412a0bb..d7c2a0bc839e 100644 --- a/net/rxrpc/ar-input.c +++ b/net/rxrpc/ar-input.c @@ -9,6 +9,8 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index f0b807a163fa..18ab5c50ba87 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -744,21 +744,18 @@ do { \ #define ASSERT(X) \ do { \ if (unlikely(!(X))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "RxRPC: Assertion failed\n"); \ + pr_err("Assertion failed\n"); \ BUG(); \ } \ } while (0) #define ASSERTCMP(X, OP, Y) \ do { \ - if (unlikely(!((X) OP (Y)))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "RxRPC: Assertion failed\n"); \ - printk(KERN_ERR "%lu " #OP " %lu is false\n", \ - (unsigned long)(X), (unsigned long)(Y)); \ - printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \ - (unsigned long)(X), (unsigned long)(Y)); \ + unsigned long _x = (unsigned long)(X); \ + unsigned long _y = (unsigned long)(Y); \ + if (unlikely(!(_x OP _y))) { \ + pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \ + _x, _x, #OP, _y, _y); \ BUG(); \ } \ } while (0) @@ -766,21 +763,18 @@ do { \ #define ASSERTIF(C, X) \ do { \ if (unlikely((C) && !(X))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "RxRPC: Assertion failed\n"); \ + pr_err("Assertion failed\n"); \ BUG(); \ } \ } while (0) #define ASSERTIFCMP(C, X, OP, Y) \ do { \ - if (unlikely((C) && !((X) OP (Y)))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "RxRPC: Assertion failed\n"); \ - printk(KERN_ERR "%lu " #OP " %lu is false\n", \ - (unsigned long)(X), (unsigned long)(Y)); \ - printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \ - (unsigned long)(X), (unsigned long)(Y)); \ + unsigned long _x = (unsigned long)(X); \ + unsigned long _y = (unsigned long)(Y); \ + if (unlikely((C) && !(_x OP _y))) { \ + pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \ + _x, _x, #OP, _y, _y); \ BUG(); \ } \ } while (0) diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c index 1021b4c0bdd2..4ad56fafe3a7 100644 --- a/net/rxrpc/ar-key.c +++ b/net/rxrpc/ar-key.c @@ -12,6 +12,8 @@ * "afs@CAMBRIDGE.REDHAT.COM> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -800,7 +802,7 @@ static void rxrpc_free_token_list(struct rxrpc_key_token *token) rxrpc_rxk5_free(token->k5); break; default: - printk(KERN_ERR "Unknown token type %x on rxrpc key\n", + pr_err("Unknown token type %x on rxrpc key\n", token->security_index); BUG(); } diff --git a/net/rxrpc/ar-local.c b/net/rxrpc/ar-local.c index 4e1e6db0050b..701c42b7050e 100644 --- a/net/rxrpc/ar-local.c +++ b/net/rxrpc/ar-local.c @@ -9,6 +9,8 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c index 51cb10062a8d..ea619535f0ed 100644 --- a/net/rxrpc/ar-output.c +++ b/net/rxrpc/ar-output.c @@ -9,6 +9,8 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include diff --git a/net/rxrpc/ar-peer.c b/net/rxrpc/ar-peer.c index dc089b1976aa..0b54cda3d8e5 100644 --- a/net/rxrpc/ar-peer.c +++ b/net/rxrpc/ar-peer.c @@ -9,6 +9,8 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c index 160f0927aa3e..59706b9f2f7a 100644 --- a/net/rxrpc/ar-recvmsg.c +++ b/net/rxrpc/ar-recvmsg.c @@ -9,6 +9,8 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -307,7 +309,7 @@ receive_non_data_message: &abort_code); break; default: - pr_err("RxRPC: Unknown packet mark %u\n", skb->mark); + pr_err("Unknown packet mark %u\n", skb->mark); BUG(); break; } diff --git a/net/rxrpc/ar-skbuff.c b/net/rxrpc/ar-skbuff.c index 62a267472fce..eee0cfd9ac8c 100644 --- a/net/rxrpc/ar-skbuff.c +++ b/net/rxrpc/ar-skbuff.c @@ -9,6 +9,8 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include diff --git a/net/rxrpc/ar-transport.c b/net/rxrpc/ar-transport.c index 66a1a5676446..a1b65183b07d 100644 --- a/net/rxrpc/ar-transport.c +++ b/net/rxrpc/ar-transport.c @@ -9,6 +9,8 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 6b726a046a7d..d4da538b3d3c 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -9,6 +9,8 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include -- cgit v1.2.3 From 3b55a537d028ccc3e423e74a2037476318918341 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 3 Jun 2016 22:53:26 -0700 Subject: sctp: Fix warning in sctp_packet_transmit_chunk() size_t objects should be printed with %Z printf format. Reported-by: kbuild test robot Signed-off-by: David S. Miller --- net/sctp/output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/output.c b/net/sctp/output.c index 90d2e125c2f5..1541a91d6d9d 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -182,7 +182,7 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet, sctp_xmit_t retval; int error = 0; - pr_debug("%s: packet:%p size:%lu chunk:%p size:%d\n", __func__, + pr_debug("%s: packet:%p size:%Zu chunk:%p size:%d\n", __func__, packet, packet->size, chunk, chunk->skb ? chunk->skb->len : -1); switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) { -- cgit v1.2.3 From 76f21b99004ef1f16be6184678f660eab911b8b8 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 3 Jun 2016 22:56:28 -0700 Subject: net: Add docbook description for 'mtu' arg to skb_gso_validate_mtu() Signed-off-by: David S. Miller --- net/core/skbuff.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b6e0f95bef36..e7ec6d3ad5f0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4399,6 +4399,7 @@ EXPORT_SYMBOL_GPL(skb_gso_transport_seglen); * skb_gso_validate_mtu - Return in case such skb fits a given MTU * * @skb: GSO skb + * @mtu: MTU to validate against * * skb_gso_validate_mtu validates if a given skb will fit a wanted MTU * once split. -- cgit v1.2.3 From 0e5760440842eee57ff573251e95289e2ee7b15f Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 4 Jun 2016 21:16:52 +0200 Subject: net: dsa: slave: chip data is optional, don't dereference NULL The new binding does not make use of dsa_chip_data, a.k.a cd. When retrieving the size of the EEPROM attached to a switch, don't assume there is a cd attached to the switch structure. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/slave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 152436cdab30..135a91706755 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -615,7 +615,7 @@ static int dsa_slave_get_eeprom_len(struct net_device *dev) struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - if (ds->cd->eeprom_len) + if (ds->cd && ds->cd->eeprom_len) return ds->cd->eeprom_len; if (ds->drv->get_eeprom_len) -- cgit v1.2.3 From 6e8e862ded41bd966b088960ad6f4dc7a1a3ce36 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 4 Jun 2016 21:16:55 +0200 Subject: net: dsa: slave: Remove MDIO address from switch MDIO bus name The DSA layer should no longer assume the switch is connected to an MDIO bus. As a result, we cannot use the address on the MDIO bus when forming the name of the switches internal MDIO bus for its builtin and possibly external PHYs. The switch index is sufficient to make the name unique, so drop the MDIO address. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/slave.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 135a91706755..f640a48a6ff3 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -49,8 +49,7 @@ void dsa_slave_mii_bus_init(struct dsa_switch *ds) ds->slave_mii_bus->name = "dsa slave smi"; ds->slave_mii_bus->read = dsa_slave_phy_read; ds->slave_mii_bus->write = dsa_slave_phy_write; - snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d:%.2x", - ds->index, ds->cd->sw_addr); + snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d", ds->index); ds->slave_mii_bus->parent = ds->dev; ds->slave_mii_bus->phy_mask = ~ds->phys_mii_mask; } -- cgit v1.2.3 From 149cafd790573294752787f5ce1dc0f99e4088d3 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 4 Jun 2016 21:16:56 +0200 Subject: net: dsa: tag_{e}dsa.c: Remove dependency on platform data The platform data nr_chips is used when validating a received packet, to ensure it comes from a know switch chip. The number of possible switches is limited to DSA_MAX_SWITCHES, so use this as the first validation step. The new binding allows holes in the dst->ds[] array, so also ensure ensure there is a valid dsa_switch for this packet. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/tag_dsa.c | 6 +++++- net/dsa/tag_edsa.c | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index aa780e4ac0bd..f9832f097681 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -107,9 +107,13 @@ static int dsa_rcv(struct sk_buff *skb, struct net_device *dev, * Check that the source device exists and that the source * port is a registered DSA port. */ - if (source_device >= dst->pd->nr_chips) + if (source_device >= DSA_MAX_SWITCHES) goto out_drop; + ds = dst->ds[source_device]; + if (!ds) + goto out_drop; + if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) goto out_drop; diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 2288c8098c42..3890aac8190f 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -120,9 +120,13 @@ static int edsa_rcv(struct sk_buff *skb, struct net_device *dev, * Check that the source device exists and that the source * port is a registered DSA port. */ - if (source_device >= dst->pd->nr_chips) + if (source_device >= DSA_MAX_SWITCHES) goto out_drop; + ds = dst->ds[source_device]; + if (!ds) + goto out_drop; + if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) goto out_drop; -- cgit v1.2.3 From c8b098086b4c744084350d2757a637ad756adf34 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 4 Jun 2016 21:16:57 +0200 Subject: net: dsa: Add a ports structure and use it in the switch structure There are going to be more per-port members added to the switch structure. So add a port structure and move the netdev into it. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa.c | 8 ++++---- net/dsa/slave.c | 4 ++-- net/dsa/tag_brcm.c | 4 ++-- net/dsa/tag_dsa.c | 4 ++-- net/dsa/tag_edsa.c | 4 ++-- net/dsa/tag_trailer.c | 4 ++-- 6 files changed, 14 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index eff5dfc2e33f..18086e0cc617 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -437,10 +437,10 @@ static void dsa_switch_destroy(struct dsa_switch *ds) if (!(ds->enabled_port_mask & (1 << port))) continue; - if (!ds->ports[port]) + if (!ds->ports[port].netdev) continue; - dsa_slave_destroy(ds->ports[port]); + dsa_slave_destroy(ds->ports[port].netdev); } /* Remove any fixed link PHYs */ @@ -469,7 +469,7 @@ static int dsa_switch_suspend(struct dsa_switch *ds) if (!dsa_is_port_initialized(ds, i)) continue; - ret = dsa_slave_suspend(ds->ports[i]); + ret = dsa_slave_suspend(ds->ports[i].netdev); if (ret) return ret; } @@ -495,7 +495,7 @@ static int dsa_switch_resume(struct dsa_switch *ds) if (!dsa_is_port_initialized(ds, i)) continue; - ret = dsa_slave_resume(ds->ports[i]); + ret = dsa_slave_resume(ds->ports[i].netdev); if (ret) return ret; } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index f640a48a6ff3..169abacbc6ce 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1183,12 +1183,12 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, p->old_link = -1; p->old_duplex = -1; - ds->ports[port] = slave_dev; + ds->ports[port].netdev = slave_dev; ret = register_netdev(slave_dev); if (ret) { netdev_err(master, "error %d registering interface %s\n", ret, slave_dev->name); - ds->ports[port] = NULL; + ds->ports[port].netdev = NULL; free_netdev(slave_dev); return ret; } diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index e2aadb73111d..21bffde6e4bf 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -127,7 +127,7 @@ static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, source_port = brcm_tag[3] & BRCM_EG_PID_MASK; /* Validate port against switch setup, either the port is totally */ - if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) + if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev) goto out_drop; /* Remove Broadcom tag and update checksum */ @@ -140,7 +140,7 @@ static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, skb_push(skb, ETH_HLEN); skb->pkt_type = PACKET_HOST; - skb->dev = ds->ports[source_port]; + skb->dev = ds->ports[source_port].netdev; skb->protocol = eth_type_trans(skb, skb->dev); skb->dev->stats.rx_packets++; diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index f9832f097681..bce79ffe342b 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -114,7 +114,7 @@ static int dsa_rcv(struct sk_buff *skb, struct net_device *dev, if (!ds) goto out_drop; - if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) + if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev) goto out_drop; /* @@ -163,7 +163,7 @@ static int dsa_rcv(struct sk_buff *skb, struct net_device *dev, 2 * ETH_ALEN); } - skb->dev = ds->ports[source_port]; + skb->dev = ds->ports[source_port].netdev; skb_push(skb, ETH_HLEN); skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 3890aac8190f..6c1720e88537 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -127,7 +127,7 @@ static int edsa_rcv(struct sk_buff *skb, struct net_device *dev, if (!ds) goto out_drop; - if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) + if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev) goto out_drop; /* @@ -182,7 +182,7 @@ static int edsa_rcv(struct sk_buff *skb, struct net_device *dev, 2 * ETH_ALEN); } - skb->dev = ds->ports[source_port]; + skb->dev = ds->ports[source_port].netdev; skb_push(skb, ETH_HLEN); skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index b6ca0890d018..5e3903eb1afa 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -82,12 +82,12 @@ static int trailer_rcv(struct sk_buff *skb, struct net_device *dev, goto out_drop; source_port = trailer[1] & 7; - if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) + if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev) goto out_drop; pskb_trim_rcsum(skb, skb->len - 4); - skb->dev = ds->ports[source_port]; + skb->dev = ds->ports[source_port].netdev; skb_push(skb, ETH_HLEN); skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); -- cgit v1.2.3 From 189b0d93ec61e1f991e96d7bc03b03cf929d164c Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 4 Jun 2016 21:16:58 +0200 Subject: net: dsa: Move port device node into port structure Move the port device node structure into the port structure, from the chip data. This information is needed in the next step of implementing the new binding. The chip data structure is used while parsing the whole old binding, before the individual switch structures exist. With the new bindings, this is reversed, the switches exist first, and the interconnections between the switches is derived from the individual switch bindings. Thus this chip data structure becomes unneeded. Signed-off-by: Andrew Lunn eviewed-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa.c | 8 ++++---- net/dsa/slave.c | 5 ++--- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 18086e0cc617..5907f8cd13b6 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -182,7 +182,6 @@ __ATTRIBUTE_GROUPS(dsa_hwmon); /* basic switch operations **************************************************/ static int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct net_device *master) { - struct dsa_chip_data *cd = ds->cd; struct device_node *port_dn; struct phy_device *phydev; int ret, port, mode; @@ -191,7 +190,7 @@ static int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct net_device *master) if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) continue; - port_dn = cd->port_dn[port]; + port_dn = ds->ports[port].dn; if (of_phy_is_fixed_link(port_dn)) { ret = of_phy_register_fixed_link(port_dn); if (ret) { @@ -325,6 +324,8 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) * Create network devices for physical switch ports. */ for (i = 0; i < DSA_MAX_PORTS; i++) { + ds->ports[i].dn = cd->port_dn[i]; + if (!(ds->enabled_port_mask & (1 << i))) continue; @@ -424,7 +425,6 @@ static void dsa_switch_destroy(struct dsa_switch *ds) { struct device_node *port_dn; struct phy_device *phydev; - struct dsa_chip_data *cd = ds->cd; int port; #ifdef CONFIG_NET_DSA_HWMON @@ -445,7 +445,7 @@ static void dsa_switch_destroy(struct dsa_switch *ds) /* Remove any fixed link PHYs */ for (port = 0; port < DSA_MAX_PORTS; port++) { - port_dn = cd->port_dn[port]; + port_dn = ds->ports[port].dn; if (of_phy_is_fixed_link(port_dn)) { phydev = of_phy_find_device(port_dn); if (phydev) { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 169abacbc6ce..52f1183c42a0 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -998,13 +998,12 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, struct net_device *slave_dev) { struct dsa_switch *ds = p->parent; - struct dsa_chip_data *cd = ds->cd; struct device_node *phy_dn, *port_dn; bool phy_is_fixed = false; u32 phy_flags = 0; int mode, ret; - port_dn = cd->port_dn[p->port]; + port_dn = ds->ports[p->port].dn; mode = of_get_phy_mode(port_dn); if (mode < 0) mode = PHY_INTERFACE_MODE_NA; @@ -1146,7 +1145,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, NULL); SET_NETDEV_DEV(slave_dev, parent); - slave_dev->dev.of_node = ds->cd->port_dn[port]; + slave_dev->dev.of_node = ds->ports[port].dn; slave_dev->vlan_features = master->vlan_features; p = netdev_priv(slave_dev); -- cgit v1.2.3 From 4a7704ffa86705b0580b6473c407b7b7618e072d Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 4 Jun 2016 21:16:59 +0200 Subject: net: dsa: Remove dynamic allocate of routing table With a maximum of four switches, the size of the routing table is the same as the pointer to it. Removing it makes the code simpler. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 5907f8cd13b6..6177dd750847 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -587,17 +587,6 @@ static int dsa_of_setup_routing_table(struct dsa_platform_data *pd, if (link_sw_addr >= pd->nr_chips) return -EINVAL; - /* First time routing table allocation */ - if (!cd->rtable) { - cd->rtable = kmalloc_array(pd->nr_chips, sizeof(s8), - GFP_KERNEL); - if (!cd->rtable) - return -ENOMEM; - - /* default to no valid uplink/downlink */ - memset(cd->rtable, -1, pd->nr_chips * sizeof(s8)); - } - cd->rtable[link_sw_addr] = port_index; return 0; @@ -639,7 +628,6 @@ static void dsa_of_free_platform_data(struct dsa_platform_data *pd) kfree(pd->chip[i].port_names[port_index]); port_index++; } - kfree(pd->chip[i].rtable); /* Drop our reference to the MDIO bus device */ if (pd->chip[i].host_dev) -- cgit v1.2.3 From 66472fc04e8be62858f29c7798ed17e984c1ab3b Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 4 Jun 2016 21:17:00 +0200 Subject: net: dsa: Copy the routing table into the switch structure The new binding will not have a chip data structure, it will place the routing directly into the switch structure. To enable backwards compatibility, copy the routing from the chip data into the switch structure. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 6177dd750847..bfe1d03d4730 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -297,6 +297,8 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) dst->tag_protocol = drv->tag_protocol; } + memcpy(ds->rtable, cd->rtable, sizeof(ds->rtable)); + /* * Do basic register setup. */ -- cgit v1.2.3 From 9b8e895c4e9d217dfa0e48aafa072258e2a3480e Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 4 Jun 2016 21:17:01 +0200 Subject: net: dsa: Split up creating/destroying of DSA and CPU ports Refactor the code to setup a single DSA/CPU port into a function of its own, and export it, so it can be used by the new binding. Similarly, refactor the destroy code into a function. When destroying the ports, don't put the of node. They should be released at the end along with the normal ports. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa.c | 86 ++++++++++++++++++++++++++++++++---------------------- net/dsa/dsa_priv.h | 3 ++ 2 files changed, 54 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index bfe1d03d4730..7140de475c07 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -180,36 +180,47 @@ __ATTRIBUTE_GROUPS(dsa_hwmon); #endif /* CONFIG_NET_DSA_HWMON */ /* basic switch operations **************************************************/ -static int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct net_device *master) +int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev, + struct device_node *port_dn, int port) { - struct device_node *port_dn; struct phy_device *phydev; - int ret, port, mode; + int ret, mode; + + if (of_phy_is_fixed_link(port_dn)) { + ret = of_phy_register_fixed_link(port_dn); + if (ret) { + dev_err(dev, "failed to register fixed PHY\n"); + return ret; + } + phydev = of_phy_find_device(port_dn); + + mode = of_get_phy_mode(port_dn); + if (mode < 0) + mode = PHY_INTERFACE_MODE_NA; + phydev->interface = mode; + + genphy_config_init(phydev); + genphy_read_status(phydev); + if (ds->drv->adjust_link) + ds->drv->adjust_link(ds, port, phydev); + } + + return 0; +} + +static int dsa_cpu_dsa_setups(struct dsa_switch *ds, struct device *dev) +{ + struct device_node *port_dn; + int ret, port; for (port = 0; port < DSA_MAX_PORTS; port++) { if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) continue; port_dn = ds->ports[port].dn; - if (of_phy_is_fixed_link(port_dn)) { - ret = of_phy_register_fixed_link(port_dn); - if (ret) { - netdev_err(master, - "failed to register fixed PHY\n"); - return ret; - } - phydev = of_phy_find_device(port_dn); - - mode = of_get_phy_mode(port_dn); - if (mode < 0) - mode = PHY_INTERFACE_MODE_NA; - phydev->interface = mode; - - genphy_config_init(phydev); - genphy_read_status(phydev); - if (ds->drv->adjust_link) - ds->drv->adjust_link(ds, port, phydev); - } + ret = dsa_cpu_dsa_setup(ds, dev, port_dn, port); + if (ret) + return ret; } return 0; } @@ -340,7 +351,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) } /* Perform configuration of the CPU and DSA ports */ - ret = dsa_cpu_dsa_setup(ds, dst->master_netdev); + ret = dsa_cpu_dsa_setups(ds, parent); if (ret < 0) { netdev_err(dst->master_netdev, "[%d] : can't configure CPU and DSA ports\n", index); @@ -423,10 +434,21 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, return ds; } -static void dsa_switch_destroy(struct dsa_switch *ds) +void dsa_cpu_dsa_destroy(struct device_node *port_dn) { - struct device_node *port_dn; struct phy_device *phydev; + + if (of_phy_is_fixed_link(port_dn)) { + phydev = of_phy_find_device(port_dn); + if (phydev) { + phy_device_free(phydev); + fixed_phy_unregister(phydev); + } + } +} + +static void dsa_switch_destroy(struct dsa_switch *ds) +{ int port; #ifdef CONFIG_NET_DSA_HWMON @@ -445,17 +467,11 @@ static void dsa_switch_destroy(struct dsa_switch *ds) dsa_slave_destroy(ds->ports[port].netdev); } - /* Remove any fixed link PHYs */ + /* Disable configuration of the CPU and DSA ports */ for (port = 0; port < DSA_MAX_PORTS; port++) { - port_dn = ds->ports[port].dn; - if (of_phy_is_fixed_link(port_dn)) { - phydev = of_phy_find_device(port_dn); - if (phydev) { - phy_device_free(phydev); - of_node_put(port_dn); - fixed_phy_unregister(phydev); - } - } + if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) + continue; + dsa_cpu_dsa_destroy(ds->ports[port].dn); } mdiobus_unregister(ds->slave_mii_bus); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index dfa33779d49c..dbea5d9e7f75 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -50,6 +50,9 @@ struct dsa_slave_priv { /* dsa.c */ extern char dsa_driver_version[]; +int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev, + struct device_node *port_dn, int port); +void dsa_cpu_dsa_destroy(struct device_node *port_dn); /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; -- cgit v1.2.3 From 39a7f2a4eb496c0c68cc93fcb403190b48605168 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 4 Jun 2016 21:17:03 +0200 Subject: net: dsa: Refactor selection of tag ops into a function Replace the two switch statements with an array lookup, and store the result in the dsa tree structure. The drivers no longer need to know the selected tag protocol, so remove it from the dsa switch structure. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa.c | 71 ++++++++++++++++++++++++++++++++++-------------------- net/dsa/dsa_priv.h | 1 + net/dsa/slave.c | 35 +-------------------------- 3 files changed, 47 insertions(+), 60 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 7140de475c07..221ebde4318d 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -29,6 +29,33 @@ char dsa_driver_version[] = "0.1"; +static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + /* Just return the original SKB */ + return skb; +} + +static const struct dsa_device_ops none_ops = { + .xmit = dsa_slave_notag_xmit, + .rcv = NULL, +}; + +const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { +#ifdef CONFIG_NET_DSA_TAG_DSA + [DSA_TAG_PROTO_DSA] = &dsa_netdev_ops, +#endif +#ifdef CONFIG_NET_DSA_TAG_EDSA + [DSA_TAG_PROTO_EDSA] = &edsa_netdev_ops, +#endif +#ifdef CONFIG_NET_DSA_TAG_TRAILER + [DSA_TAG_PROTO_TRAILER] = &trailer_netdev_ops, +#endif +#ifdef CONFIG_NET_DSA_TAG_BRCM + [DSA_TAG_PROTO_BRCM] = &brcm_netdev_ops, +#endif + [DSA_TAG_PROTO_NONE] = &none_ops, +}; /* switch driver registration ***********************************************/ static DEFINE_MUTEX(dsa_switch_drivers_mutex); @@ -225,6 +252,20 @@ static int dsa_cpu_dsa_setups(struct dsa_switch *ds, struct device *dev) return 0; } +const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol) +{ + const struct dsa_device_ops *ops; + + if (tag_protocol >= DSA_TAG_LAST) + return ERR_PTR(-EINVAL); + ops = dsa_device_ops[tag_protocol]; + + if (!ops) + return ERR_PTR(-ENOPROTOOPT); + + return ops; +} + static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) { struct dsa_switch_driver *drv = ds->drv; @@ -277,35 +318,13 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) * switch. */ if (dst->cpu_switch == index) { - switch (drv->tag_protocol) { -#ifdef CONFIG_NET_DSA_TAG_DSA - case DSA_TAG_PROTO_DSA: - dst->rcv = dsa_netdev_ops.rcv; - break; -#endif -#ifdef CONFIG_NET_DSA_TAG_EDSA - case DSA_TAG_PROTO_EDSA: - dst->rcv = edsa_netdev_ops.rcv; - break; -#endif -#ifdef CONFIG_NET_DSA_TAG_TRAILER - case DSA_TAG_PROTO_TRAILER: - dst->rcv = trailer_netdev_ops.rcv; - break; -#endif -#ifdef CONFIG_NET_DSA_TAG_BRCM - case DSA_TAG_PROTO_BRCM: - dst->rcv = brcm_netdev_ops.rcv; - break; -#endif - case DSA_TAG_PROTO_NONE: - break; - default: - ret = -ENOPROTOOPT; + dst->tag_ops = dsa_resolve_tag_protocol(drv->tag_protocol); + if (IS_ERR(dst->tag_ops)) { + ret = PTR_ERR(dst->tag_ops); goto out; } - dst->tag_protocol = drv->tag_protocol; + dst->rcv = dst->tag_ops->rcv; } memcpy(ds->rtable, cd->rtable, sizeof(ds->rtable)); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index dbea5d9e7f75..72f7b8989cfb 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -53,6 +53,7 @@ extern char dsa_driver_version[]; int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev, struct device_node *port_dn, int port); void dsa_cpu_dsa_destroy(struct device_node *port_dn); +const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol); /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 52f1183c42a0..35e5f0f6688b 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -521,14 +521,6 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } -static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb, - struct net_device *dev) -{ - /* Just return the original SKB */ - return skb; -} - - /* ethtool operations *******************************************************/ static int dsa_slave_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) @@ -1151,32 +1143,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, p = netdev_priv(slave_dev); p->parent = ds; p->port = port; - - switch (ds->dst->tag_protocol) { -#ifdef CONFIG_NET_DSA_TAG_DSA - case DSA_TAG_PROTO_DSA: - p->xmit = dsa_netdev_ops.xmit; - break; -#endif -#ifdef CONFIG_NET_DSA_TAG_EDSA - case DSA_TAG_PROTO_EDSA: - p->xmit = edsa_netdev_ops.xmit; - break; -#endif -#ifdef CONFIG_NET_DSA_TAG_TRAILER - case DSA_TAG_PROTO_TRAILER: - p->xmit = trailer_netdev_ops.xmit; - break; -#endif -#ifdef CONFIG_NET_DSA_TAG_BRCM - case DSA_TAG_PROTO_BRCM: - p->xmit = brcm_netdev_ops.xmit; - break; -#endif - default: - p->xmit = dsa_slave_notag_xmit; - break; - } + p->xmit = dst->tag_ops->xmit; p->old_pause = -1; p->old_link = -1; -- cgit v1.2.3 From e755e49eb3ea925834006c294e989df52f592580 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 4 Jun 2016 21:17:04 +0200 Subject: net: dsa: Make mdio bus optional The switch may want to instantiate its own MDIO bus. Only do it centrally if the switch has not already created one, and the read op is implemented. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 221ebde4318d..6c314f300424 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -340,17 +340,18 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) if (ret < 0) goto out; - ds->slave_mii_bus = devm_mdiobus_alloc(parent); - if (ds->slave_mii_bus == NULL) { - ret = -ENOMEM; - goto out; - } - dsa_slave_mii_bus_init(ds); - - ret = mdiobus_register(ds->slave_mii_bus); - if (ret < 0) - goto out; + if (!ds->slave_mii_bus && drv->phy_read) { + ds->slave_mii_bus = devm_mdiobus_alloc(parent); + if (!ds->slave_mii_bus) { + ret = -ENOMEM; + goto out; + } + dsa_slave_mii_bus_init(ds); + ret = mdiobus_register(ds->slave_mii_bus); + if (ret < 0) + goto out; + } /* * Create network devices for physical switch ports. @@ -493,7 +494,8 @@ static void dsa_switch_destroy(struct dsa_switch *ds) dsa_cpu_dsa_destroy(ds->ports[port].dn); } - mdiobus_unregister(ds->slave_mii_bus); + if (ds->slave_mii_bus && ds->drv->phy_read) + mdiobus_unregister(ds->slave_mii_bus); } #ifdef CONFIG_PM_SLEEP -- cgit v1.2.3 From 83c0afaec7b730b16c518aecc8e6246ec91b265e Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 4 Jun 2016 21:17:07 +0200 Subject: net: dsa: Add new binding implementation The existing DSA binding has a number of limitations and problems. The main problem is that it cannot represent a switch as a linux device, hanging off some bus. It is limited to one CPU port. The DSA platform device is artificial, and does not really represent hardware. Implement a new binding which can be embedded into any type of node on a bus to represent one switch device, and its links to other switches. Signed-off-by: Andrew Lunn Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/Makefile | 2 +- net/dsa/dsa.c | 5 + net/dsa/dsa2.c | 654 +++++++++++++++++++++++++++++++++++++++++++++++++++++ net/dsa/dsa_priv.h | 2 +- net/dsa/slave.c | 8 +- 5 files changed, 667 insertions(+), 4 deletions(-) create mode 100644 net/dsa/dsa2.c (limited to 'net') diff --git a/net/dsa/Makefile b/net/dsa/Makefile index da06ed1df620..8af4ded70f1c 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -1,6 +1,6 @@ # the core obj-$(CONFIG_NET_DSA) += dsa_core.o -dsa_core-y += dsa.o slave.o +dsa_core-y += dsa.o slave.o dsa2.o # tagging formats dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 6c314f300424..ce3b942dce76 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -294,6 +294,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) } dst->cpu_switch = index; dst->cpu_port = i; + ds->cpu_port_mask |= 1 << i; } else if (!strcmp(name, "dsa")) { ds->dsa_port_mask |= 1 << i; } else { @@ -492,6 +493,10 @@ static void dsa_switch_destroy(struct dsa_switch *ds) if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) continue; dsa_cpu_dsa_destroy(ds->ports[port].dn); + + /* Clearing a bit which is not set does no harm */ + ds->cpu_port_mask |= ~(1 << port); + ds->dsa_port_mask |= ~(1 << port); } if (ds->slave_mii_bus && ds->drv->phy_read) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c new file mode 100644 index 000000000000..80dfe08db825 --- /dev/null +++ b/net/dsa/dsa2.c @@ -0,0 +1,654 @@ +/* + * net/dsa/dsa2.c - Hardware switch handling, binding version 2 + * Copyright (c) 2008-2009 Marvell Semiconductor + * Copyright (c) 2013 Florian Fainelli + * Copyright (c) 2016 Andrew Lunn + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "dsa_priv.h" + +static LIST_HEAD(dsa_switch_trees); +static DEFINE_MUTEX(dsa2_mutex); + +static struct dsa_switch_tree *dsa_get_dst(u32 tree) +{ + struct dsa_switch_tree *dst; + + list_for_each_entry(dst, &dsa_switch_trees, list) + if (dst->tree == tree) + return dst; + return NULL; +} + +static void dsa_free_dst(struct kref *ref) +{ + struct dsa_switch_tree *dst = container_of(ref, struct dsa_switch_tree, + refcount); + + list_del(&dst->list); + kfree(dst); +} + +static void dsa_put_dst(struct dsa_switch_tree *dst) +{ + kref_put(&dst->refcount, dsa_free_dst); +} + +static struct dsa_switch_tree *dsa_add_dst(u32 tree) +{ + struct dsa_switch_tree *dst; + + dst = kzalloc(sizeof(*dst), GFP_KERNEL); + if (!dst) + return NULL; + dst->tree = tree; + dst->cpu_switch = -1; + INIT_LIST_HEAD(&dst->list); + list_add_tail(&dsa_switch_trees, &dst->list); + kref_init(&dst->refcount); + + return dst; +} + +static void dsa_dst_add_ds(struct dsa_switch_tree *dst, + struct dsa_switch *ds, u32 index) +{ + kref_get(&dst->refcount); + dst->ds[index] = ds; +} + +static void dsa_dst_del_ds(struct dsa_switch_tree *dst, + struct dsa_switch *ds, u32 index) +{ + dst->ds[index] = NULL; + kref_put(&dst->refcount, dsa_free_dst); +} + +static bool dsa_port_is_dsa(struct device_node *port) +{ + const char *name; + + name = of_get_property(port, "label", NULL); + if (!name) + return false; + + if (!strcmp(name, "dsa")) + return true; + + return false; +} + +static bool dsa_port_is_cpu(struct device_node *port) +{ + const char *name; + + name = of_get_property(port, "label", NULL); + if (!name) + return false; + + if (!strcmp(name, "cpu")) + return true; + + return false; +} + +static bool dsa_ds_find_port(struct dsa_switch *ds, + struct device_node *port) +{ + u32 index; + + for (index = 0; index < DSA_MAX_PORTS; index++) + if (ds->ports[index].dn == port) + return true; + return false; +} + +static struct dsa_switch *dsa_dst_find_port(struct dsa_switch_tree *dst, + struct device_node *port) +{ + struct dsa_switch *ds; + u32 index; + + for (index = 0; index < DSA_MAX_SWITCHES; index++) { + ds = dst->ds[index]; + if (!ds) + continue; + + if (dsa_ds_find_port(ds, port)) + return ds; + } + + return NULL; +} + +static int dsa_port_complete(struct dsa_switch_tree *dst, + struct dsa_switch *src_ds, + struct device_node *port, + u32 src_port) +{ + struct device_node *link; + int index; + struct dsa_switch *dst_ds; + + for (index = 0;; index++) { + link = of_parse_phandle(port, "link", index); + if (!link) + break; + + dst_ds = dsa_dst_find_port(dst, link); + of_node_put(link); + + if (!dst_ds) + return 1; + + src_ds->rtable[dst_ds->index] = src_port; + } + + return 0; +} + +/* A switch is complete if all the DSA ports phandles point to ports + * known in the tree. A return value of 1 means the tree is not + * complete. This is not an error condition. A value of 0 is + * success. + */ +static int dsa_ds_complete(struct dsa_switch_tree *dst, struct dsa_switch *ds) +{ + struct device_node *port; + u32 index; + int err; + + for (index = 0; index < DSA_MAX_PORTS; index++) { + port = ds->ports[index].dn; + if (!port) + continue; + + if (!dsa_port_is_dsa(port)) + continue; + + err = dsa_port_complete(dst, ds, port, index); + if (err != 0) + return err; + + ds->dsa_port_mask |= BIT(index); + } + + return 0; +} + +/* A tree is complete if all the DSA ports phandles point to ports + * known in the tree. A return value of 1 means the tree is not + * complete. This is not an error condition. A value of 0 is + * success. + */ +static int dsa_dst_complete(struct dsa_switch_tree *dst) +{ + struct dsa_switch *ds; + u32 index; + int err; + + for (index = 0; index < DSA_MAX_SWITCHES; index++) { + ds = dst->ds[index]; + if (!ds) + continue; + + err = dsa_ds_complete(dst, ds); + if (err != 0) + return err; + } + + return 0; +} + +static int dsa_dsa_port_apply(struct device_node *port, u32 index, + struct dsa_switch *ds) +{ + int err; + + err = dsa_cpu_dsa_setup(ds, ds->dev, port, index); + if (err) { + dev_warn(ds->dev, "Failed to setup dsa port %d: %d\n", + index, err); + return err; + } + + return 0; +} + +static void dsa_dsa_port_unapply(struct device_node *port, u32 index, + struct dsa_switch *ds) +{ + dsa_cpu_dsa_destroy(port); +} + +static int dsa_cpu_port_apply(struct device_node *port, u32 index, + struct dsa_switch *ds) +{ + int err; + + err = dsa_cpu_dsa_setup(ds, ds->dev, port, index); + if (err) { + dev_warn(ds->dev, "Failed to setup cpu port %d: %d\n", + index, err); + return err; + } + + ds->cpu_port_mask |= BIT(index); + + return 0; +} + +static void dsa_cpu_port_unapply(struct device_node *port, u32 index, + struct dsa_switch *ds) +{ + dsa_cpu_dsa_destroy(port); + ds->cpu_port_mask &= ~BIT(index); + +} + +static int dsa_user_port_apply(struct device_node *port, u32 index, + struct dsa_switch *ds) +{ + const char *name; + int err; + + name = of_get_property(port, "label", NULL); + + err = dsa_slave_create(ds, ds->dev, index, name); + if (err) { + dev_warn(ds->dev, "Failed to create slave %d: %d\n", + index, err); + return err; + } + + return 0; +} + +static void dsa_user_port_unapply(struct device_node *port, u32 index, + struct dsa_switch *ds) +{ + if (ds->ports[index].netdev) { + dsa_slave_destroy(ds->ports[index].netdev); + ds->ports[index].netdev = NULL; + } +} + +static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) +{ + struct device_node *port; + u32 index; + int err; + + err = ds->drv->setup(ds); + if (err < 0) + return err; + + err = ds->drv->set_addr(ds, dst->master_netdev->dev_addr); + if (err < 0) + return err; + + err = ds->drv->set_addr(ds, dst->master_netdev->dev_addr); + if (err < 0) + return err; + + for (index = 0; index < DSA_MAX_PORTS; index++) { + port = ds->ports[index].dn; + if (!port) + continue; + + if (dsa_port_is_dsa(port)) { + err = dsa_dsa_port_apply(port, index, ds); + if (err) + return err; + continue; + } + + if (dsa_port_is_cpu(port)) { + err = dsa_cpu_port_apply(port, index, ds); + if (err) + return err; + continue; + } + + err = dsa_user_port_apply(port, index, ds); + if (err) + continue; + } + + return 0; +} + +static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds) +{ + struct device_node *port; + u32 index; + + for (index = 0; index < DSA_MAX_PORTS; index++) { + port = ds->ports[index].dn; + if (!port) + continue; + + if (dsa_port_is_dsa(port)) { + dsa_dsa_port_unapply(port, index, ds); + continue; + } + + if (dsa_port_is_cpu(port)) { + dsa_cpu_port_unapply(port, index, ds); + continue; + } + + dsa_user_port_unapply(port, index, ds); + } +} + +static int dsa_dst_apply(struct dsa_switch_tree *dst) +{ + struct dsa_switch *ds; + u32 index; + int err; + + for (index = 0; index < DSA_MAX_SWITCHES; index++) { + ds = dst->ds[index]; + if (!ds) + continue; + + err = dsa_ds_apply(dst, ds); + if (err) + return err; + } + + /* If we use a tagging format that doesn't have an ethertype + * field, make sure that all packets from this point on get + * sent to the tag format's receive function. + */ + wmb(); + dst->master_netdev->dsa_ptr = (void *)dst; + dst->applied = true; + + return 0; +} + +static void dsa_dst_unapply(struct dsa_switch_tree *dst) +{ + struct dsa_switch *ds; + u32 index; + + if (!dst->applied) + return; + + dst->master_netdev->dsa_ptr = NULL; + + /* If we used a tagging format that doesn't have an ethertype + * field, make sure that all packets from this point get sent + * without the tag and go through the regular receive path. + */ + wmb(); + + for (index = 0; index < DSA_MAX_SWITCHES; index++) { + ds = dst->ds[index]; + if (!ds) + continue; + + dsa_ds_unapply(dst, ds); + } + + pr_info("DSA: tree %d unapplied\n", dst->tree); + dst->applied = false; +} + +static int dsa_cpu_parse(struct device_node *port, u32 index, + struct dsa_switch_tree *dst, + struct dsa_switch *ds) +{ + struct net_device *ethernet_dev; + struct device_node *ethernet; + + ethernet = of_parse_phandle(port, "ethernet", 0); + if (!ethernet) + return -EINVAL; + + ethernet_dev = of_find_net_device_by_node(ethernet); + if (!ethernet_dev) + return -EPROBE_DEFER; + + if (!ds->master_netdev) + ds->master_netdev = ethernet_dev; + + if (!dst->master_netdev) + dst->master_netdev = ethernet_dev; + + if (dst->cpu_switch == -1) { + dst->cpu_switch = ds->index; + dst->cpu_port = index; + } + + dst->tag_ops = dsa_resolve_tag_protocol(ds->drv->tag_protocol); + if (IS_ERR(dst->tag_ops)) { + dev_warn(ds->dev, "No tagger for this switch\n"); + return PTR_ERR(dst->tag_ops); + } + + dst->rcv = dst->tag_ops->rcv; + + return 0; +} + +static int dsa_ds_parse(struct dsa_switch_tree *dst, struct dsa_switch *ds) +{ + struct device_node *port; + u32 index; + int err; + + for (index = 0; index < DSA_MAX_PORTS; index++) { + port = ds->ports[index].dn; + if (!port) + continue; + + if (dsa_port_is_cpu(port)) { + err = dsa_cpu_parse(port, index, dst, ds); + if (err) + return err; + } + } + + pr_info("DSA: switch %d %d parsed\n", dst->tree, ds->index); + + return 0; +} + +static int dsa_dst_parse(struct dsa_switch_tree *dst) +{ + struct dsa_switch *ds; + u32 index; + int err; + + for (index = 0; index < DSA_MAX_SWITCHES; index++) { + ds = dst->ds[index]; + if (!ds) + continue; + + err = dsa_ds_parse(dst, ds); + if (err) + return err; + } + + if (!dst->master_netdev) { + pr_warn("Tree has no master device\n"); + return -EINVAL; + } + + pr_info("DSA: tree %d parsed\n", dst->tree); + + return 0; +} + +static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds) +{ + struct device_node *port; + int err; + u32 reg; + + for_each_available_child_of_node(ports, port) { + err = of_property_read_u32(port, "reg", ®); + if (err) + return err; + + if (reg >= DSA_MAX_PORTS) + return -EINVAL; + + ds->ports[reg].dn = port; + } + + return 0; +} + +static int dsa_parse_member(struct device_node *np, u32 *tree, u32 *index) +{ + int err; + + *tree = *index = 0; + + err = of_property_read_u32_index(np, "dsa,member", 0, tree); + if (err) { + /* Does not exist, but it is optional */ + if (err == -EINVAL) + return 0; + return err; + } + + err = of_property_read_u32_index(np, "dsa,member", 1, index); + if (err) + return err; + + if (*index >= DSA_MAX_SWITCHES) + return -EINVAL; + + return 0; +} + +static struct device_node *dsa_get_ports(struct dsa_switch *ds, + struct device_node *np) +{ + struct device_node *ports; + + ports = of_get_child_by_name(np, "ports"); + if (!ports) { + dev_err(ds->dev, "no ports child node found\n"); + return ERR_PTR(-EINVAL); + } + + return ports; +} + +static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np) +{ + struct device_node *ports = dsa_get_ports(ds, np); + struct dsa_switch_tree *dst; + u32 tree, index; + int err; + + err = dsa_parse_member(np, &tree, &index); + if (err) + return err; + + if (IS_ERR(ports)) + return PTR_ERR(ports); + + err = dsa_parse_ports_dn(ports, ds); + if (err) + return err; + + dst = dsa_get_dst(tree); + if (!dst) { + dst = dsa_add_dst(tree); + if (!dst) + return -ENOMEM; + } + + if (dst->ds[index]) { + err = -EBUSY; + goto out; + } + + ds->dst = dst; + ds->index = index; + dsa_dst_add_ds(dst, ds, index); + + err = dsa_dst_complete(dst); + if (err < 0) + goto out_del_dst; + + if (err == 1) { + /* Not all switches registered yet */ + err = 0; + goto out; + } + + if (dst->applied) { + pr_info("DSA: Disjoint trees?\n"); + return -EINVAL; + } + + err = dsa_dst_parse(dst); + if (err) + goto out_del_dst; + + err = dsa_dst_apply(dst); + if (err) { + dsa_dst_unapply(dst); + goto out_del_dst; + } + + dsa_put_dst(dst); + return 0; + +out_del_dst: + dsa_dst_del_ds(dst, ds, ds->index); +out: + dsa_put_dst(dst); + + return err; +} + +int dsa_register_switch(struct dsa_switch *ds, struct device_node *np) +{ + int err; + + mutex_lock(&dsa2_mutex); + err = _dsa_register_switch(ds, np); + mutex_unlock(&dsa2_mutex); + + return err; +} +EXPORT_SYMBOL_GPL(dsa_register_switch); + +void _dsa_unregister_switch(struct dsa_switch *ds) +{ + struct dsa_switch_tree *dst = ds->dst; + + dsa_dst_unapply(dst); + + dsa_dst_del_ds(dst, ds, ds->index); +} + +void dsa_unregister_switch(struct dsa_switch *ds) +{ + mutex_lock(&dsa2_mutex); + _dsa_unregister_switch(ds); + mutex_unlock(&dsa2_mutex); +} +EXPORT_SYMBOL_GPL(dsa_unregister_switch); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 72f7b8989cfb..b42f1a5f95f3 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -59,7 +59,7 @@ const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol); extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); int dsa_slave_create(struct dsa_switch *ds, struct device *parent, - int port, char *name); + int port, const char *name); void dsa_slave_destroy(struct net_device *slave_dev); int dsa_slave_suspend(struct net_device *slave_dev); int dsa_slave_resume(struct net_device *slave_dev); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 35e5f0f6688b..15a492261895 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1099,14 +1099,18 @@ int dsa_slave_resume(struct net_device *slave_dev) } int dsa_slave_create(struct dsa_switch *ds, struct device *parent, - int port, char *name) + int port, const char *name) { - struct net_device *master = ds->dst->master_netdev; struct dsa_switch_tree *dst = ds->dst; + struct net_device *master; struct net_device *slave_dev; struct dsa_slave_priv *p; int ret; + master = ds->dst->master_netdev; + if (ds->master_netdev) + master = ds->master_netdev; + slave_dev = alloc_netdev(sizeof(struct dsa_slave_priv), name, NET_NAME_UNKNOWN, ether_setup); if (slave_dev == NULL) -- cgit v1.2.3 From 30759219f562cfaaebe7b9c1d1c0e6b5445c69b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Kube=C4=8Dek?= Date: Fri, 27 May 2016 17:53:52 +0200 Subject: net: disable fragment reassembly if high_thresh is zero Before commit 6d7b857d541e ("net: use lib/percpu_counter API for fragmentation mem accounting"), setting the reassembly high threshold to 0 prevented fragment reassembly as first fragment would be always evicted before second could be added to the queue. While inefficient, some users apparently relied on this method. Since the commit mentioned above, a percpu counter is used for reassembly memory accounting and high batch size avoids taking slow path in most common scenarios. As a result, a whole full sized packet can be reassembled without the percpu counter's main counter changing its value so that even with high_thresh set to 0, fragmented packets can be still reassembled and processed. Add explicit check preventing reassembly if high threshold is zero. Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- net/ipv4/inet_fragment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 3a88b0c73797..b5e9317eaf9e 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -355,7 +355,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, { struct inet_frag_queue *q; - if (frag_mem_limit(nf) > nf->high_thresh) { + if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) { inet_frag_schedule_worker(f); return NULL; } -- cgit v1.2.3 From 402f9030cb68d235cfa94b898e96f2d6f7da76ae Mon Sep 17 00:00:00 2001 From: Tobin C Harding Date: Tue, 10 May 2016 11:26:57 +1000 Subject: bridge: netfilter: checkpatch data type fixes checkpatch produces data type 'checks'. This patch amends them by changing, for example: uint8_t -> u8 Signed-off-by: Tobin C Harding Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebt_stp.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c index 6b731e12ecfa..e77f90bf8db3 100644 --- a/net/bridge/netfilter/ebt_stp.c +++ b/net/bridge/netfilter/ebt_stp.c @@ -17,24 +17,24 @@ #define BPDU_TYPE_TCN 0x80 struct stp_header { - uint8_t dsap; - uint8_t ssap; - uint8_t ctrl; - uint8_t pid; - uint8_t vers; - uint8_t type; + u8 dsap; + u8 ssap; + u8 ctrl; + u8 pid; + u8 vers; + u8 type; }; struct stp_config_pdu { - uint8_t flags; - uint8_t root[8]; - uint8_t root_cost[4]; - uint8_t sender[8]; - uint8_t port[2]; - uint8_t msg_age[2]; - uint8_t max_age[2]; - uint8_t hello_time[2]; - uint8_t forward_delay[2]; + u8 flags; + u8 root[8]; + u8 root_cost[4]; + u8 sender[8]; + u8 port[2]; + u8 msg_age[2]; + u8 max_age[2]; + u8 hello_time[2]; + u8 forward_delay[2]; }; #define NR16(p) (p[0] << 8 | p[1]) @@ -44,8 +44,8 @@ static bool ebt_filter_config(const struct ebt_stp_info *info, const struct stp_config_pdu *stpc) { const struct ebt_stp_config_info *c; - uint16_t v16; - uint32_t v32; + u16 v16; + u32 v32; int verdict, i; c = &info->config; @@ -125,7 +125,7 @@ ebt_stp_mt(const struct sk_buff *skb, struct xt_action_param *par) const struct ebt_stp_info *info = par->matchinfo; const struct stp_header *sp; struct stp_header _stph; - const uint8_t header[6] = {0x42, 0x42, 0x03, 0x00, 0x00, 0x00}; + const u8 header[6] = {0x42, 0x42, 0x03, 0x00, 0x00, 0x00}; sp = skb_header_pointer(skb, 0, sizeof(_stph), &_stph); if (sp == NULL) @@ -156,8 +156,8 @@ ebt_stp_mt(const struct sk_buff *skb, struct xt_action_param *par) static int ebt_stp_mt_check(const struct xt_mtchk_param *par) { const struct ebt_stp_info *info = par->matchinfo; - const uint8_t bridge_ula[6] = {0x01, 0x80, 0xc2, 0x00, 0x00, 0x00}; - const uint8_t msk[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + const u8 bridge_ula[6] = {0x01, 0x80, 0xc2, 0x00, 0x00, 0x00}; + const u8 msk[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; const struct ebt_entry *e = par->entryinfo; if (info->bitmask & ~EBT_STP_MASK || info->invflags & ~EBT_STP_MASK || -- cgit v1.2.3 From 436a850dd9cac09bf88e12e20cc79408b1d29788 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 15 May 2016 19:50:14 +0200 Subject: netfilter: helper: avoid extra expectation iterations on unregister The expectation table is not duplicated per net namespace anymore, so we can move the expectation table and conntrack table iteration out of the per-net loop. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_helper.c | 61 +++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index f703adb7e5f7..7ba16e9c69fa 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -388,13 +388,40 @@ EXPORT_SYMBOL_GPL(nf_conntrack_helper_register); static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, struct net *net) +{ + struct nf_conntrack_tuple_hash *h; + const struct hlist_nulls_node *nn; + int cpu; + + /* Get rid of expecteds, set helpers to NULL. */ + for_each_possible_cpu(cpu) { + struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + + spin_lock_bh(&pcpu->lock); + hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode) + unhelp(h, me); + spin_unlock_bh(&pcpu->lock); + } +} + +void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) { struct nf_conntrack_tuple_hash *h; struct nf_conntrack_expect *exp; const struct hlist_node *next; const struct hlist_nulls_node *nn; + struct net *net; unsigned int i; - int cpu; + + mutex_lock(&nf_ct_helper_mutex); + hlist_del_rcu(&me->hnode); + nf_ct_helper_count--; + mutex_unlock(&nf_ct_helper_mutex); + + /* Make sure every nothing is still using the helper unless its a + * connection in the hash. + */ + synchronize_rcu(); /* Get rid of expectations */ spin_lock_bh(&nf_conntrack_expect_lock); @@ -414,15 +441,11 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, } spin_unlock_bh(&nf_conntrack_expect_lock); - /* Get rid of expecteds, set helpers to NULL. */ - for_each_possible_cpu(cpu) { - struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + rtnl_lock(); + for_each_net(net) + __nf_conntrack_helper_unregister(me, net); + rtnl_unlock(); - spin_lock_bh(&pcpu->lock); - hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode) - unhelp(h, me); - spin_unlock_bh(&pcpu->lock); - } local_bh_disable(); for (i = 0; i < nf_conntrack_htable_size; i++) { nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); @@ -434,26 +457,6 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, } local_bh_enable(); } - -void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) -{ - struct net *net; - - mutex_lock(&nf_ct_helper_mutex); - hlist_del_rcu(&me->hnode); - nf_ct_helper_count--; - mutex_unlock(&nf_ct_helper_mutex); - - /* Make sure every nothing is still using the helper unless its a - * connection in the hash. - */ - synchronize_rcu(); - - rtnl_lock(); - for_each_net(net) - __nf_conntrack_helper_unregister(me, net); - rtnl_unlock(); -} EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); static struct nf_ct_ext_type helper_extend __read_mostly = { -- cgit v1.2.3 From 3bcb846ca4cf55415d3719e64bb45a124792c589 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Jun 2016 20:02:28 -0700 Subject: net: get rid of spin_trylock() in net_tx_action() Note: Tom Herbert posted almost same patch 3 months back, but for different reasons. The reasons we want to get rid of this spin_trylock() are : 1) Under high qdisc pressure, the spin_trylock() has almost no chance to succeed. 2) We loop multiple times in softirq handler, eventually reaching the max retry count (10), and we schedule ksoftirqd. Since we want to adhere more strictly to ksoftirqd being waked up in the future (https://lwn.net/Articles/687617/), better avoid spurious wakeups. 3) calls to __netif_reschedule() dirty the cache line containing q->next_sched, slowing down the owner of qdisc. 4) RT kernels can not use the spin_trylock() here. With help of busylock, we get the qdisc spinlock fast enough, and the trylock trick brings only performance penalty. Depending on qdisc setup, I observed a gain of up to 19 % in qdisc performance (1016600 pps instead of 853400 pps, using prio+tbf+fq_codel) ("mpstat -I SCPU 1" is much happier now) Signed-off-by: Eric Dumazet Cc: Tom Herbert Acked-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/dev.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 904ff431d570..896b686d1966 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2253,7 +2253,7 @@ int netif_get_num_default_rss_queues(void) } EXPORT_SYMBOL(netif_get_num_default_rss_queues); -static inline void __netif_reschedule(struct Qdisc *q) +static void __netif_reschedule(struct Qdisc *q) { struct softnet_data *sd; unsigned long flags; @@ -3898,22 +3898,14 @@ static void net_tx_action(struct softirq_action *h) head = head->next_sched; root_lock = qdisc_lock(q); - if (spin_trylock(root_lock)) { - smp_mb__before_atomic(); - clear_bit(__QDISC_STATE_SCHED, - &q->state); - qdisc_run(q); - spin_unlock(root_lock); - } else { - if (!test_bit(__QDISC_STATE_DEACTIVATED, - &q->state)) { - __netif_reschedule(q); - } else { - smp_mb__before_atomic(); - clear_bit(__QDISC_STATE_SCHED, - &q->state); - } - } + spin_lock(root_lock); + /* We need to make sure head->next_sched is read + * before clearing __QDISC_STATE_SCHED + */ + smp_mb__before_atomic(); + clear_bit(__QDISC_STATE_SCHED, &q->state); + qdisc_run(q); + spin_unlock(root_lock); } } } -- cgit v1.2.3 From e69985c67c33f1d981a87986237366e83a8f0e13 Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Sun, 5 Jun 2016 17:11:18 +0300 Subject: net/sched: cls_flower: Introduce support in SKIP SW flag In order to make a filter processed only by hardware, skip_sw flag should be supplied. This is an addition to the already existing skip_hw flag (filter will be processed by software only). If no flag is specified, filter will be processed by both software and hardware. If only hardware offloaded filters exist, fl_classify() will return without doing anything. A following userspace patch will be sent once kernel patch is accepted. Example: tc filter add dev enp0s9 protocol ip prio 20 parent ffff: \ flower \ ip_proto 6 \ indev enp0s9 \ skip_sw \ action skbedit mark 0x1234 Signed-off-by: Amir Vadai Acked-by: Jiri Pirko Acked-by: John Fastabend Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 730aacafc22d..d737492b925d 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -66,6 +66,7 @@ struct cls_fl_filter { struct fl_flow_key key; struct list_head list; u32 handle; + u32 flags; struct rcu_head rcu; }; @@ -123,6 +124,9 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct fl_flow_key skb_key; struct fl_flow_key skb_mkey; + if (!atomic_read(&head->ht.nelems)) + return -1; + fl_clear_masked_range(&skb_key, &head->mask); skb_key.indev_ifindex = skb->skb_iif; /* skb_flow_dissect() does not set n_proto in case an unknown protocol, @@ -136,7 +140,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, f = rhashtable_lookup_fast(&head->ht, fl_key_get_start(&skb_mkey, &head->mask), head->ht_params); - if (f) { + if (f && !(f->flags & TCA_CLS_FLAGS_SKIP_SW)) { *res = f->res; return tcf_exts_exec(skb, &f->exts, res); } @@ -524,7 +528,6 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, struct cls_fl_filter *fnew; struct nlattr *tb[TCA_FLOWER_MAX + 1]; struct fl_flow_mask mask = {}; - u32 flags = 0; int err; if (!tca[TCA_OPTIONS]) @@ -552,8 +555,14 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, } fnew->handle = handle; - if (tb[TCA_FLOWER_FLAGS]) - flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]); + if (tb[TCA_FLOWER_FLAGS]) { + fnew->flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]); + + if (!tc_flags_valid(fnew->flags)) { + err = -EINVAL; + goto errout; + } + } err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr); if (err) @@ -563,10 +572,12 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (err) goto errout; - err = rhashtable_insert_fast(&head->ht, &fnew->ht_node, - head->ht_params); - if (err) - goto errout; + if (!(fnew->flags & TCA_CLS_FLAGS_SKIP_SW)) { + err = rhashtable_insert_fast(&head->ht, &fnew->ht_node, + head->ht_params); + if (err) + goto errout; + } fl_hw_replace_filter(tp, &head->dissector, @@ -574,7 +585,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, &fnew->key, &fnew->exts, (unsigned long)fnew, - flags); + fnew->flags); if (fold) { rhashtable_remove_fast(&head->ht, &fold->ht_node, @@ -734,6 +745,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, sizeof(key->tp.dst)))) goto nla_put_failure; + nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags); + if (tcf_exts_dump(skb, &f->exts)) goto nla_put_failure; -- cgit v1.2.3 From 9c4a4e488bc8f55dfc8782c7d7757fb058e9088e Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Mon, 6 Jun 2016 06:32:53 -0400 Subject: net sched: actions use tcf_lastuse_update for consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_connmark.c | 2 +- net/sched/act_csum.c | 2 +- net/sched/act_ife.c | 6 +++--- net/sched/act_ipt.c | 2 +- net/sched/act_mirred.c | 1 - net/sched/act_nat.c | 2 +- net/sched/act_pedit.c | 2 +- net/sched/act_simple.c | 2 +- net/sched/act_skbedit.c | 2 +- net/sched/act_vlan.c | 2 +- 10 files changed, 11 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 2ba700c765e0..e0e6c6876bc7 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -44,7 +44,7 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, int proto; spin_lock(&ca->tcf_lock); - ca->tcf_tm.lastuse = jiffies; + tcf_lastuse_update(&ca->tcf_tm); bstats_update(&ca->tcf_bstats, skb); if (skb->protocol == htons(ETH_P_IP)) { diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 28e934ed038a..065f71618276 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -501,7 +501,7 @@ static int tcf_csum(struct sk_buff *skb, u32 update_flags; spin_lock(&p->tcf_lock); - p->tcf_tm.lastuse = jiffies; + tcf_lastuse_update(&p->tcf_tm); bstats_update(&p->tcf_bstats, skb); action = p->tcf_action; update_flags = p->update_flags; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 658046dfe02d..649157624f46 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -623,7 +623,7 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, spin_lock(&ife->tcf_lock); bstats_update(&ife->tcf_bstats, skb); - ife->tcf_tm.lastuse = jiffies; + tcf_lastuse_update(&ife->tcf_tm); spin_unlock(&ife->tcf_lock); ifehdrln = ntohs(ifehdrln); @@ -711,7 +711,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, spin_lock(&ife->tcf_lock); bstats_update(&ife->tcf_bstats, skb); - ife->tcf_tm.lastuse = jiffies; + tcf_lastuse_update(&ife->tcf_tm); if (!metalen) { /* no metadata to send */ /* abuse overlimits to count when we allow packet @@ -802,7 +802,7 @@ static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, pr_info_ratelimited("unknown failure(policy neither de/encode\n"); spin_lock(&ife->tcf_lock); bstats_update(&ife->tcf_bstats, skb); - ife->tcf_tm.lastuse = jiffies; + tcf_lastuse_update(&ife->tcf_tm); ife->tcf_qstats.drops++; spin_unlock(&ife->tcf_lock); diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 9f002ada7074..30e9087b3536 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -212,7 +212,7 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, spin_lock(&ipt->tcf_lock); - ipt->tcf_tm.lastuse = jiffies; + tcf_lastuse_update(&ipt->tcf_tm); bstats_update(&ipt->tcf_bstats, skb); /* yes, we have to worry about both in and out dev diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 128942bc9e42..d3ac73e90f00 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -157,7 +157,6 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, u32 at; tcf_lastuse_update(&m->tcf_tm); - bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); rcu_read_lock(); diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index c0a879f940de..9135aa8f2970 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -103,7 +103,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, spin_lock(&p->tcf_lock); - p->tcf_tm.lastuse = jiffies; + tcf_lastuse_update(&p->tcf_tm); old_addr = p->old_addr; new_addr = p->new_addr; mask = p->mask; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index c6e18f230af6..67a17265c967 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -121,7 +121,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, spin_lock(&p->tcf_lock); - p->tcf_tm.lastuse = jiffies; + tcf_lastuse_update(&p->tcf_tm); if (p->tcfp_nkeys > 0) { struct tc_pedit_key *tkey = p->tcfp_keys; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index e42f8daca147..f95d1c596986 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -35,7 +35,7 @@ static int tcf_simp(struct sk_buff *skb, const struct tc_action *a, struct tcf_defact *d = a->priv; spin_lock(&d->tcf_lock); - d->tcf_tm.lastuse = jiffies; + tcf_lastuse_update(&d->tcf_tm); bstats_update(&d->tcf_bstats, skb); /* print policy string followed by _ then packet count diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index e928802966bc..82105c8cb60f 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -37,7 +37,7 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_skbedit *d = a->priv; spin_lock(&d->tcf_lock); - d->tcf_tm.lastuse = jiffies; + tcf_lastuse_update(&d->tcf_tm); bstats_update(&d->tcf_bstats, skb); if (d->flags & SKBEDIT_F_PRIORITY) diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index ac4adc812c12..da5120f5513f 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -31,7 +31,7 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, int err; spin_lock(&v->tcf_lock); - v->tcf_tm.lastuse = jiffies; + tcf_lastuse_update(&v->tcf_tm); bstats_update(&v->tcf_bstats, skb); action = v->tcf_action; -- cgit v1.2.3 From 53eb440f4ada034ea43b295891feec3df0fa7a29 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Mon, 6 Jun 2016 06:32:54 -0400 Subject: net sched actions: introduce timestamp for firsttime use Useful to know when the action was first used for accounting (and debugging) Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_api.c | 1 + net/sched/act_bpf.c | 1 + net/sched/act_connmark.c | 1 + net/sched/act_csum.c | 1 + net/sched/act_gact.c | 1 + net/sched/act_ipt.c | 1 + net/sched/act_mirred.c | 1 + net/sched/act_nat.c | 1 + net/sched/act_pedit.c | 1 + net/sched/act_police.c | 2 ++ net/sched/act_simple.c | 1 + net/sched/act_skbedit.c | 1 + net/sched/act_vlan.c | 1 + 13 files changed, 14 insertions(+) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 336774a535c3..5ebf6d6f85f6 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -283,6 +283,7 @@ err2: p->tcfc_index = index ? index : tcf_hash_new_index(tn); p->tcfc_tm.install = jiffies; p->tcfc_tm.lastuse = jiffies; + p->tcfc_tm.firstuse = 0; if (est) { err = gen_new_estimator(&p->tcfc_bstats, p->cpu_bstats, &p->tcfc_rate_est, diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index c7123e01c2ca..e4b877f9d322 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -156,6 +156,7 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, tm.install = jiffies_to_clock_t(jiffies - prog->tcf_tm.install); tm.lastuse = jiffies_to_clock_t(jiffies - prog->tcf_tm.lastuse); + tm.firstuse = jiffies_to_clock_t(jiffies - prog->tcf_tm.firstuse); tm.expires = jiffies_to_clock_t(prog->tcf_tm.expires); if (nla_put_64bit(skb, TCA_ACT_BPF_TM, sizeof(tm), &tm, diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index e0e6c6876bc7..e3f64f2d6206 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -163,6 +163,7 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, t.install = jiffies_to_clock_t(jiffies - ci->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - ci->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(ci->tcf_tm.expires); + t.firstuse = jiffies_to_clock_t(jiffies - ci->tcf_tm.firstuse); if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t, TCA_CONNMARK_PAD)) goto nla_put_failure; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 065f71618276..7725eafbe581 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -548,6 +548,7 @@ static int tcf_csum_dump(struct sk_buff *skb, goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); + t.firstuse = jiffies_to_clock_t(jiffies - p->tcf_tm.firstuse); t.expires = jiffies_to_clock_t(p->tcf_tm.expires); if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD)) goto nla_put_failure; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index ec5cc8435238..c9d59f38a3f8 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -190,6 +190,7 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int #endif t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse); + t.firstuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.firstuse); t.expires = jiffies_to_clock_t(gact->tcf_tm.expires); if (nla_put_64bit(skb, TCA_GACT_TM, sizeof(t), &t, TCA_GACT_PAD)) goto nla_put_failure; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 30e9087b3536..47525ee201a9 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -279,6 +279,7 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int goto nla_put_failure; tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install); tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse); + tm.firstuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.firstuse); tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires); if (nla_put_64bit(skb, TCA_IPT_TM, sizeof(tm), &tm, TCA_IPT_PAD)) goto nla_put_failure; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index d3ac73e90f00..1b06093f8ef8 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -220,6 +220,7 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, i goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse); + t.firstuse = jiffies_to_clock_t(jiffies - m->tcf_tm.firstuse); t.expires = jiffies_to_clock_t(m->tcf_tm.expires); if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD)) goto nla_put_failure; diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 9135aa8f2970..9fbf780a04c7 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -266,6 +266,7 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); + t.firstuse = jiffies_to_clock_t(jiffies - p->tcf_tm.firstuse); t.expires = jiffies_to_clock_t(p->tcf_tm.expires); if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD)) goto nla_put_failure; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 67a17265c967..fb89275bc595 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -202,6 +202,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); + t.firstuse = jiffies_to_clock_t(jiffies - p->tcf_tm.firstuse); t.expires = jiffies_to_clock_t(p->tcf_tm.expires); if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) goto nla_put_failure; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index b884dae692a1..820b11686f85 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -241,6 +241,7 @@ override: tcf_hash_new_index(tn); police->tcf_tm.install = jiffies; police->tcf_tm.lastuse = jiffies; + police->tcf_tm.firstuse = 0; h = tcf_hash(police->tcf_index, POL_TAB_MASK); spin_lock_bh(&hinfo->lock); hlist_add_head(&police->tcf_head, &hinfo->htab[h]); @@ -347,6 +348,7 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - police->tcf_tm.lastuse); + t.firstuse = jiffies_to_clock_t(jiffies - police->tcf_tm.firstuse); t.expires = jiffies_to_clock_t(police->tcf_tm.expires); if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD)) goto nla_put_failure; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index f95d1c596986..81040f1cc3bb 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -160,6 +160,7 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); + t.firstuse = jiffies_to_clock_t(jiffies - d->tcf_tm.firstuse); t.expires = jiffies_to_clock_t(d->tcf_tm.expires); if (nla_put_64bit(skb, TCA_DEF_TM, sizeof(t), &t, TCA_DEF_PAD)) goto nla_put_failure; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 82105c8cb60f..cf34f31f4106 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -170,6 +170,7 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); + t.firstuse = jiffies_to_clock_t(jiffies - d->tcf_tm.firstuse); t.expires = jiffies_to_clock_t(d->tcf_tm.expires); if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD)) goto nla_put_failure; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index da5120f5513f..978ec4c89684 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -184,6 +184,7 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, t.install = jiffies_to_clock_t(jiffies - v->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - v->tcf_tm.lastuse); + t.firstuse = jiffies_to_clock_t(jiffies - v->tcf_tm.firstuse); t.expires = jiffies_to_clock_t(v->tcf_tm.expires); if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD)) goto nla_put_failure; -- cgit v1.2.3 From 48d8ee1694dd1ab25614b58f968123a4598f887e Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Mon, 6 Jun 2016 06:32:55 -0400 Subject: net sched actions: aggregate dumping of actions timeinfo Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_bpf.c | 6 +----- net/sched/act_connmark.c | 5 +---- net/sched/act_csum.c | 6 ++---- net/sched/act_gact.c | 5 +---- net/sched/act_ife.c | 4 +--- net/sched/act_ipt.c | 7 +++---- net/sched/act_mirred.c | 6 ++---- net/sched/act_nat.c | 6 ++---- net/sched/act_pedit.c | 7 +++---- net/sched/act_simple.c | 6 ++---- net/sched/act_skbedit.c | 6 ++---- net/sched/act_vlan.c | 5 +---- 12 files changed, 21 insertions(+), 48 deletions(-) (limited to 'net') diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index e4b877f9d322..ae0e7cbe488c 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -154,11 +154,7 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, if (ret) goto nla_put_failure; - tm.install = jiffies_to_clock_t(jiffies - prog->tcf_tm.install); - tm.lastuse = jiffies_to_clock_t(jiffies - prog->tcf_tm.lastuse); - tm.firstuse = jiffies_to_clock_t(jiffies - prog->tcf_tm.firstuse); - tm.expires = jiffies_to_clock_t(prog->tcf_tm.expires); - + tcf_tm_dump(&tm, &prog->tcf_tm); if (nla_put_64bit(skb, TCA_ACT_BPF_TM, sizeof(tm), &tm, TCA_ACT_BPF_PAD)) goto nla_put_failure; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index e3f64f2d6206..35a5270f289d 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -160,10 +160,7 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put(skb, TCA_CONNMARK_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - ci->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - ci->tcf_tm.lastuse); - t.expires = jiffies_to_clock_t(ci->tcf_tm.expires); - t.firstuse = jiffies_to_clock_t(jiffies - ci->tcf_tm.firstuse); + tcf_tm_dump(&t, &ci->tcf_tm); if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t, TCA_CONNMARK_PAD)) goto nla_put_failure; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 7725eafbe581..dcd9ababd351 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -546,10 +546,8 @@ static int tcf_csum_dump(struct sk_buff *skb, if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); - t.firstuse = jiffies_to_clock_t(jiffies - p->tcf_tm.firstuse); - t.expires = jiffies_to_clock_t(p->tcf_tm.expires); + + tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD)) goto nla_put_failure; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index c9d59f38a3f8..4c6e0085054a 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -188,10 +188,7 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int goto nla_put_failure; } #endif - t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse); - t.firstuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.firstuse); - t.expires = jiffies_to_clock_t(gact->tcf_tm.expires); + tcf_tm_dump(&t, &gact->tcf_tm); if (nla_put_64bit(skb, TCA_GACT_TM, sizeof(t), &t, TCA_GACT_PAD)) goto nla_put_failure; return skb->len; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 649157624f46..02f5a8ba95d7 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -553,9 +553,7 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, if (nla_put(skb, TCA_IFE_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - ife->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - ife->tcf_tm.lastuse); - t.expires = jiffies_to_clock_t(ife->tcf_tm.expires); + tcf_tm_dump(&t, &ife->tcf_tm); if (nla_put_64bit(skb, TCA_IFE_TM, sizeof(t), &t, TCA_IFE_PAD)) goto nla_put_failure; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 47525ee201a9..3fcde44b8f4d 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -277,12 +277,11 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int nla_put(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c) || nla_put_string(skb, TCA_IPT_TABLE, ipt->tcfi_tname)) goto nla_put_failure; - tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install); - tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse); - tm.firstuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.firstuse); - tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires); + + tcf_tm_dump(&tm, &ipt->tcf_tm); if (nla_put_64bit(skb, TCA_IPT_TM, sizeof(tm), &tm, TCA_IPT_PAD)) goto nla_put_failure; + kfree(t); return skb->len; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 1b06093f8ef8..787751a7981a 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -218,10 +218,8 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, i if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse); - t.firstuse = jiffies_to_clock_t(jiffies - m->tcf_tm.firstuse); - t.expires = jiffies_to_clock_t(m->tcf_tm.expires); + + tcf_tm_dump(&t, &m->tcf_tm); if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD)) goto nla_put_failure; return skb->len; diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 9fbf780a04c7..06ccb03f25da 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -264,10 +264,8 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); - t.firstuse = jiffies_to_clock_t(jiffies - p->tcf_tm.firstuse); - t.expires = jiffies_to_clock_t(p->tcf_tm.expires); + + tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD)) goto nla_put_failure; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index fb89275bc595..82d3c1479029 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -200,12 +200,11 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put(skb, TCA_PEDIT_PARMS, s, opt)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); - t.firstuse = jiffies_to_clock_t(jiffies - p->tcf_tm.firstuse); - t.expires = jiffies_to_clock_t(p->tcf_tm.expires); + + tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) goto nla_put_failure; + kfree(opt); return skb->len; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 81040f1cc3bb..be5fbb51cfed 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -158,10 +158,8 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) || nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); - t.firstuse = jiffies_to_clock_t(jiffies - d->tcf_tm.firstuse); - t.expires = jiffies_to_clock_t(d->tcf_tm.expires); + + tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_DEF_TM, sizeof(t), &t, TCA_DEF_PAD)) goto nla_put_failure; return skb->len; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index cf34f31f4106..7e2bc3c2b6da 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -168,10 +168,8 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, nla_put(skb, TCA_SKBEDIT_MARK, sizeof(d->mark), &d->mark)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); - t.firstuse = jiffies_to_clock_t(jiffies - d->tcf_tm.firstuse); - t.expires = jiffies_to_clock_t(d->tcf_tm.expires); + + tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD)) goto nla_put_failure; return skb->len; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 978ec4c89684..f0a08a11f54f 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -182,10 +182,7 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, v->tcfv_push_proto))) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - v->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - v->tcf_tm.lastuse); - t.firstuse = jiffies_to_clock_t(jiffies - v->tcf_tm.firstuse); - t.expires = jiffies_to_clock_t(v->tcf_tm.expires); + tcf_tm_dump(&t, &v->tcf_tm); if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD)) goto nla_put_failure; return skb->len; -- cgit v1.2.3 From 0b0f43fe2e7291aa97b1febeaa5a0de453d007ca Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 5 Jun 2016 10:41:32 -0400 Subject: net sched: indentation and other OCD stylistic fixes Signed-off-by: Jamal Hadi Salim Acked-by: Cong Wang --- net/sched/act_api.c | 19 +++++++++++-------- net/sched/act_bpf.c | 3 ++- net/sched/act_gact.c | 3 ++- net/sched/act_ipt.c | 6 ++++-- net/sched/act_vlan.c | 3 ++- net/sched/cls_api.c | 11 +++++++---- 6 files changed, 28 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 5ebf6d6f85f6..719bc2e85852 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -504,8 +504,8 @@ nla_put_failure: } EXPORT_SYMBOL(tcf_action_dump_1); -int -tcf_action_dump(struct sk_buff *skb, struct list_head *actions, int bind, int ref) +int tcf_action_dump(struct sk_buff *skb, struct list_head *actions, + int bind, int ref) { struct tc_action *a; int err = -EINVAL; @@ -688,9 +688,9 @@ errout: return -1; } -static int -tca_get_fill(struct sk_buff *skb, struct list_head *actions, u32 portid, u32 seq, - u16 flags, int event, int bind, int ref) +static int tca_get_fill(struct sk_buff *skb, struct list_head *actions, + u32 portid, u32 seq, u16 flags, int event, int bind, + int ref) { struct tcamsg *t; struct nlmsghdr *nlh; @@ -731,7 +731,8 @@ act_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; - if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, 0, 0) <= 0) { + if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, + 0, 0) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -839,7 +840,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, if (a.ops == NULL) /*some idjot trying to flush unknown action */ goto err_out; - nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t), 0); + nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, + sizeof(*t), 0); if (!nlh) goto out_module_put; t = nlmsg_data(nlh); @@ -1002,7 +1004,8 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n) u32 portid = skb ? NETLINK_CB(skb).portid : 0; int ret = 0, ovr = 0; - if ((n->nlmsg_type != RTM_GETACTION) && !netlink_capable(skb, CAP_NET_ADMIN)) + if ((n->nlmsg_type != RTM_GETACTION) && + !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL); diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index ae0e7cbe488c..f7b6cf49ea6f 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -169,7 +169,8 @@ nla_put_failure: static const struct nla_policy act_bpf_policy[TCA_ACT_BPF_MAX + 1] = { [TCA_ACT_BPF_PARMS] = { .len = sizeof(struct tc_act_bpf) }, [TCA_ACT_BPF_FD] = { .type = NLA_U32 }, - [TCA_ACT_BPF_NAME] = { .type = NLA_NUL_STRING, .len = ACT_BPF_NAME_LEN }, + [TCA_ACT_BPF_NAME] = { .type = NLA_NUL_STRING, + .len = ACT_BPF_NAME_LEN }, [TCA_ACT_BPF_OPS_LEN] = { .type = NLA_U16 }, [TCA_ACT_BPF_OPS] = { .type = NLA_BINARY, .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 4c6e0085054a..19058a7f3e5c 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -162,7 +162,8 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets, tm->lastuse = lastuse; } -static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_gact *gact = a->priv; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 3fcde44b8f4d..e7c0f4d944a2 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -34,7 +34,8 @@ static int ipt_net_id; static int xt_net_id; -static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook) +static int ipt_init_target(struct xt_entry_target *t, char *table, + unsigned int hook) { struct xt_tgchk_param par; struct xt_target *target; @@ -250,7 +251,8 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, } -static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, + int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_ipt *ipt = a->priv; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index f0a08a11f54f..b075d50e0fc3 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -179,7 +179,8 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, if (v->tcfv_action == TCA_VLAN_ACT_PUSH && (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, v->tcfv_push_vid) || - nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, v->tcfv_push_proto))) + nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, + v->tcfv_push_proto))) goto nla_put_failure; tcf_tm_dump(&t, &v->tcf_tm); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index a75864d93142..aafa6bce173e 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -351,8 +351,9 @@ errout: return err; } -static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp, - unsigned long fh, u32 portid, u32 seq, u16 flags, int event) +static int tcf_fill_node(struct net *net, struct sk_buff *skb, + struct tcf_proto *tp, unsigned long fh, u32 portid, + u32 seq, u16 flags, int event) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -474,9 +475,11 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) TC_H_MIN(tcm->tcm_info) != tp->protocol) continue; if (t > s_t) - memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); + memset(&cb->args[1], 0, + sizeof(cb->args)-sizeof(cb->args[0])); if (cb->args[1] == 0) { - if (tcf_fill_node(net, skb, tp, 0, NETLINK_CB(cb->skb).portid, + if (tcf_fill_node(net, skb, tp, 0, + NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) break; -- cgit v1.2.3 From f9eb8aea2a1e12fc2f584d1627deeb957435a801 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Jun 2016 09:37:15 -0700 Subject: net_sched: transform qdisc running bit into a seqcount Instead of using a single bit (__QDISC___STATE_RUNNING) in sch->__state, use a seqcount. This adds lockdep support, but more importantly it will allow us to sample qdisc/class statistics without having to grab qdisc root lock. Signed-off-by: Eric Dumazet Cc: Cong Wang Cc: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/bluetooth/6lowpan.c | 2 ++ net/core/dev.c | 2 +- net/ieee802154/6lowpan/core.c | 3 +++ net/l2tp/l2tp_eth.c | 4 ++++ net/sched/sch_generic.c | 14 ++++++++++---- 5 files changed, 20 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 780089d75915..977a11e418d0 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -629,6 +629,7 @@ static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) static struct lock_class_key bt_tx_busylock; static struct lock_class_key bt_netdev_xmit_lock_key; +static struct lock_class_key bt_qdisc_running_key; static void bt_set_lockdep_class_one(struct net_device *dev, struct netdev_queue *txq, @@ -641,6 +642,7 @@ static int bt_dev_init(struct net_device *dev) { netdev_for_each_tx_queue(dev, bt_set_lockdep_class_one, NULL); dev->qdisc_tx_busylock = &bt_tx_busylock; + dev->qdisc_running_key = &bt_qdisc_running_key; return 0; } diff --git a/net/core/dev.c b/net/core/dev.c index 896b686d1966..e0bcc39f4a7d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3075,7 +3075,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. - * This permits __QDISC___STATE_RUNNING owner to get the lock more + * This permits qdisc->running owner to get the lock more * often and dequeue packets faster. */ contended = qdisc_is_running(q); diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index dd085db8580e..14aa5effd29a 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -60,6 +60,7 @@ static struct header_ops lowpan_header_ops = { static struct lock_class_key lowpan_tx_busylock; static struct lock_class_key lowpan_netdev_xmit_lock_key; +static struct lock_class_key lowpan_qdisc_running_key; static void lowpan_set_lockdep_class_one(struct net_device *ldev, struct netdev_queue *txq, @@ -73,6 +74,8 @@ static int lowpan_dev_init(struct net_device *ldev) { netdev_for_each_tx_queue(ldev, lowpan_set_lockdep_class_one, NULL); ldev->qdisc_tx_busylock = &lowpan_tx_busylock; + ldev->qdisc_running_key = &lowpan_qdisc_running_key; + return 0; } diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index e253c26f31ac..c00d72d182fa 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -68,6 +68,8 @@ static inline struct l2tp_eth_net *l2tp_eth_pernet(struct net *net) } static struct lock_class_key l2tp_eth_tx_busylock; +static struct lock_class_key l2tp_qdisc_running_key; + static int l2tp_eth_dev_init(struct net_device *dev) { struct l2tp_eth *priv = netdev_priv(dev); @@ -76,6 +78,8 @@ static int l2tp_eth_dev_init(struct net_device *dev) eth_hw_addr_random(dev); eth_broadcast_addr(dev->broadcast); dev->qdisc_tx_busylock = &l2tp_eth_tx_busylock; + dev->qdisc_running_key = &l2tp_qdisc_running_key; + return 0; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 269dd71b3828..cebea73e70ac 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -110,7 +110,7 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, /* * Transmit possibly several skbs, and handle the return status as - * required. Holding the __QDISC___STATE_RUNNING bit guarantees that + * required. Owning running seqcount bit guarantees that * only one CPU can execute this function. * * Returns to the caller: @@ -137,10 +137,10 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, HARD_TX_UNLOCK(dev, txq); } else { - spin_lock(root_lock); + spin_lock_nested(root_lock, SINGLE_DEPTH_NESTING); return qdisc_qlen(q); } - spin_lock(root_lock); + spin_lock_nested(root_lock, SINGLE_DEPTH_NESTING); if (dev_xmit_complete(ret)) { /* Driver sent out skb successfully or skb was consumed */ @@ -163,7 +163,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, /* * NOTE: Called under qdisc_lock(q) with locally disabled BH. * - * __QDISC___STATE_RUNNING guarantees only one CPU can process + * running seqcount guarantees only one CPU can process * this qdisc at a time. qdisc_lock(q) serializes queue accesses for * this queue. * @@ -379,6 +379,7 @@ struct Qdisc noop_qdisc = { .list = LIST_HEAD_INIT(noop_qdisc.list), .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), .dev_queue = &noop_netdev_queue, + .running = SEQCNT_ZERO(noop_qdisc.running), .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), }; EXPORT_SYMBOL(noop_qdisc); @@ -537,6 +538,7 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { EXPORT_SYMBOL(pfifo_fast_ops); static struct lock_class_key qdisc_tx_busylock; +static struct lock_class_key qdisc_running_key; struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops) @@ -570,6 +572,10 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, lockdep_set_class(&sch->busylock, dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + seqcount_init(&sch->running); + lockdep_set_class(&sch->running, + dev->qdisc_running_key ?: &qdisc_running_key); + sch->ops = ops; sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; -- cgit v1.2.3 From edb09eb17ed89eaa82a52dd306beac93e292b485 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Jun 2016 09:37:16 -0700 Subject: net: sched: do not acquire qdisc spinlock in qdisc/class stats dump Large tc dumps (tc -s {qdisc|class} sh dev ethX) done by Google BwE host agent [1] are problematic at scale : For each qdisc/class found in the dump, we currently lock the root qdisc spinlock in order to get stats. Sampling stats every 5 seconds from thousands of HTB classes is a challenge when the root qdisc spinlock is under high pressure. Not only the dumps take time, they also slow down the fast path (queue/dequeue packets) by 10 % to 20 % in some cases. An audit of existing qdiscs showed that sch_fq_codel is the only qdisc that might need the qdisc lock in fq_codel_dump_stats() and fq_codel_dump_class_stats() In v2 of this patch, I now use the Qdisc running seqcount to provide consistent reads of packets/bytes counters, regardless of 32/64 bit arches. I also changed rate estimators to use the same infrastructure so that they no longer need to lock root qdisc lock. [1] http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43838.pdf Signed-off-by: Eric Dumazet Cc: Cong Wang Cc: Jamal Hadi Salim Cc: John Fastabend Cc: Kevin Athey Cc: Xiaotian Pei Signed-off-by: David S. Miller --- net/core/gen_estimator.c | 24 ++++++++++++++++-------- net/core/gen_stats.c | 34 +++++++++++++++++++++++----------- net/netfilter/xt_RATEEST.c | 2 +- net/sched/act_api.c | 4 ++-- net/sched/act_police.c | 3 ++- net/sched/sch_api.c | 21 +++++++++++---------- net/sched/sch_atm.c | 3 ++- net/sched/sch_cbq.c | 9 ++++++--- net/sched/sch_drr.c | 9 ++++++--- net/sched/sch_fq_codel.c | 15 +++++++++++---- net/sched/sch_hfsc.c | 10 +++++----- net/sched/sch_htb.c | 11 ++++++----- net/sched/sch_mq.c | 2 +- net/sched/sch_mqprio.c | 11 +++++++---- net/sched/sch_multiq.c | 3 ++- net/sched/sch_prio.c | 3 ++- net/sched/sch_qfq.c | 9 ++++++--- 17 files changed, 109 insertions(+), 64 deletions(-) (limited to 'net') diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 4573d81093fe..cad8e791f28e 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -84,6 +84,7 @@ struct gen_estimator struct gnet_stats_basic_packed *bstats; struct gnet_stats_rate_est64 *rate_est; spinlock_t *stats_lock; + seqcount_t *running; int ewma_log; u32 last_packets; unsigned long avpps; @@ -121,26 +122,28 @@ static void est_timer(unsigned long arg) unsigned long rate; u64 brate; - spin_lock(e->stats_lock); + if (e->stats_lock) + spin_lock(e->stats_lock); read_lock(&est_lock); if (e->bstats == NULL) goto skip; - __gnet_stats_copy_basic(&b, e->cpu_bstats, e->bstats); + __gnet_stats_copy_basic(e->running, &b, e->cpu_bstats, e->bstats); brate = (b.bytes - e->last_bytes)<<(7 - idx); e->last_bytes = b.bytes; e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log); - e->rate_est->bps = (e->avbps+0xF)>>5; + WRITE_ONCE(e->rate_est->bps, (e->avbps + 0xF) >> 5); rate = b.packets - e->last_packets; rate <<= (7 - idx); e->last_packets = b.packets; e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log); - e->rate_est->pps = (e->avpps + 0xF) >> 5; + WRITE_ONCE(e->rate_est->pps, (e->avpps + 0xF) >> 5); skip: read_unlock(&est_lock); - spin_unlock(e->stats_lock); + if (e->stats_lock) + spin_unlock(e->stats_lock); } if (!list_empty(&elist[idx].list)) @@ -194,6 +197,7 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics * @stats_lock: statistics lock + * @running: qdisc running seqcount * @opt: rate estimator configuration TLV * * Creates a new rate estimator with &bstats as source and &rate_est @@ -209,6 +213,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, + seqcount_t *running, struct nlattr *opt) { struct gen_estimator *est; @@ -226,12 +231,13 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, if (est == NULL) return -ENOBUFS; - __gnet_stats_copy_basic(&b, cpu_bstats, bstats); + __gnet_stats_copy_basic(running, &b, cpu_bstats, bstats); idx = parm->interval + 2; est->bstats = bstats; est->rate_est = rate_est; est->stats_lock = stats_lock; + est->running = running; est->ewma_log = parm->ewma_log; est->last_bytes = b.bytes; est->avbps = rate_est->bps<<5; @@ -291,6 +297,7 @@ EXPORT_SYMBOL(gen_kill_estimator); * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics * @stats_lock: statistics lock + * @running: qdisc running seqcount (might be NULL) * @opt: rate estimator configuration TLV * * Replaces the configuration of a rate estimator by calling @@ -301,10 +308,11 @@ EXPORT_SYMBOL(gen_kill_estimator); int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct gnet_stats_rate_est64 *rate_est, - spinlock_t *stats_lock, struct nlattr *opt) + spinlock_t *stats_lock, + seqcount_t *running, struct nlattr *opt) { gen_kill_estimator(bstats, rate_est); - return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, opt); + return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt); } EXPORT_SYMBOL(gen_replace_estimator); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index f96ee8b9478d..d9c210caff32 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -32,10 +32,11 @@ gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr) return 0; nla_put_failure: + if (d->lock) + spin_unlock_bh(d->lock); kfree(d->xstats); d->xstats = NULL; d->xstats_len = 0; - spin_unlock_bh(d->lock); return -1; } @@ -65,15 +66,16 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type, { memset(d, 0, sizeof(*d)); - spin_lock_bh(lock); - d->lock = lock; if (type) d->tail = (struct nlattr *)skb_tail_pointer(skb); d->skb = skb; d->compat_tc_stats = tc_stats_type; d->compat_xstats = xstats_type; d->padattr = padattr; - + if (lock) { + d->lock = lock; + spin_lock_bh(lock); + } if (d->tail) return gnet_stats_copy(d, type, NULL, 0, padattr); @@ -126,16 +128,23 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats, } void -__gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats, +__gnet_stats_copy_basic(const seqcount_t *running, + struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu, struct gnet_stats_basic_packed *b) { + unsigned int seq; + if (cpu) { __gnet_stats_copy_basic_cpu(bstats, cpu); - } else { + return; + } + do { + if (running) + seq = read_seqcount_begin(running); bstats->bytes = b->bytes; bstats->packets = b->packets; - } + } while (running && read_seqcount_retry(running, seq)); } EXPORT_SYMBOL(__gnet_stats_copy_basic); @@ -152,13 +161,14 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic); * if the room in the socket buffer was not sufficient. */ int -gnet_stats_copy_basic(struct gnet_dump *d, +gnet_stats_copy_basic(const seqcount_t *running, + struct gnet_dump *d, struct gnet_stats_basic_cpu __percpu *cpu, struct gnet_stats_basic_packed *b) { struct gnet_stats_basic_packed bstats = {0}; - __gnet_stats_copy_basic(&bstats, cpu, b); + __gnet_stats_copy_basic(running, &bstats, cpu, b); if (d->compat_tc_stats) { d->tc_stats.bytes = bstats.bytes; @@ -328,8 +338,9 @@ gnet_stats_copy_app(struct gnet_dump *d, void *st, int len) return 0; err_out: + if (d->lock) + spin_unlock_bh(d->lock); d->xstats_len = 0; - spin_unlock_bh(d->lock); return -1; } EXPORT_SYMBOL(gnet_stats_copy_app); @@ -363,10 +374,11 @@ gnet_stats_finish_copy(struct gnet_dump *d) return -1; } + if (d->lock) + spin_unlock_bh(d->lock); kfree(d->xstats); d->xstats = NULL; d->xstats_len = 0; - spin_unlock_bh(d->lock); return 0; } EXPORT_SYMBOL(gnet_stats_finish_copy); diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 604df6fae6fc..515131f9e021 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -137,7 +137,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) cfg.est.ewma_log = info->ewma_log; ret = gen_new_estimator(&est->bstats, NULL, &est->rstats, - &est->lock, &cfg.opt); + &est->lock, NULL, &cfg.opt); if (ret < 0) goto err2; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 719bc2e85852..b6db56ec8117 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -287,7 +287,7 @@ err2: if (est) { err = gen_new_estimator(&p->tcfc_bstats, p->cpu_bstats, &p->tcfc_rate_est, - &p->tcfc_lock, est); + &p->tcfc_lock, NULL, est); if (err) { free_percpu(p->cpu_qstats); goto err2; @@ -671,7 +671,7 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, if (err < 0) goto errout; - if (gnet_stats_copy_basic(&d, p->cpu_bstats, &p->tcfc_bstats) < 0 || + if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfc_bstats) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfc_bstats, &p->tcfc_rate_est) < 0 || gnet_stats_copy_queue(&d, p->cpu_qstats, diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 820b11686f85..bcb31142556b 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -185,7 +185,8 @@ override: if (est) { err = gen_replace_estimator(&police->tcf_bstats, NULL, &police->tcf_rate_est, - &police->tcf_lock, est); + &police->tcf_lock, + NULL, est); if (err) goto failure_unlock; } else if (tb[TCA_POLICE_AVRATE] && diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index ddf047df5361..d4a8bbfcc953 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -982,7 +982,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, rcu_assign_pointer(sch->stab, stab); } if (tca[TCA_RATE]) { - spinlock_t *root_lock; + seqcount_t *running; err = -EOPNOTSUPP; if (sch->flags & TCQ_F_MQROOT) @@ -991,14 +991,15 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, if ((sch->parent != TC_H_ROOT) && !(sch->flags & TCQ_F_INGRESS) && (!p || !(p->flags & TCQ_F_MQROOT))) - root_lock = qdisc_root_sleeping_lock(sch); + running = qdisc_root_sleeping_running(sch); else - root_lock = qdisc_lock(sch); + running = &sch->running; err = gen_new_estimator(&sch->bstats, sch->cpu_bstats, &sch->rate_est, - root_lock, + NULL, + running, tca[TCA_RATE]); if (err) goto err_out4; @@ -1061,7 +1062,8 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca) gen_replace_estimator(&sch->bstats, sch->cpu_bstats, &sch->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); } out: @@ -1369,8 +1371,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, goto nla_put_failure; if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, - qdisc_root_sleeping_lock(q), &d, - TCA_PAD) < 0) + NULL, &d, TCA_PAD) < 0) goto nla_put_failure; if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) @@ -1381,7 +1382,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, cpu_qstats = q->cpu_qstats; } - if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q), + &d, cpu_bstats, &q->bstats) < 0 || gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 || gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0) goto nla_put_failure; @@ -1684,8 +1686,7 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, goto nla_put_failure; if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, - qdisc_root_sleeping_lock(q), &d, - TCA_PAD) < 0) + NULL, &d, TCA_PAD) < 0) goto nla_put_failure; if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0) diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 1911af3ca7c0..34f8f79e56d5 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -637,7 +637,8 @@ atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg, { struct atm_flow_data *flow = (struct atm_flow_data *)arg; - if (gnet_stats_copy_basic(d, NULL, &flow->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &flow->bstats) < 0 || gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0) return -1; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index baafddf229ce..1b8128fb845d 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1600,7 +1600,8 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (cl->undertime != PSCHED_PASTPERFECT) cl->xstats.undertime = cl->undertime - q->now; - if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0) return -1; @@ -1755,7 +1756,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) { qdisc_put_rtab(rtab); @@ -1848,7 +1850,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) { kfree(cl); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index a63e879e8975..1b7e1a27773d 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -91,7 +91,8 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) return err; @@ -119,7 +120,8 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) { qdisc_destroy(cl->qdisc); @@ -279,7 +281,8 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (qlen) xstats.deficit = cl->deficit; - if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0) return -1; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 6883a8971562..1daa54237f4e 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -566,11 +566,13 @@ static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.qdisc_stats.memory_usage = q->memory_usage; st.qdisc_stats.drop_overmemory = q->drop_overmemory; + sch_tree_lock(sch); list_for_each(pos, &q->new_flows) st.qdisc_stats.new_flows_len++; list_for_each(pos, &q->old_flows) st.qdisc_stats.old_flows_len++; + sch_tree_unlock(sch); return gnet_stats_copy_app(d, &st, sizeof(st)); } @@ -624,7 +626,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, if (idx < q->flows_cnt) { const struct fq_codel_flow *flow = &q->flows[idx]; - const struct sk_buff *skb = flow->head; + const struct sk_buff *skb; memset(&xstats, 0, sizeof(xstats)); xstats.type = TCA_FQ_CODEL_XSTATS_CLASS; @@ -642,9 +644,14 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, codel_time_to_us(delta) : -codel_time_to_us(-delta); } - while (skb) { - qs.qlen++; - skb = skb->next; + if (flow->head) { + sch_tree_lock(sch); + skb = flow->head; + while (skb) { + qs.qlen++; + skb = skb->next; + } + sch_tree_unlock(sch); } qs.backlog = q->backlogs[idx]; qs.drops = flow->dropped; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index d783d7cc3348..74813dd49053 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1015,11 +1015,10 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, cur_time = psched_get_time(); if (tca[TCA_RATE]) { - spinlock_t *lock = qdisc_root_sleeping_lock(sch); - err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - lock, + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) return err; @@ -1068,7 +1067,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) { kfree(cl); @@ -1373,7 +1373,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, xstats.work = cl->cl_total; xstats.rtwork = cl->cl_cumul; - if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->qdisc->q.qlen) < 0) return -1; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index d4b4218af6b1..2b057649f24b 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1141,7 +1141,8 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens); cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens); - if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) return -1; @@ -1395,7 +1396,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (htb_rate_est || tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE] ? : &est.nla); if (err) { kfree(cl); @@ -1457,11 +1459,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, parent->children++; } else { if (tca[TCA_RATE]) { - spinlock_t *lock = qdisc_root_sleeping_lock(sch); - err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - lock, + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) return err; diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 56a77b878eb3..b9439827c172 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -199,7 +199,7 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct netdev_queue *dev_queue = mq_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 || + if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 || gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0) return -1; return 0; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index b8002ce3d010..549c66359924 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -342,7 +342,8 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, * hold here is the look on dev_queue->qdisc_sleeping * also acquired below. */ - spin_unlock_bh(d->lock); + if (d->lock) + spin_unlock_bh(d->lock); for (i = tc.offset; i < tc.offset + tc.count; i++) { struct netdev_queue *q = netdev_get_tx_queue(dev, i); @@ -359,15 +360,17 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, spin_unlock_bh(qdisc_lock(qdisc)); } /* Reclaim root sleeping lock before completing stats */ - spin_lock_bh(d->lock); - if (gnet_stats_copy_basic(d, NULL, &bstats) < 0 || + if (d->lock) + spin_lock_bh(d->lock); + if (gnet_stats_copy_basic(NULL, d, NULL, &bstats) < 0 || gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0) return -1; } else { struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &sch->bstats) < 0 || gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0) return -1; diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index bcdd54bb101c..21e69d2e8347 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -356,7 +356,8 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; - if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &cl_q->bstats) < 0 || gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0) return -1; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index fee1b15506b2..06eca7060683 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -319,7 +319,8 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; - if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &cl_q->bstats) < 0 || gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0) return -1; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 8d2d8d953432..85d41979d825 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -460,7 +460,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) return err; @@ -486,7 +487,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) goto destroy_class; @@ -663,7 +665,8 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, xstats.weight = cl->agg->class_weight; xstats.lmax = cl->agg->lmax; - if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, cl->qdisc->q.qlen) < 0) -- cgit v1.2.3 From c1e48af7960e93e1fbe54934be8f4a2fb66ef6fd Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 6 Jun 2016 16:06:02 -0700 Subject: gue: Implement direction IP encapsulation This patch implements direct encapsulation of IPv4 and IPv6 packets in UDP. This is done a version "1" of GUE and as explained in I-D draft-ietf-nvo3-gue-03. Changes here are only in the receive path, fou with IPxIPx already supports the transmit side. Both the normal receive path and GRO path are modified to check for GUE version and check for IP version in the case that GUE version is "1". Tested: IPIP with direct GUE encap 1 TCP_STREAM 4530 Mbps 200 TCP_RR 1297625 tps 135/232/444 90/95/99% latencies IP4IP6 with direct GUE encap 1 TCP_STREAM 4903 Mbps 200 TCP_RR 1184481 tps 149/253/473 90/95/99% latencies IP6IP6 direct GUE encap 1 TCP_STREAM 5146 Mbps 200 TCP_RR 1202879 tps 146/251/472 90/95/99% latencies SIT with direct GUE encap 1 TCP_STREAM 6111 Mbps 200 TCP_RR 1250337 tps 139/241/467 90/95/99% latencies Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/fou.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 76 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 5f9207c039e7..321d57f825ce 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -129,6 +129,36 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) guehdr = (struct guehdr *)&udp_hdr(skb)[1]; + switch (guehdr->version) { + case 0: /* Full GUE header present */ + break; + + case 1: { + /* Direct encasulation of IPv4 or IPv6 */ + + int prot; + + switch (((struct iphdr *)guehdr)->version) { + case 4: + prot = IPPROTO_IPIP; + break; + case 6: + prot = IPPROTO_IPV6; + break; + default: + goto drop; + } + + if (fou_recv_pull(skb, fou, sizeof(struct udphdr))) + goto drop; + + return -prot; + } + + default: /* Undefined version */ + goto drop; + } + optlen = guehdr->hlen << 2; len += optlen; @@ -289,6 +319,7 @@ static struct sk_buff **gue_gro_receive(struct sock *sk, int flush = 1; struct fou *fou = fou_from_sock(sk); struct gro_remcsum grc; + u8 proto; skb_gro_remcsum_init(&grc); @@ -302,6 +333,25 @@ static struct sk_buff **gue_gro_receive(struct sock *sk, goto out; } + switch (guehdr->version) { + case 0: + break; + case 1: + switch (((struct iphdr *)guehdr)->version) { + case 4: + proto = IPPROTO_IPIP; + break; + case 6: + proto = IPPROTO_IPV6; + break; + default: + goto out; + } + goto next_proto; + default: + goto out; + } + optlen = guehdr->hlen << 2; len += optlen; @@ -370,6 +420,10 @@ static struct sk_buff **gue_gro_receive(struct sock *sk, } } + proto = guehdr->proto_ctype; + +next_proto: + /* We can clear the encap_mark for GUE as we are essentially doing * one of two possible things. We are either adding an L4 tunnel * header to the outer L3 tunnel header, or we are are simply @@ -383,7 +437,7 @@ static struct sk_buff **gue_gro_receive(struct sock *sk, rcu_read_lock(); offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[guehdr->proto_ctype]); + ops = rcu_dereference(offloads[proto]); if (WARN_ON_ONCE(!ops || !ops->callbacks.gro_receive)) goto out_unlock; @@ -404,13 +458,30 @@ static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) const struct net_offload **offloads; struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); const struct net_offload *ops; - unsigned int guehlen; + unsigned int guehlen = 0; u8 proto; int err = -ENOENT; - proto = guehdr->proto_ctype; - - guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); + switch (guehdr->version) { + case 0: + proto = guehdr->proto_ctype; + guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); + break; + case 1: + switch (((struct iphdr *)guehdr)->version) { + case 4: + proto = IPPROTO_IPIP; + break; + case 6: + proto = IPPROTO_IPV6; + break; + default: + return err; + } + break; + default: + return err; + } rcu_read_lock(); offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; -- cgit v1.2.3 From b4869aa2f881ea4fcd36cd01ad591e4ed96eb33b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 6 Jun 2016 20:50:40 -0700 Subject: net: vrf: ipv6 support for local traffic to local addresses Add support for locally originated traffic to VRF-local IPv6 addresses. Similar to IPv4 a local dst is set on the skb and the packet is reinserted with a call to netif_rx. With this patch, ping, tcp and udp packets to a local IPv6 address are successfully routed: $ ip addr show dev eth1 4: eth1: mtu 1500 qdisc pfifo_fast master red state UP group default qlen 1000 link/ether 02:e0:f9:1c:b9:74 brd ff:ff:ff:ff:ff:ff inet 10.100.1.1/24 brd 10.100.1.255 scope global eth1 valid_lft forever preferred_lft forever inet6 2100:1::1/120 scope global valid_lft forever preferred_lft forever inet6 fe80::e0:f9ff:fe1c:b974/64 scope link valid_lft forever preferred_lft forever $ ping6 -c1 -I red 2100:1::1 ping6: Warning: source address might be selected on device other than red. PING 2100:1::1(2100:1::1) from 2100:1::1 red: 56 data bytes 64 bytes from 2100:1::1: icmp_seq=1 ttl=64 time=0.098 ms ip6_input is exported so the VRF driver can use it for the dst input function. The dst_alloc function for IPv4 defaults to setting the input and output functions; IPv6's does not. VRF does not need to duplicate the Rx path so just export the ipv6 input function. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/ip6_input.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 94611e450ec9..aacfb4bce153 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -323,6 +323,7 @@ int ip6_input(struct sk_buff *skb) dev_net(skb->dev), NULL, skb, skb->dev, NULL, ip6_input_finish); } +EXPORT_SYMBOL_GPL(ip6_input); int ip6_mc_input(struct sk_buff *skb) { -- cgit v1.2.3 From e00431bc93bb48c650273be4a00007b2a392d32a Mon Sep 17 00:00:00 2001 From: Pau Espin Pedrol Date: Tue, 7 Jun 2016 16:30:34 +0200 Subject: tcp: accept RST if SEQ matches right edge of right-most SACK block RFC 5961 advises to only accept RST packets containing a seq number matching the next expected seq number instead of the whole receive window in order to avoid spoofing attacks. However, this situation is not optimal in the case SACK is in use at the time the RST is sent. I recently run into a scenario in which packet losses were high while uploading data to a server, and userspace was willing to frequently terminate connections by sending a RST. In this case, the ACK sent on the receiver side (rcv_nxt) is frozen waiting for a lost packet retransmission and SACK blocks are used to let the client continue uploading data. At some point later on, the client sends the RST (snd_nxt), which matches the next expected seq number of the right-most SACK block on the receiver side which is going forward receiving data. In this scenario, as RFC 5961 defines, the RST SEQ doesn't match the frozen main ACK at receiver side and thus gets dropped and a challenge ACK is sent, which gets usually lost due to network conditions. The main consequence is that the connection stays alive for a while even if it made sense to accept the RST. This can get really bad if lots of connections like this one are created in few seconds, allocating all the resources of the server easily. For security reasons, not all SACK blocks are checked (there could be a big amount of SACK blocks => acceptable SEQ numbers). Furthermore, it wouldn't make sense to check for RST in blocks other than the right-most received one because the sender is not expected to be sending new data after the RST. For simplicity, only up to the 4 most recently updated SACK blocks (selective_acks[4] field) are compared to find the right-most block, as usually those are the ones with bigger probability to contain it. This patch was tested in a 3.18 kernel and probed to improve the situation in the scenario described above. Signed-off-by: Pau Espin Pedrol Acked-by: Eric Dumazet Acked-by: Neal Cardwell Tested-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d6c8f4cd0800..89dd8d82826f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5159,6 +5159,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, int syn_inerr) { struct tcp_sock *tp = tcp_sk(sk); + bool rst_seq_match = false; /* RFC1323: H1. Apply PAWS check first. */ if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && @@ -5195,13 +5196,32 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, /* Step 2: check RST bit */ if (th->rst) { - /* RFC 5961 3.2 : - * If sequence number exactly matches RCV.NXT, then + /* RFC 5961 3.2 (extend to match against SACK too if available): + * If seq num matches RCV.NXT or the right-most SACK block, + * then * RESET the connection * else * Send a challenge ACK */ - if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) + if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { + rst_seq_match = true; + } else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) { + struct tcp_sack_block *sp = &tp->selective_acks[0]; + int max_sack = sp[0].end_seq; + int this_sack; + + for (this_sack = 1; this_sack < tp->rx_opt.num_sacks; + ++this_sack) { + max_sack = after(sp[this_sack].end_seq, + max_sack) ? + sp[this_sack].end_seq : max_sack; + } + + if (TCP_SKB_CB(skb)->seq == max_sack) + rst_seq_match = true; + } + + if (rst_seq_match) tcp_reset(sk); else tcp_send_challenge_ack(sk, skb); -- cgit v1.2.3 From 707a2ca4870fcf6b5480cdfad563b940f56f0844 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 7 Jun 2016 16:09:44 -0700 Subject: ila: Perform only one translation in forwarding path When setting up ILA in a router we noticed that the the encapsulation is invoked twice: once in the route input path and again upon route output. To resolve this we add a flag set_csum_neutral for the ila_update_ipv6_locator. If this flag is set and the checksum neutral bit is also set we assume that checksum-neutral translation has already been performed and take no further action. The flag is set only in ila_output path. The flag is not set for ila_input and ila_xlat. Tested: Used 3 netns to set to emulate a router and two hosts. The router translates SIR addresses between the two destinations in other two netns. Verified ping and netperf are functional. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv6/ila/ila.h | 3 ++- net/ipv6/ila/ila_common.c | 6 ++++-- net/ipv6/ila/ila_lwt.c | 4 ++-- net/ipv6/ila/ila_xlat.c | 8 ++++---- 4 files changed, 12 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h index d08fd2d48a78..e0170f62bc39 100644 --- a/net/ipv6/ila/ila.h +++ b/net/ipv6/ila/ila.h @@ -109,7 +109,8 @@ static inline bool ila_csum_neutral_set(struct ila_identifier ident) return !!(ident.csum_neutral); } -void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p); +void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p, + bool set_csum_neutral); void ila_init_saved_csum(struct ila_params *p); diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index 0e94042d1289..b3d00be484d4 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -103,7 +103,8 @@ static void ila_csum_adjust_transport(struct sk_buff *skb, iaddr->loc = p->locator; } -void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p) +void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p, + bool set_csum_neutral) { struct ipv6hdr *ip6h = ipv6_hdr(skb); struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); @@ -114,7 +115,8 @@ void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p) * is a locator being translated to a SIR address. * Perform (receiver) checksum-neutral translation. */ - ila_csum_do_neutral(iaddr, p); + if (!set_csum_neutral) + ila_csum_do_neutral(iaddr, p); } else { switch (p->csum_mode) { case ILA_CSUM_ADJUST_TRANSPORT: diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index 1dfb64166d7d..c8314c6b6154 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -26,7 +26,7 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate)); + ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate), true); return dst->lwtstate->orig_output(net, sk, skb); @@ -42,7 +42,7 @@ static int ila_input(struct sk_buff *skb) if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate)); + ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate), false); return dst->lwtstate->orig_input(skb); diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index a90e57229c6c..e6eca5fdf4c9 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -210,14 +210,14 @@ static void ila_free_cb(void *ptr, void *arg) } } -static int ila_xlat_addr(struct sk_buff *skb); +static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral); static unsigned int ila_nf_input(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - ila_xlat_addr(skb); + ila_xlat_addr(skb, false); return NF_ACCEPT; } @@ -597,7 +597,7 @@ static struct pernet_operations ila_net_ops = { .size = sizeof(struct ila_net), }; -static int ila_xlat_addr(struct sk_buff *skb) +static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral) { struct ila_map *ila; struct ipv6hdr *ip6h = ipv6_hdr(skb); @@ -616,7 +616,7 @@ static int ila_xlat_addr(struct sk_buff *skb) ila = ila_lookup_wildcards(iaddr, skb->dev->ifindex, ilan); if (ila) - ila_update_ipv6_locator(skb, &ila->xp.ip); + ila_update_ipv6_locator(skb, &ila->xp.ip, set_csum_neutral); rcu_read_unlock(); -- cgit v1.2.3 From 40e4e713ebb279eb569584836d7cc6b799ed7f7f Mon Sep 17 00:00:00 2001 From: Hariprasad Shenai Date: Wed, 8 Jun 2016 18:09:08 +0530 Subject: net: Reduce queue allocation to one in kdump kernel When in kdump kernel, reduce memory usage by only using a single Queue Set for multiqueue devices. So make netif_get_num_default_rss_queues() return one, when in kdump kernel. Signed-off-by: Hariprasad Shenai Signed-off-by: David S. Miller --- net/core/dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index e0bcc39f4a7d..c43c9d2a88cf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -139,6 +139,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -2249,7 +2250,8 @@ EXPORT_SYMBOL(netif_set_real_num_rx_queues); */ int netif_get_num_default_rss_queues(void) { - return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus()); + return is_kdump_kernel() ? + 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus()); } EXPORT_SYMBOL(netif_get_num_default_rss_queues); -- cgit v1.2.3 From 123b36526592f009bf8eccb7c8833aeda296d9cf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Jun 2016 07:22:49 -0700 Subject: net: sched: fix missing doc annotations "make htmldocs" complains otherwise: .//net/core/gen_stats.c:168: warning: No description found for parameter 'running' .//include/linux/netdevice.h:1867: warning: No description found for parameter 'qdisc_running_key' Fixes: f9eb8aea2a1e ("net_sched: transform qdisc running bit into a seqcount") Fixes: edb09eb17ed8 ("net: sched: do not acquire qdisc spinlock in qdisc/class stats dump") Signed-off-by: Eric Dumazet Reported-by: kbuild test robot Signed-off-by: David S. Miller --- net/core/gen_stats.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index d9c210caff32..32207e6a942c 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -150,6 +150,7 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic); /** * gnet_stats_copy_basic - copy basic statistics into statistic TLV + * @running: seqcount_t pointer * @d: dumping handle * @cpu: copy statistic per cpu * @b: basic statistics -- cgit v1.2.3 From 0b7b498d41709d68bb4f520051f68d64f752beee Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 7 Jun 2016 16:32:38 -0700 Subject: net: dsa: Provide unique DSA slave MII bus names In case we have multiples trees and switches with the same index, we need to add another discriminating id: the switch tree. Reviewed-by: Andrew Lunn Reviewed-by: Vivien Didelot Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 15a492261895..a51dfedf0014 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -49,7 +49,8 @@ void dsa_slave_mii_bus_init(struct dsa_switch *ds) ds->slave_mii_bus->name = "dsa slave smi"; ds->slave_mii_bus->read = dsa_slave_phy_read; ds->slave_mii_bus->write = dsa_slave_phy_write; - snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d", ds->index); + snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d.%d", + ds->dst->tree, ds->index); ds->slave_mii_bus->parent = ds->dev; ds->slave_mii_bus->phy_mask = ~ds->phys_mii_mask; } -- cgit v1.2.3 From 6e830d8f0deca91fcb84d3156dcebb20384a9e2d Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 7 Jun 2016 16:32:39 -0700 Subject: net: dsa: Initialize ds->enabled_port_mask and ds->phys_mii_mask Some drivers rely on these two bitmasks to contain the correct values for them to successfully probe and initialize at drv->setup() time, calculate correct values to put in both masks as early as possible in dsa_get_ports_dn(). Reviewed-by: Andrew Lunn Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 80dfe08db825..921a36fd139d 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -283,6 +283,7 @@ static void dsa_user_port_unapply(struct device_node *port, u32 index, if (ds->ports[index].netdev) { dsa_slave_destroy(ds->ports[index].netdev); ds->ports[index].netdev = NULL; + ds->enabled_port_mask &= ~(1 << index); } } @@ -292,6 +293,13 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) u32 index; int err; + /* Initialize ds->phys_mii_mask before registering the slave MDIO bus + * driver and before drv->setup() has run, since the switch drivers and + * the slave MDIO bus driver rely on these values for probing PHY + * devices or not + */ + ds->phys_mii_mask = ds->enabled_port_mask; + err = ds->drv->setup(ds); if (err < 0) return err; @@ -511,6 +519,13 @@ static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds) return -EINVAL; ds->ports[reg].dn = port; + + /* Initialize enabled_port_mask now for drv->setup() + * to have access to a correct value, just like what + * net/dsa/dsa.c::dsa_switch_setup_one does. + */ + if (!dsa_port_is_cpu(port)) + ds->enabled_port_mask |= 1 << reg; } return 0; -- cgit v1.2.3 From 1eb59443e72c69edbb836626f9f7f7e82427eeac Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 7 Jun 2016 16:32:40 -0700 Subject: net: dsa: Provide a slave MII bus if needed Mimic what net/dsa/dsa.c does and provide a slave MII bus by default which will be created if the driver implements a phy_read method. Reviewed-by: Andrew Lunn Reviewed-by: Vivien Didelot Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 921a36fd139d..4e0f3c268103 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -312,6 +312,18 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) if (err < 0) return err; + if (!ds->slave_mii_bus && ds->drv->phy_read) { + ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); + if (!ds->slave_mii_bus) + return -ENOMEM; + + dsa_slave_mii_bus_init(ds); + + err = mdiobus_register(ds->slave_mii_bus); + if (err < 0) + return err; + } + for (index = 0; index < DSA_MAX_PORTS; index++) { port = ds->ports[index].dn; if (!port) @@ -361,6 +373,9 @@ static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds) dsa_user_port_unapply(port, index, ds); } + + if (ds->slave_mii_bus && ds->drv->phy_read) + mdiobus_unregister(ds->slave_mii_bus); } static int dsa_dst_apply(struct dsa_switch_tree *dst) -- cgit v1.2.3 From af42192c47c41ec132bda736a78d6d5e0d2999a9 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 7 Jun 2016 16:32:41 -0700 Subject: net: dsa: Add initialization helper for CPU port ethtool_ops Add a helper function: dsa_cpu_port_ethtool_init() which initializes a custom ethtool_ops structure with custom DSA ethtool operations for CPU ports. This is a preliminary change to move the initialization outside of net/dsa/slave.c. Reviewed-by: Vivien Didelot Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 1 + net/dsa/slave.c | 14 ++++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index b42f1a5f95f3..106a9f067f94 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -58,6 +58,7 @@ const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol); /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); +void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops); int dsa_slave_create(struct dsa_switch *ds, struct device *parent, int port, const char *name); void dsa_slave_destroy(struct net_device *slave_dev); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index a51dfedf0014..8d159932e082 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -865,6 +865,13 @@ static void dsa_slave_poll_controller(struct net_device *dev) } #endif +void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops) +{ + ops->get_sset_count = dsa_cpu_port_get_sset_count; + ops->get_ethtool_stats = dsa_cpu_port_get_ethtool_stats; + ops->get_strings = dsa_cpu_port_get_strings; +} + static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_settings = dsa_slave_get_settings, .set_settings = dsa_slave_set_settings, @@ -1124,12 +1131,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, sizeof(struct ethtool_ops)); memcpy(&dsa_cpu_port_ethtool_ops, &dst->master_ethtool_ops, sizeof(struct ethtool_ops)); - dsa_cpu_port_ethtool_ops.get_sset_count = - dsa_cpu_port_get_sset_count; - dsa_cpu_port_ethtool_ops.get_ethtool_stats = - dsa_cpu_port_get_ethtool_stats; - dsa_cpu_port_ethtool_ops.get_strings = - dsa_cpu_port_get_strings; + dsa_cpu_port_ethtool_init(&dsa_cpu_port_ethtool_ops); master->ethtool_ops = &dsa_cpu_port_ethtool_ops; } eth_hw_addr_inherit(slave_dev, master); -- cgit v1.2.3 From 0c73c523cf737b5d446705392e0e14ee0411a351 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 7 Jun 2016 16:32:42 -0700 Subject: net: dsa: Initialize CPU port ethtool ops per tree Now that we can properly support multiple distinct trees in the system, using a global variable: dsa_cpu_port_ethtool_ops is getting clobbered as soon as the second switch tree gets probed, and we don't want that. We need to move this to be dynamically allocated, and since we can't really be comparing addresses anymore to determine first time initialization versus any other times, just move this to dsa.c and dsa2.c where the remainder of the dst/ds initialization happens. The operations teardown restores the master netdev's ethtool_ops to its original ethtool_ops pointer (typically within the Ethernet driver) Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/dsa.c | 41 +++++++++++++++++++++++++++++++++++++++++ net/dsa/dsa2.c | 6 ++++++ net/dsa/dsa_priv.h | 2 ++ net/dsa/slave.c | 10 ---------- 4 files changed, 49 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index ce3b942dce76..766d2a525ada 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -266,6 +266,41 @@ const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol) return ops; } +int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds) +{ + struct net_device *master; + struct ethtool_ops *cpu_ops; + + master = ds->dst->master_netdev; + if (ds->master_netdev) + master = ds->master_netdev; + + cpu_ops = devm_kzalloc(ds->dev, sizeof(*cpu_ops), GFP_KERNEL); + if (!cpu_ops) + return -ENOMEM; + + memcpy(&ds->dst->master_ethtool_ops, master->ethtool_ops, + sizeof(struct ethtool_ops)); + ds->dst->master_orig_ethtool_ops = master->ethtool_ops; + memcpy(cpu_ops, &ds->dst->master_ethtool_ops, + sizeof(struct ethtool_ops)); + dsa_cpu_port_ethtool_init(cpu_ops); + master->ethtool_ops = cpu_ops; + + return 0; +} + +void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds) +{ + struct net_device *master; + + master = ds->dst->master_netdev; + if (ds->master_netdev) + master = ds->master_netdev; + + master->ethtool_ops = ds->dst->master_orig_ethtool_ops; +} + static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) { struct dsa_switch_driver *drv = ds->drv; @@ -379,6 +414,10 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) ret = 0; } + ret = dsa_cpu_port_ethtool_setup(ds); + if (ret) + return ret; + #ifdef CONFIG_NET_DSA_HWMON /* If the switch provides a temperature sensor, * register with hardware monitoring subsystem. @@ -963,6 +1002,8 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) dsa_switch_destroy(ds); } + dsa_cpu_port_ethtool_restore(dst->ds[0]); + dev_put(dst->master_netdev); } diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 4e0f3c268103..83b95fc4cede 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -394,6 +394,10 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) return err; } + err = dsa_cpu_port_ethtool_setup(dst->ds[0]); + if (err) + return err; + /* If we use a tagging format that doesn't have an ethertype * field, make sure that all packets from this point on get * sent to the tag format's receive function. @@ -429,6 +433,8 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) dsa_ds_unapply(dst, ds); } + dsa_cpu_port_ethtool_restore(dst->ds[0]); + pr_info("DSA: tree %d unapplied\n", dst->tree); dst->applied = false; } diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 106a9f067f94..00077a9c97f4 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -54,6 +54,8 @@ int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev, struct device_node *port_dn, int port); void dsa_cpu_dsa_destroy(struct device_node *port_dn); const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol); +int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds); +void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds); /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 8d159932e082..7236eb26dc97 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -892,8 +892,6 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_eee = dsa_slave_get_eee, }; -static struct ethtool_ops dsa_cpu_port_ethtool_ops; - static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, @@ -1126,14 +1124,6 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, slave_dev->features = master->vlan_features; slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; - if (master->ethtool_ops != &dsa_cpu_port_ethtool_ops) { - memcpy(&dst->master_ethtool_ops, master->ethtool_ops, - sizeof(struct ethtool_ops)); - memcpy(&dsa_cpu_port_ethtool_ops, &dst->master_ethtool_ops, - sizeof(struct ethtool_ops)); - dsa_cpu_port_ethtool_init(&dsa_cpu_port_ethtool_ops); - master->ethtool_ops = &dsa_cpu_port_ethtool_ops; - } eth_hw_addr_inherit(slave_dev, master); slave_dev->priv_flags |= IFF_NO_QUEUE; slave_dev->netdev_ops = &dsa_slave_netdev_ops; -- cgit v1.2.3 From c4282ca76c5b81ed73ef4c5eb5c07ee397e51642 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Wed, 8 Jun 2016 12:00:04 -0400 Subject: tipc: correct error in node fsm commit 88e8ac7000dc ("tipc: reduce transmission rate of reset messages when link is down") revealed a flaw in the node FSM, as defined in the log of commit 66996b6c47ed ("tipc: extend node FSM"). We see the following scenario: 1: Node B receives a RESET message from node A before its link endpoint is fully up, i.e., the node FSM is in state SELF_UP_PEER_COMING. This event will not change the node FSM state, but the (distinct) link FSM will move to state RESETTING. 2: As an effect of the previous event, the local endpoint on B will declare node A lost, and post the event SELF_DOWN to the its node FSM. This moves the FSM state to SELF_DOWN_PEER_LEAVING, meaning that no messages will be accepted from A until it receives another RESET message that confirms that A's endpoint has been reset. This is wasteful, since we know this as a fact already from the first received RESET, but worse is that the link instance's FSM has not wasted this information, but instead moved on to state ESTABLISHING, meaning that it repeatedly sends out ACTIVATE messages to the reset peer A. 3: Node A will receive one of the ACTIVATE messages, move its link FSM to state ESTABLISHED, and start repeatedly sending out STATE messages to node B. 4: Node B will consistently drop these messages, since it can only accept accept a RESET according to its node FSM. 5: After four lost STATE messages node A will reset its link and start repeatedly sending out RESET messages to B. 6: Because of the reduced send rate for RESET messages, it is very likely that A will receive an ACTIVATE (which is sent out at a much higher frequency) before it gets the chance to send a RESET, and A may hence quickly move back to state ESTABLISHED and continue sending out STATE messages, which will again be dropped by B. 7: GOTO 5. 8: After having repeated the cycle 5-7 a number of times, node A will by chance get in between with sending a RESET, and the situation is resolved. Unfortunately, we have seen that it may take a substantial amount of time before this vicious loop is broken, sometimes in the order of minutes. We correct this by making a small correction to the node FSM: When a node in state SELF_UP_PEER_COMING receives a SELF_DOWN event, it now moves directly back to state SELF_DOWN_PEER_DOWN, instead of as now SELF_DOWN_PEER_LEAVING. This is logically consistent, since we don't need to wait for RESET confirmation from of an endpoint that we alread know has been reset. It also means that node B in the scenario above will not be dropping incoming STATE messages, and the link can come up immediately. Finally, a symmetry comparison reveals that the FSM has a similar error when receiving the event PEER_DOWN in state PEER_UP_SELF_COMING. Instead of moving to PERR_DOWN_SELF_LEAVING, it should move directly to SELF_DOWN_PEER_DOWN. Although we have never seen any negative effect of this logical error, we choose fix this one, too. The node FSM looks as follows after those changes: +----------------------------------------+ | PEER_DOWN_EVT| | | +------------------------+----------------+ | |SELF_DOWN_EVT | | | | | | | | +-----------+ +-----------+ | | |NODE_ | |NODE_ | | | +----------|FAILINGOVER|<---------|SYNCHING |-----------+ | | |SELF_ +-----------+ FAILOVER_+-----------+ PEER_ | | | |DOWN_EVT | A BEGIN_EVT A | DOWN_EVT| | | | | | | | | | | | | | | | | | | | |FAILOVER_ |FAILOVER_ |SYNCH_ |SYNCH_ | | | | |END_EVT |BEGIN_EVT |BEGIN_EVT|END_EVT | | | | | | | | | | | | | | | | | | | | | +--------------+ | | | | | +-------->| SELF_UP_ |<-------+ | | | | +-----------------| PEER_UP |----------------+ | | | | |SELF_DOWN_EVT +--------------+ PEER_DOWN_EVT| | | | | | A A | | | | | | | | | | | | | | PEER_UP_EVT| |SELF_UP_EVT | | | | | | | | | | | V V V | | V V V +------------+ +-----------+ +-----------+ +------------+ |SELF_DOWN_ | |SELF_UP_ | |PEER_UP_ | |PEER_DOWN | |PEER_LEAVING| |PEER_COMING| |SELF_COMING| |SELF_LEAVING| +------------+ +-----------+ +-----------+ +------------+ | | A A | | | | | | | | | SELF_ | |SELF_ |PEER_ |PEER_ | | DOWN_EVT| |UP_EVT |UP_EVT |DOWN_EVT | | | | | | | | | | | | | | | +--------------+ | | |PEER_DOWN_EVT +--->| SELF_DOWN_ |<---+ SELF_DOWN_EVT| +------------------->| PEER_DOWN |<--------------------+ +--------------+ Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/node.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/node.c b/net/tipc/node.c index e01e2c71b5a1..c7985b2cb759 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -950,7 +950,7 @@ static void tipc_node_fsm_evt(struct tipc_node *n, int evt) state = SELF_UP_PEER_UP; break; case SELF_LOST_CONTACT_EVT: - state = SELF_DOWN_PEER_LEAVING; + state = SELF_DOWN_PEER_DOWN; break; case SELF_ESTABL_CONTACT_EVT: case PEER_LOST_CONTACT_EVT: @@ -969,7 +969,7 @@ static void tipc_node_fsm_evt(struct tipc_node *n, int evt) state = SELF_UP_PEER_UP; break; case PEER_LOST_CONTACT_EVT: - state = SELF_LEAVING_PEER_DOWN; + state = SELF_DOWN_PEER_DOWN; break; case SELF_LOST_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: -- cgit v1.2.3 From 5ca509fc0b6bcfeccf03c8c4bb5e4d1a62720c03 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Wed, 8 Jun 2016 12:00:05 -0400 Subject: tipc: change node timer unit from jiffies to ms The node keepalive interval is recalculated at each timer expiration to catch any changes in the link tolerance, and stored in a field in struct tipc_node. We use jiffies as unit for the stored value. This is suboptimal, because it makes the calculation unnecessary complex, including two unit conversions. The conversions also lead to a rounding error that causes the link "abort limit" to be 3 in the normal case, instead of 4, as intended. This again leads to unnecessary link resets when the network is pushed close to its limit, e.g., in an environment with hundreds of nodes or namesapces. In this commit, we do instead let the keepalive value be calculated and stored in milliseconds, so that there is only one conversion and the rounding error is eliminated. We also remove a redundant "keepalive" field in struct tipc_link. This is remnant from the previous implementation. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 2 -- net/tipc/node.c | 18 ++++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 7059c94f33c5..a904ccd5a93a 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -87,7 +87,6 @@ struct tipc_stats { * @peer_bearer_id: bearer id used by link's peer endpoint * @bearer_id: local bearer id used by link * @tolerance: minimum link continuity loss needed to reset link [in ms] - * @keepalive_intv: link keepalive timer interval * @abort_limit: # of unacknowledged continuity probes needed to reset link * @state: current state of link FSM * @peer_caps: bitmap describing capabilities of peer node @@ -131,7 +130,6 @@ struct tipc_link { u32 peer_bearer_id; u32 bearer_id; u32 tolerance; - unsigned long keepalive_intv; u32 abort_limit; u32 state; u16 peer_caps; diff --git a/net/tipc/node.c b/net/tipc/node.c index c7985b2cb759..d6a490f991a4 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -378,14 +378,13 @@ static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l) { unsigned long tol = tipc_link_tolerance(l); unsigned long intv = ((tol / 4) > 500) ? 500 : tol / 4; - unsigned long keepalive_intv = msecs_to_jiffies(intv); /* Link with lowest tolerance determines timer interval */ - if (keepalive_intv < n->keepalive_intv) - n->keepalive_intv = keepalive_intv; + if (intv < n->keepalive_intv) + n->keepalive_intv = intv; - /* Ensure link's abort limit corresponds to current interval */ - tipc_link_set_abort_limit(l, tol / jiffies_to_msecs(n->keepalive_intv)); + /* Ensure link's abort limit corresponds to current tolerance */ + tipc_link_set_abort_limit(l, tol / n->keepalive_intv); } static void tipc_node_delete(struct tipc_node *node) @@ -526,7 +525,7 @@ static void tipc_node_timeout(unsigned long data) if (rc & TIPC_LINK_DOWN_EVT) tipc_node_link_down(n, bearer_id, false); } - mod_timer(&n->timer, jiffies + n->keepalive_intv); + mod_timer(&n->timer, jiffies + msecs_to_jiffies(n->keepalive_intv)); } /** @@ -735,6 +734,7 @@ void tipc_node_check_dest(struct net *net, u32 onode, bool accept_addr = false; bool reset = true; char *if_name; + unsigned long intv; *dupl_addr = false; *respond = false; @@ -840,9 +840,11 @@ void tipc_node_check_dest(struct net *net, u32 onode, le->link = l; n->link_cnt++; tipc_node_calculate_timer(n, l); - if (n->link_cnt == 1) - if (!mod_timer(&n->timer, jiffies + n->keepalive_intv)) + if (n->link_cnt == 1) { + intv = jiffies + msecs_to_jiffies(n->keepalive_intv); + if (!mod_timer(&n->timer, intv)) tipc_node_get(n); + } } memcpy(&le->maddr, maddr, sizeof(*maddr)); exit: -- cgit v1.2.3 From 96c63fa7393d0a346acfe5a91e0c7d4c7782641b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 8 Jun 2016 10:55:39 -0700 Subject: net: Add l3mdev rule Currently, VRFs require 1 oif and 1 iif rule per address family per VRF. As the number of VRF devices increases it brings scalability issues with the increasing rule list. All of the VRF rules have the same format with the exception of the specific table id to direct the lookup. Since the table id is available from the oif or iif in the loopup, the VRF rules can be consolidated to a single rule that pulls the table from the VRF device. This patch introduces a new rule attribute l3mdev. The l3mdev rule means the table id used for the lookup is pulled from the L3 master device (e.g., VRF) rather than being statically defined. With the l3mdev rule all of the basic VRF FIB rules are reduced to 1 l3mdev rule per address family (IPv4 and IPv6). If an admin wishes to insert higher priority rules for specific VRFs those rules will co-exist with the l3mdev rule. This capability means current VRF scripts will co-exist with this new simpler implementation. Currently, the rules list for both ipv4 and ipv6 look like this: $ ip ru ls 1000: from all oif vrf1 lookup 1001 1000: from all iif vrf1 lookup 1001 1000: from all oif vrf2 lookup 1002 1000: from all iif vrf2 lookup 1002 1000: from all oif vrf3 lookup 1003 1000: from all iif vrf3 lookup 1003 1000: from all oif vrf4 lookup 1004 1000: from all iif vrf4 lookup 1004 1000: from all oif vrf5 lookup 1005 1000: from all iif vrf5 lookup 1005 1000: from all oif vrf6 lookup 1006 1000: from all iif vrf6 lookup 1006 1000: from all oif vrf7 lookup 1007 1000: from all iif vrf7 lookup 1007 1000: from all oif vrf8 lookup 1008 1000: from all iif vrf8 lookup 1008 ... 32765: from all lookup local 32766: from all lookup main 32767: from all lookup default With the l3mdev rule the list is just the following regardless of the number of VRFs: $ ip ru ls 1000: from all lookup [l3mdev table] 32765: from all lookup local 32766: from all lookup main 32767: from all lookup default (Note: the above pretty print of the rule is based on an iproute2 prototype. Actual verbage may change) Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/fib_rules.c | 33 ++++++++++++++++++++++++++++----- net/ipv4/fib_rules.c | 6 ++++-- net/ipv6/fib6_rules.c | 6 ++++-- net/l3mdev/l3mdev.c | 38 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 74 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 840acebbb80c..98298b11f534 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -173,7 +173,8 @@ void fib_rules_unregister(struct fib_rules_ops *ops) EXPORT_SYMBOL_GPL(fib_rules_unregister); static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, - struct flowi *fl, int flags) + struct flowi *fl, int flags, + struct fib_lookup_arg *arg) { int ret = 0; @@ -189,6 +190,9 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id)) goto out; + if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg)) + goto out; + ret = ops->match(rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; @@ -204,7 +208,7 @@ int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl, list_for_each_entry_rcu(rule, &ops->rules_list, list) { jumped: - if (!fib_rule_match(rule, ops, fl, flags)) + if (!fib_rule_match(rule, ops, fl, flags, arg)) continue; if (rule->action == FR_ACT_GOTO) { @@ -265,7 +269,7 @@ errout: return err; } -static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) +int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); @@ -336,6 +340,14 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) if (tb[FRA_TUN_ID]) rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); + if (tb[FRA_L3MDEV]) { +#ifdef CONFIG_NET_L3_MASTER_DEV + rule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]); + if (rule->l3mdev != 1) +#endif + goto errout_free; + } + rule->action = frh->action; rule->flags = frh->flags; rule->table = frh_get_table(frh, tb); @@ -371,6 +383,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) } else if (rule->action == FR_ACT_GOTO) goto errout_free; + if (rule->l3mdev && rule->table) + goto errout_free; + err = ops->configure(rule, skb, frh, tb); if (err < 0) goto errout_free; @@ -424,8 +439,9 @@ errout: rules_ops_put(ops); return err; } +EXPORT_SYMBOL_GPL(fib_nl_newrule); -static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) +int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); @@ -483,6 +499,10 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID]))) continue; + if (tb[FRA_L3MDEV] && + (rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV]))) + continue; + if (!ops->compare(rule, frh, tb)) continue; @@ -536,6 +556,7 @@ errout: rules_ops_put(ops); return err; } +EXPORT_SYMBOL_GPL(fib_nl_delrule); static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, struct fib_rule *rule) @@ -607,7 +628,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, (rule->target && nla_put_u32(skb, FRA_GOTO, rule->target)) || (rule->tun_id && - nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD))) + nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) || + (rule->l3mdev && + nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index f2bda9e89c61..6e9ea69e5f75 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -76,6 +76,7 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, { int err = -EAGAIN; struct fib_table *tbl; + u32 tb_id; switch (rule->action) { case FR_ACT_TO_TBL: @@ -94,7 +95,8 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, rcu_read_lock(); - tbl = fib_get_table(rule->fr_net, rule->table); + tb_id = fib_rule_get_table(rule, arg); + tbl = fib_get_table(rule->fr_net, tb_id); if (tbl) err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *)arg->result, @@ -180,7 +182,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, if (err) goto errout; - if (rule->table == RT_TABLE_UNSPEC) { + if (rule->table == RT_TABLE_UNSPEC && !rule->l3mdev) { if (rule->action == FR_ACT_TO_TBL) { struct fib_table *table; diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index ed33abf57abd..5857c1fc8b67 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -67,6 +67,7 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, struct net *net = rule->fr_net; pol_lookup_t lookup = arg->lookup_ptr; int err = 0; + u32 tb_id; switch (rule->action) { case FR_ACT_TO_TBL: @@ -86,7 +87,8 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, goto discard_pkt; } - table = fib6_get_table(net, rule->table); + tb_id = fib_rule_get_table(rule, arg); + table = fib6_get_table(net, tb_id); if (!table) { err = -EAGAIN; goto out; @@ -199,7 +201,7 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct net *net = sock_net(skb->sk); struct fib6_rule *rule6 = (struct fib6_rule *) rule; - if (rule->action == FR_ACT_TO_TBL) { + if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) { if (rule->table == RT6_TABLE_UNSPEC) goto errout; diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index 6651a78e100c..7da97809a7e8 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -10,6 +10,7 @@ */ #include +#include #include /** @@ -160,3 +161,40 @@ int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4) return rc; } EXPORT_SYMBOL_GPL(l3mdev_get_saddr); + +/** + * l3mdev_fib_rule_match - Determine if flowi references an + * L3 master device + * @net: network namespace for device index lookup + * @fl: flow struct + */ + +int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, + struct fib_lookup_arg *arg) +{ + struct net_device *dev; + int rc = 0; + + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, fl->flowi_oif); + if (dev && netif_is_l3_master(dev) && + dev->l3mdev_ops->l3mdev_fib_table) { + arg->table = dev->l3mdev_ops->l3mdev_fib_table(dev); + rc = 1; + goto out; + } + + dev = dev_get_by_index_rcu(net, fl->flowi_iif); + if (dev && netif_is_l3_master(dev) && + dev->l3mdev_ops->l3mdev_fib_table) { + arg->table = dev->l3mdev_ops->l3mdev_fib_table(dev); + rc = 1; + goto out; + } + +out: + rcu_read_unlock(); + + return rc; +} -- cgit v1.2.3 From 76e48f9fbe3b0d1279868eef0543725577525e97 Mon Sep 17 00:00:00 2001 From: Shweta Choudaha Date: Wed, 8 Jun 2016 20:15:43 +0100 Subject: ip6gre: Allow live link address change The ip6 GRE tap device should not be forced to down state to change the mac address and should allow live address change for tap device similar to ipv4 gre. Signed-off-by: Shweta Choudaha Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index f4ac2842d4d9..fdc9de276ab1 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1256,6 +1256,8 @@ static int ip6gre_tap_init(struct net_device *dev) if (ret) return ret; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + tunnel = netdev_priv(dev); ip6gre_tnl_link_config(tunnel, 1); @@ -1289,6 +1291,7 @@ static void ip6gre_tap_setup(struct net_device *dev) dev->features |= NETIF_F_NETNS_LOCAL; dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; } static bool ip6gre_netlink_encap_parms(struct nlattr *data[], -- cgit v1.2.3 From c3498d34dd369115a06e5bb862b90b06fde3d114 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 9 Jun 2016 00:27:39 +0200 Subject: cbq: remove TCA_CBQ_OVL_STRATEGY support since initial revision of cbq in 2004 iproute 2 has never implemented support for TCA_CBQ_OVL_STRATEGY, which is what needs to be set to activate the class->drop() call (TC_CBQ_OVL_DROP strategy must be set by userspace value must be set by userspace). David Miller says: It seems really safe to kill this thing off, flag an error if someone tries to set the attribute, and therefore kill off all of the non-default cbq_ovl_*() functions. A followup commit can then remove all .drop qdisc methods since this removed the only caller. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/sched/sch_cbq.c | 166 ++-------------------------------------------------- 1 file changed, 6 insertions(+), 160 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 1b8128fb845d..fdca45e34230 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -80,7 +80,6 @@ struct cbq_class { unsigned char priority; /* class priority */ unsigned char priority2; /* priority to be used after overlimit */ unsigned char ewma_log; /* time constant for idle time calculation */ - unsigned char ovl_strategy; #ifdef CONFIG_NET_CLS_ACT unsigned char police; #endif @@ -94,10 +93,6 @@ struct cbq_class { u32 avpkt; struct qdisc_rate_table *R_tab; - /* Overlimit strategy parameters */ - void (*overlimit)(struct cbq_class *cl); - psched_tdiff_t penalty; - /* General scheduler (WRR) parameters */ long allot; long quantum; /* Allotment per WRR round */ @@ -402,11 +397,8 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) return ret; } -/* Overlimit actions */ - -/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */ - -static void cbq_ovl_classic(struct cbq_class *cl) +/* Overlimit action: penalize leaf class by adding offtime */ +static void cbq_overlimit(struct cbq_class *cl) { struct cbq_sched_data *q = qdisc_priv(cl->qdisc); psched_tdiff_t delay = cl->undertime - q->now; @@ -456,99 +448,6 @@ static void cbq_ovl_classic(struct cbq_class *cl) } } -/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when - * they go overlimit - */ - -static void cbq_ovl_rclassic(struct cbq_class *cl) -{ - struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - struct cbq_class *this = cl; - - do { - if (cl->level > q->toplevel) { - cl = NULL; - break; - } - } while ((cl = cl->borrow) != NULL); - - if (cl == NULL) - cl = this; - cbq_ovl_classic(cl); -} - -/* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */ - -static void cbq_ovl_delay(struct cbq_class *cl) -{ - struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - psched_tdiff_t delay = cl->undertime - q->now; - - if (test_bit(__QDISC_STATE_DEACTIVATED, - &qdisc_root_sleeping(cl->qdisc)->state)) - return; - - if (!cl->delayed) { - psched_time_t sched = q->now; - ktime_t expires; - - delay += cl->offtime; - if (cl->avgidle < 0) - delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); - if (cl->avgidle < cl->minidle) - cl->avgidle = cl->minidle; - cl->undertime = q->now + delay; - - if (delay > 0) { - sched += delay + cl->penalty; - cl->penalized = sched; - cl->cpriority = TC_CBQ_MAXPRIO; - q->pmask |= (1<delay_timer) && - ktime_to_ns(ktime_sub( - hrtimer_get_expires(&q->delay_timer), - expires)) > 0) - hrtimer_set_expires(&q->delay_timer, expires); - hrtimer_restart(&q->delay_timer); - cl->delayed = 1; - cl->xstats.overactions++; - return; - } - delay = 1; - } - if (q->wd_expires == 0 || q->wd_expires > delay) - q->wd_expires = delay; -} - -/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */ - -static void cbq_ovl_lowprio(struct cbq_class *cl) -{ - struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - - cl->penalized = q->now + cl->penalty; - - if (cl->cpriority != cl->priority2) { - cl->cpriority = cl->priority2; - q->pmask |= (1<cpriority); - cl->xstats.overactions++; - } - cbq_ovl_classic(cl); -} - -/* TC_CBQ_OVL_DROP: penalize class by dropping */ - -static void cbq_ovl_drop(struct cbq_class *cl) -{ - if (cl->q->ops->drop) - if (cl->q->ops->drop(cl->q)) - cl->qdisc->q.qlen--; - cl->xstats.overactions++; - cbq_ovl_classic(cl); -} - static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio, psched_time_t now) { @@ -807,7 +706,7 @@ cbq_under_limit(struct cbq_class *cl) cl = cl->borrow; if (!cl) { this_cl->qstats.overlimits++; - this_cl->overlimit(this_cl); + cbq_overlimit(this_cl); return NULL; } if (cl->level > q->toplevel) @@ -1280,35 +1179,6 @@ static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr) return 0; } -static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl) -{ - switch (ovl->strategy) { - case TC_CBQ_OVL_CLASSIC: - cl->overlimit = cbq_ovl_classic; - break; - case TC_CBQ_OVL_DELAY: - cl->overlimit = cbq_ovl_delay; - break; - case TC_CBQ_OVL_LOWPRIO: - if (ovl->priority2 - 1 >= TC_CBQ_MAXPRIO || - ovl->priority2 - 1 <= cl->priority) - return -EINVAL; - cl->priority2 = ovl->priority2 - 1; - cl->overlimit = cbq_ovl_lowprio; - break; - case TC_CBQ_OVL_DROP: - cl->overlimit = cbq_ovl_drop; - break; - case TC_CBQ_OVL_RCLASSIC: - cl->overlimit = cbq_ovl_rclassic; - break; - default: - return -EINVAL; - } - cl->penalty = ovl->penalty; - return 0; -} - #ifdef CONFIG_NET_CLS_ACT static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p) { @@ -1375,8 +1245,6 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt) q->link.priority = TC_CBQ_MAXPRIO - 1; q->link.priority2 = TC_CBQ_MAXPRIO - 1; q->link.cpriority = TC_CBQ_MAXPRIO - 1; - q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC; - q->link.overlimit = cbq_ovl_classic; q->link.allot = psched_mtu(qdisc_dev(sch)); q->link.quantum = q->link.allot; q->link.weight = q->link.R_tab->rate.rate; @@ -1463,24 +1331,6 @@ nla_put_failure: return -1; } -static int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) -{ - unsigned char *b = skb_tail_pointer(skb); - struct tc_cbq_ovl opt; - - opt.strategy = cl->ovl_strategy; - opt.priority2 = cl->priority2 + 1; - opt.pad = 0; - opt.penalty = cl->penalty; - if (nla_put(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt)) - goto nla_put_failure; - return skb->len; - -nla_put_failure: - nlmsg_trim(skb, b); - return -1; -} - static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) { unsigned char *b = skb_tail_pointer(skb); @@ -1526,7 +1376,6 @@ static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl) if (cbq_dump_lss(skb, cl) < 0 || cbq_dump_rate(skb, cl) < 0 || cbq_dump_wrr(skb, cl) < 0 || - cbq_dump_ovl(skb, cl) < 0 || #ifdef CONFIG_NET_CLS_ACT cbq_dump_police(skb, cl) < 0 || #endif @@ -1736,6 +1585,9 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (err < 0) return err; + if (tb[TCA_CBQ_OVL_STRATEGY]) + return -EOPNOTSUPP; + if (cl) { /* Check parent */ if (parentid) { @@ -1784,9 +1636,6 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT])); } - if (tb[TCA_CBQ_OVL_STRATEGY]) - cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY])); - #ifdef CONFIG_NET_CLS_ACT if (tb[TCA_CBQ_POLICE]) cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE])); @@ -1887,9 +1736,6 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t cl->maxidle = q->link.maxidle; if (cl->avpkt == 0) cl->avpkt = q->link.avpkt; - cl->overlimit = cbq_ovl_classic; - if (tb[TCA_CBQ_OVL_STRATEGY]) - cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY])); #ifdef CONFIG_NET_CLS_ACT if (tb[TCA_CBQ_POLICE]) cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE])); -- cgit v1.2.3 From dd47c1fa776cda48531b651c88341e951140b0a7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 9 Jun 2016 00:27:40 +0200 Subject: cbq: remove TCA_CBQ_POLICE support iproute2 doesn't implement any cbq option that results in this attribute being sent to kernel. To make use of it, user would have to - patch iproute2 - add a class - attach a qdisc to the class (default pfifo doesn't work as q->handle is 0 and cbq_set_police() is a no-op in this case) - re-'add' the same class (tc class change ...) again - user must also specifiy a defmap (e.g. 'split 1:0 defmap 3f'), since this 'police' feature relies on its presence - the added qdisc must be one of bfifo, pfifo or netem If all of these conditions are met and _some_ leaf qdiscs, namely p/bfifo, netem, plug or tbf would drop a packet, kernel calls back into cbq, which will attempt to re-queue the skb into a different class as indicated by the parents' defmap entry for TC_PRIO_BESTEFFORT. [ i.e. we behave as if tc_classify returned TC_ACT_RECLASSIFY ]. This feature, which isn't documented or implemented in iproute2, and isn't implemented consistently (most qdiscs like sfq, codel, etc drop right away instead of attempting this reclassification) is the sole reason for the reshape_fail and __parent member in Qdisc struct. So remove TCA_CBQ_POLICE support from the kernel, reject it via EOPNOTSUPP so userspace knows we don't support it, and then remove no-longer needed infrastructure in followup commit. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/sched/sch_cbq.c | 95 +---------------------------------------------------- 1 file changed, 1 insertion(+), 94 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index fdca45e34230..7f4d6c5a0efe 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -80,9 +80,6 @@ struct cbq_class { unsigned char priority; /* class priority */ unsigned char priority2; /* priority to be used after overlimit */ unsigned char ewma_log; /* time constant for idle time calculation */ -#ifdef CONFIG_NET_CLS_ACT - unsigned char police; -#endif u32 defmap; @@ -377,9 +374,6 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) return ret; } -#ifdef CONFIG_NET_CLS_ACT - cl->q->__parent = sch; -#endif ret = qdisc_enqueue(skb, cl->q); if (ret == NET_XMIT_SUCCESS) { sch->q.qlen++; @@ -524,40 +518,6 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer) return HRTIMER_NORESTART; } -#ifdef CONFIG_NET_CLS_ACT -static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) -{ - struct Qdisc *sch = child->__parent; - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl = q->rx_class; - - q->rx_class = NULL; - - if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) { - int ret; - - cbq_mark_toplevel(q, cl); - - q->rx_class = cl; - cl->q->__parent = sch; - - ret = qdisc_enqueue(skb, cl->q); - if (ret == NET_XMIT_SUCCESS) { - sch->q.qlen++; - if (!cl->next_alive) - cbq_activate_class(cl); - return 0; - } - if (net_xmit_drop_count(ret)) - qdisc_qstats_drop(sch); - return 0; - } - - qdisc_qstats_drop(sch); - return -1; -} -#endif - /* * It is mission critical procedure. * @@ -1179,21 +1139,6 @@ static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr) return 0; } -#ifdef CONFIG_NET_CLS_ACT -static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p) -{ - cl->police = p->police; - - if (cl->q->handle) { - if (p->police == TC_POLICE_RECLASSIFY) - cl->q->reshape_fail = cbq_reshape_fail; - else - cl->q->reshape_fail = NULL; - } - return 0; -} -#endif - static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt) { cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange); @@ -1350,35 +1295,11 @@ nla_put_failure: return -1; } -#ifdef CONFIG_NET_CLS_ACT -static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) -{ - unsigned char *b = skb_tail_pointer(skb); - struct tc_cbq_police opt; - - if (cl->police) { - opt.police = cl->police; - opt.__res1 = 0; - opt.__res2 = 0; - if (nla_put(skb, TCA_CBQ_POLICE, sizeof(opt), &opt)) - goto nla_put_failure; - } - return skb->len; - -nla_put_failure: - nlmsg_trim(skb, b); - return -1; -} -#endif - static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl) { if (cbq_dump_lss(skb, cl) < 0 || cbq_dump_rate(skb, cl) < 0 || cbq_dump_wrr(skb, cl) < 0 || -#ifdef CONFIG_NET_CLS_ACT - cbq_dump_police(skb, cl) < 0 || -#endif cbq_dump_fopt(skb, cl) < 0) return -1; return 0; @@ -1468,11 +1389,6 @@ static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, &pfifo_qdisc_ops, cl->common.classid); if (new == NULL) return -ENOBUFS; - } else { -#ifdef CONFIG_NET_CLS_ACT - if (cl->police == TC_POLICE_RECLASSIFY) - new->reshape_fail = cbq_reshape_fail; -#endif } *old = qdisc_replace(sch, new, &cl->q); @@ -1585,7 +1501,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (err < 0) return err; - if (tb[TCA_CBQ_OVL_STRATEGY]) + if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) return -EOPNOTSUPP; if (cl) { @@ -1636,11 +1552,6 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT])); } -#ifdef CONFIG_NET_CLS_ACT - if (tb[TCA_CBQ_POLICE]) - cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE])); -#endif - if (tb[TCA_CBQ_FOPT]) cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT])); @@ -1736,10 +1647,6 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t cl->maxidle = q->link.maxidle; if (cl->avpkt == 0) cl->avpkt = q->link.avpkt; -#ifdef CONFIG_NET_CLS_ACT - if (tb[TCA_CBQ_POLICE]) - cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE])); -#endif if (tb[TCA_CBQ_FOPT]) cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT])); sch_tree_unlock(sch); -- cgit v1.2.3 From c3a173d7dba2d7c74dd4ab871b8f22bf56ac10b2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 9 Jun 2016 00:27:41 +0200 Subject: sched: remove qdisc_rehape_fail After the removal of TCA_CBQ_POLICE in cbq scheduler qdisc->reshape_fail is always NULL, i.e. qdisc_rehape_fail is now the same as qdisc_drop. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/sched/sch_fifo.c | 4 ++-- net/sched/sch_netem.c | 4 ++-- net/sched/sch_plug.c | 2 +- net/sched/sch_tbf.c | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index 2177eac0a61e..7857f748a6e4 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -24,7 +24,7 @@ static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit)) return qdisc_enqueue_tail(skb, sch); - return qdisc_reshape_fail(skb, sch); + return qdisc_drop(skb, sch); } static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch) @@ -32,7 +32,7 @@ static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (likely(skb_queue_len(&sch->q) < sch->limit)) return qdisc_enqueue_tail(skb, sch); - return qdisc_reshape_fail(skb, sch); + return qdisc_drop(skb, sch); } static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 205bed00dd34..31984c708382 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -407,7 +407,7 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch) segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { - qdisc_reshape_fail(skb, sch); + qdisc_drop(skb, sch); return NULL; } consume_skb(skb); @@ -499,7 +499,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) } if (unlikely(skb_queue_len(&sch->q) >= sch->limit)) - return qdisc_reshape_fail(skb, sch); + return qdisc_drop(skb, sch); qdisc_qstats_backlog_inc(sch, skb); diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c index 5abfe44678d4..ff0d968750df 100644 --- a/net/sched/sch_plug.c +++ b/net/sched/sch_plug.c @@ -96,7 +96,7 @@ static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch) return qdisc_enqueue_tail(skb, sch); } - return qdisc_reshape_fail(skb, sch); + return qdisc_drop(skb, sch); } static struct sk_buff *plug_dequeue(struct Qdisc *sch) diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 83b90b584fae..801148c8b6ac 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -166,7 +166,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) - return qdisc_reshape_fail(skb, sch); + return qdisc_drop(skb, sch); nb = 0; while (segs) { @@ -198,7 +198,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (qdisc_pkt_len(skb) > q->max_size) { if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size) return tbf_segment(skb, sch); - return qdisc_reshape_fail(skb, sch); + return qdisc_drop(skb, sch); } ret = qdisc_enqueue(skb, q->qdisc); if (ret != NET_XMIT_SUCCESS) { -- cgit v1.2.3 From a09ceb0e08140a1eec05b49b4c232d3481339cb0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 9 Jun 2016 00:27:42 +0200 Subject: sched: remove qdisc->drop after removal of TCA_CBQ_OVL_STRATEGY from cbq scheduler, there are no more callers of ->drop() outside of other ->drop functions, i.e. nothing calls them. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/sched/sch_atm.c | 15 --------------- net/sched/sch_cbq.c | 26 -------------------------- net/sched/sch_choke.c | 17 ----------------- net/sched/sch_drr.c | 21 --------------------- net/sched/sch_dsmark.c | 18 ------------------ net/sched/sch_fifo.c | 3 --- net/sched/sch_fq_codel.c | 10 ---------- net/sched/sch_gred.c | 35 ----------------------------------- net/sched/sch_hfsc.c | 26 -------------------------- net/sched/sch_hhf.c | 10 ---------- net/sched/sch_htb.c | 26 -------------------------- net/sched/sch_multiq.c | 22 ---------------------- net/sched/sch_netem.c | 30 ------------------------------ net/sched/sch_prio.c | 19 ------------------- net/sched/sch_qfq.c | 47 ----------------------------------------------- net/sched/sch_red.c | 20 -------------------- net/sched/sch_sfq.c | 1 - net/sched/sch_tbf.c | 13 ------------- 18 files changed, 359 deletions(-) (limited to 'net') diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 34f8f79e56d5..7e6c12dfc66a 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -519,20 +519,6 @@ static struct sk_buff *atm_tc_peek(struct Qdisc *sch) return p->link.q->ops->peek(p->link.q); } -static unsigned int atm_tc_drop(struct Qdisc *sch) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow; - unsigned int len; - - pr_debug("atm_tc_drop(sch %p,[qdisc %p])\n", sch, p); - list_for_each_entry(flow, &p->flows, list) { - if (flow->q->ops->drop && (len = flow->q->ops->drop(flow->q))) - return len; - } - return 0; -} - static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt) { struct atm_qdisc_data *p = qdisc_priv(sch); @@ -672,7 +658,6 @@ static struct Qdisc_ops atm_qdisc_ops __read_mostly = { .enqueue = atm_tc_enqueue, .dequeue = atm_tc_dequeue, .peek = atm_tc_peek, - .drop = atm_tc_drop, .init = atm_tc_init, .reset = atm_tc_reset, .destroy = atm_tc_destroy, diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 7f4d6c5a0efe..f2af31be6370 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1025,31 +1025,6 @@ static void cbq_link_class(struct cbq_class *this) } } -static unsigned int cbq_drop(struct Qdisc *sch) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl, *cl_head; - int prio; - unsigned int len; - - for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) { - cl_head = q->active[prio]; - if (!cl_head) - continue; - - cl = cl_head; - do { - if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) { - sch->q.qlen--; - if (!cl->q->q.qlen) - cbq_deactivate_class(cl); - return len; - } - } while ((cl = cl->next_alive) != cl_head); - } - return 0; -} - static void cbq_reset(struct Qdisc *sch) { @@ -1791,7 +1766,6 @@ static struct Qdisc_ops cbq_qdisc_ops __read_mostly = { .enqueue = cbq_enqueue, .dequeue = cbq_dequeue, .peek = qdisc_peek_dequeued, - .drop = cbq_drop, .init = cbq_init, .reset = cbq_reset, .destroy = cbq_destroy, diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 0a08c860eee4..04e0b0583e00 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -365,22 +365,6 @@ static struct sk_buff *choke_dequeue(struct Qdisc *sch) return skb; } -static unsigned int choke_drop(struct Qdisc *sch) -{ - struct choke_sched_data *q = qdisc_priv(sch); - unsigned int len; - - len = qdisc_queue_drop(sch); - if (len > 0) - q->stats.other++; - else { - if (!red_is_idling(&q->vars)) - red_start_of_idle_period(&q->vars); - } - - return len; -} - static void choke_reset(struct Qdisc *sch) { struct choke_sched_data *q = qdisc_priv(sch); @@ -569,7 +553,6 @@ static struct Qdisc_ops choke_qdisc_ops __read_mostly = { .enqueue = choke_enqueue, .dequeue = choke_dequeue, .peek = choke_peek_head, - .drop = choke_drop, .init = choke_init, .destroy = choke_destroy, .reset = choke_reset, diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 1b7e1a27773d..2bec94cf697c 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -421,26 +421,6 @@ out: return NULL; } -static unsigned int drr_drop(struct Qdisc *sch) -{ - struct drr_sched *q = qdisc_priv(sch); - struct drr_class *cl; - unsigned int len; - - list_for_each_entry(cl, &q->active, alist) { - if (cl->qdisc->ops->drop) { - len = cl->qdisc->ops->drop(cl->qdisc); - if (len > 0) { - sch->q.qlen--; - if (cl->qdisc->q.qlen == 0) - list_del(&cl->alist); - return len; - } - } - } - return 0; -} - static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt) { struct drr_sched *q = qdisc_priv(sch); @@ -509,7 +489,6 @@ static struct Qdisc_ops drr_qdisc_ops __read_mostly = { .enqueue = drr_enqueue, .dequeue = drr_dequeue, .peek = qdisc_peek_dequeued, - .drop = drr_drop, .init = drr_init_qdisc, .reset = drr_reset_qdisc, .destroy = drr_destroy_qdisc, diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 34b4ddaca27c..b9ba5f658528 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -320,23 +320,6 @@ static struct sk_buff *dsmark_peek(struct Qdisc *sch) return p->q->ops->peek(p->q); } -static unsigned int dsmark_drop(struct Qdisc *sch) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - unsigned int len; - - pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); - - if (p->q->ops->drop == NULL) - return 0; - - len = p->q->ops->drop(p->q); - if (len) - sch->q.qlen--; - - return len; -} - static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) { struct dsmark_qdisc_data *p = qdisc_priv(sch); @@ -489,7 +472,6 @@ static struct Qdisc_ops dsmark_qdisc_ops __read_mostly = { .enqueue = dsmark_enqueue, .dequeue = dsmark_dequeue, .peek = dsmark_peek, - .drop = dsmark_drop, .init = dsmark_init, .reset = dsmark_reset, .destroy = dsmark_destroy, diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index 7857f748a6e4..dea70e3ef0ba 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -99,7 +99,6 @@ struct Qdisc_ops pfifo_qdisc_ops __read_mostly = { .enqueue = pfifo_enqueue, .dequeue = qdisc_dequeue_head, .peek = qdisc_peek_head, - .drop = qdisc_queue_drop, .init = fifo_init, .reset = qdisc_reset_queue, .change = fifo_init, @@ -114,7 +113,6 @@ struct Qdisc_ops bfifo_qdisc_ops __read_mostly = { .enqueue = bfifo_enqueue, .dequeue = qdisc_dequeue_head, .peek = qdisc_peek_head, - .drop = qdisc_queue_drop, .init = fifo_init, .reset = qdisc_reset_queue, .change = fifo_init, @@ -129,7 +127,6 @@ struct Qdisc_ops pfifo_head_drop_qdisc_ops __read_mostly = { .enqueue = pfifo_tail_enqueue, .dequeue = qdisc_dequeue_head, .peek = qdisc_peek_head, - .drop = qdisc_queue_drop_head, .init = fifo_init, .reset = qdisc_reset_queue, .change = fifo_init, diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 1daa54237f4e..176a7e2561d7 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -184,15 +184,6 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets) return idx; } -static unsigned int fq_codel_qdisc_drop(struct Qdisc *sch) -{ - unsigned int prev_backlog; - - prev_backlog = sch->qstats.backlog; - fq_codel_drop(sch, 1U); - return prev_backlog - sch->qstats.backlog; -} - static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct fq_codel_sched_data *q = qdisc_priv(sch); @@ -704,7 +695,6 @@ static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = { .enqueue = fq_codel_enqueue, .dequeue = fq_codel_dequeue, .peek = qdisc_peek_dequeued, - .drop = fq_codel_qdisc_drop, .init = fq_codel_init, .reset = fq_codel_reset, .destroy = fq_codel_destroy, diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 80105109f756..b5fb63c7be02 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -276,40 +276,6 @@ static struct sk_buff *gred_dequeue(struct Qdisc *sch) return NULL; } -static unsigned int gred_drop(struct Qdisc *sch) -{ - struct sk_buff *skb; - struct gred_sched *t = qdisc_priv(sch); - - skb = qdisc_dequeue_tail(sch); - if (skb) { - unsigned int len = qdisc_pkt_len(skb); - struct gred_sched_data *q; - u16 dp = tc_index_to_dp(skb); - - if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { - net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x while dropping, screwing up backlog\n", - tc_index_to_dp(skb)); - } else { - q->backlog -= len; - q->stats.other++; - - if (gred_wred_mode(t)) { - if (!sch->qstats.backlog) - red_start_of_idle_period(&t->wred_set); - } else { - if (!q->backlog) - red_start_of_idle_period(&q->vars); - } - } - - qdisc_drop(skb, sch); - return len; - } - - return 0; -} - static void gred_reset(struct Qdisc *sch) { int i; @@ -623,7 +589,6 @@ static struct Qdisc_ops gred_qdisc_ops __read_mostly = { .enqueue = gred_enqueue, .dequeue = gred_dequeue, .peek = qdisc_peek_head, - .drop = gred_drop, .init = gred_init, .reset = gred_reset, .destroy = gred_destroy, diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 74813dd49053..155e3ce81270 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1677,31 +1677,6 @@ hfsc_dequeue(struct Qdisc *sch) return skb; } -static unsigned int -hfsc_drop(struct Qdisc *sch) -{ - struct hfsc_sched *q = qdisc_priv(sch); - struct hfsc_class *cl; - unsigned int len; - - list_for_each_entry(cl, &q->droplist, dlist) { - if (cl->qdisc->ops->drop != NULL && - (len = cl->qdisc->ops->drop(cl->qdisc)) > 0) { - if (cl->qdisc->q.qlen == 0) { - update_vf(cl, 0, 0); - set_passive(cl); - } else { - list_move_tail(&cl->dlist, &q->droplist); - } - cl->qstats.drops++; - qdisc_qstats_drop(sch); - sch->q.qlen--; - return len; - } - } - return 0; -} - static const struct Qdisc_class_ops hfsc_class_ops = { .change = hfsc_change_class, .delete = hfsc_delete_class, @@ -1728,7 +1703,6 @@ static struct Qdisc_ops hfsc_qdisc_ops __read_mostly = { .enqueue = hfsc_enqueue, .dequeue = hfsc_dequeue, .peek = qdisc_peek_dequeued, - .drop = hfsc_drop, .cl_ops = &hfsc_class_ops, .priv_size = sizeof(struct hfsc_sched), .owner = THIS_MODULE diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 13d6f83ec491..c51791848a38 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -368,15 +368,6 @@ static unsigned int hhf_drop(struct Qdisc *sch) return bucket - q->buckets; } -static unsigned int hhf_qdisc_drop(struct Qdisc *sch) -{ - unsigned int prev_backlog; - - prev_backlog = sch->qstats.backlog; - hhf_drop(sch); - return prev_backlog - sch->qstats.backlog; -} - static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct hhf_sched_data *q = qdisc_priv(sch); @@ -709,7 +700,6 @@ static struct Qdisc_ops hhf_qdisc_ops __read_mostly = { .enqueue = hhf_enqueue, .dequeue = hhf_dequeue, .peek = qdisc_peek_dequeued, - .drop = hhf_qdisc_drop, .init = hhf_init, .reset = hhf_reset, .destroy = hhf_destroy, diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 2b057649f24b..b74d06668ab4 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -936,31 +936,6 @@ fin: return skb; } -/* try to drop from each class (by prio) until one succeed */ -static unsigned int htb_drop(struct Qdisc *sch) -{ - struct htb_sched *q = qdisc_priv(sch); - int prio; - - for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) { - struct list_head *p; - list_for_each(p, q->drops + prio) { - struct htb_class *cl = list_entry(p, struct htb_class, - un.leaf.drop_list); - unsigned int len; - if (cl->un.leaf.q->ops->drop && - (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) { - sch->qstats.backlog -= len; - sch->q.qlen--; - if (!cl->un.leaf.q->q.qlen) - htb_deactivate(q, cl); - return len; - } - } - } - return 0; -} - /* reset all classes */ /* always caled under BH & queue lock */ static void htb_reset(struct Qdisc *sch) @@ -1600,7 +1575,6 @@ static struct Qdisc_ops htb_qdisc_ops __read_mostly = { .enqueue = htb_enqueue, .dequeue = htb_dequeue, .peek = qdisc_peek_dequeued, - .drop = htb_drop, .init = htb_init, .reset = htb_reset, .destroy = htb_destroy, diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 21e69d2e8347..5ea93305d705 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -151,27 +151,6 @@ static struct sk_buff *multiq_peek(struct Qdisc *sch) } -static unsigned int multiq_drop(struct Qdisc *sch) -{ - struct multiq_sched_data *q = qdisc_priv(sch); - int band; - unsigned int len; - struct Qdisc *qdisc; - - for (band = q->bands - 1; band >= 0; band--) { - qdisc = q->queues[band]; - if (qdisc->ops->drop) { - len = qdisc->ops->drop(qdisc); - if (len != 0) { - sch->q.qlen--; - return len; - } - } - } - return 0; -} - - static void multiq_reset(struct Qdisc *sch) { @@ -416,7 +395,6 @@ static struct Qdisc_ops multiq_qdisc_ops __read_mostly = { .enqueue = multiq_enqueue, .dequeue = multiq_dequeue, .peek = multiq_peek, - .drop = multiq_drop, .init = multiq_init, .reset = multiq_reset, .destroy = multiq_destroy, diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 31984c708382..9ca7947ab643 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -576,35 +576,6 @@ finish_segs: return NET_XMIT_SUCCESS; } -static unsigned int netem_drop(struct Qdisc *sch) -{ - struct netem_sched_data *q = qdisc_priv(sch); - unsigned int len; - - len = qdisc_queue_drop(sch); - - if (!len) { - struct rb_node *p = rb_first(&q->t_root); - - if (p) { - struct sk_buff *skb = netem_rb_to_skb(p); - - rb_erase(p, &q->t_root); - sch->q.qlen--; - skb->next = NULL; - skb->prev = NULL; - qdisc_qstats_backlog_dec(sch, skb); - kfree_skb(skb); - } - } - if (!len && q->qdisc && q->qdisc->ops->drop) - len = q->qdisc->ops->drop(q->qdisc); - if (len) - qdisc_qstats_drop(sch); - - return len; -} - static struct sk_buff *netem_dequeue(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); @@ -1143,7 +1114,6 @@ static struct Qdisc_ops netem_qdisc_ops __read_mostly = { .enqueue = netem_enqueue, .dequeue = netem_dequeue, .peek = qdisc_peek_dequeued, - .drop = netem_drop, .init = netem_init, .reset = netem_reset, .destroy = netem_destroy, diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 06eca7060683..4c77780b55c3 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -125,24 +125,6 @@ static struct sk_buff *prio_dequeue(struct Qdisc *sch) } -static unsigned int prio_drop(struct Qdisc *sch) -{ - struct prio_sched_data *q = qdisc_priv(sch); - int prio; - unsigned int len; - struct Qdisc *qdisc; - - for (prio = q->bands-1; prio >= 0; prio--) { - qdisc = q->queues[prio]; - if (qdisc->ops->drop && (len = qdisc->ops->drop(qdisc)) != 0) { - sch->q.qlen--; - return len; - } - } - return 0; -} - - static void prio_reset(struct Qdisc *sch) { @@ -379,7 +361,6 @@ static struct Qdisc_ops prio_qdisc_ops __read_mostly = { .enqueue = prio_enqueue, .dequeue = prio_dequeue, .peek = prio_peek, - .drop = prio_drop, .init = prio_init, .reset = prio_reset, .destroy = prio_destroy, diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 85d41979d825..cdb3966572f9 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1423,52 +1423,6 @@ static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) qfq_deactivate_class(q, cl); } -static unsigned int qfq_drop_from_slot(struct qfq_sched *q, - struct hlist_head *slot) -{ - struct qfq_aggregate *agg; - struct qfq_class *cl; - unsigned int len; - - hlist_for_each_entry(agg, slot, next) { - list_for_each_entry(cl, &agg->active, alist) { - - if (!cl->qdisc->ops->drop) - continue; - - len = cl->qdisc->ops->drop(cl->qdisc); - if (len > 0) { - if (cl->qdisc->q.qlen == 0) - qfq_deactivate_class(q, cl); - - return len; - } - } - } - return 0; -} - -static unsigned int qfq_drop(struct Qdisc *sch) -{ - struct qfq_sched *q = qdisc_priv(sch); - struct qfq_group *grp; - unsigned int i, j, len; - - for (i = 0; i <= QFQ_MAX_INDEX; i++) { - grp = &q->groups[i]; - for (j = 0; j < QFQ_MAX_SLOTS; j++) { - len = qfq_drop_from_slot(q, &grp->slots[j]); - if (len > 0) { - sch->q.qlen--; - return len; - } - } - - } - - return 0; -} - static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt) { struct qfq_sched *q = qdisc_priv(sch); @@ -1563,7 +1517,6 @@ static struct Qdisc_ops qfq_qdisc_ops __read_mostly = { .enqueue = qfq_enqueue, .dequeue = qfq_dequeue, .peek = qdisc_peek_dequeued, - .drop = qfq_drop, .init = qfq_init_qdisc, .reset = qfq_reset_qdisc, .destroy = qfq_destroy_qdisc, diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 8c0508c0e287..235942f3464e 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -134,25 +134,6 @@ static struct sk_buff *red_peek(struct Qdisc *sch) return child->ops->peek(child); } -static unsigned int red_drop(struct Qdisc *sch) -{ - struct red_sched_data *q = qdisc_priv(sch); - struct Qdisc *child = q->qdisc; - unsigned int len; - - if (child->ops->drop && (len = child->ops->drop(child)) > 0) { - q->stats.other++; - qdisc_qstats_drop(sch); - sch->q.qlen--; - return len; - } - - if (!red_is_idling(&q->vars)) - red_start_of_idle_period(&q->vars); - - return 0; -} - static void red_reset(struct Qdisc *sch) { struct red_sched_data *q = qdisc_priv(sch); @@ -361,7 +342,6 @@ static struct Qdisc_ops red_qdisc_ops __read_mostly = { .enqueue = red_enqueue, .dequeue = red_dequeue, .peek = red_peek, - .drop = red_drop, .init = red_init, .reset = red_reset, .destroy = red_destroy, diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 498f0a2cb47f..a2e0b855d1c8 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -896,7 +896,6 @@ static struct Qdisc_ops sfq_qdisc_ops __read_mostly = { .enqueue = sfq_enqueue, .dequeue = sfq_dequeue, .peek = qdisc_peek_dequeued, - .drop = sfq_drop, .init = sfq_init, .reset = sfq_reset, .destroy = sfq_destroy, diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 801148c8b6ac..1342e46574b2 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -211,18 +211,6 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_SUCCESS; } -static unsigned int tbf_drop(struct Qdisc *sch) -{ - struct tbf_sched_data *q = qdisc_priv(sch); - unsigned int len = 0; - - if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) { - sch->q.qlen--; - qdisc_qstats_drop(sch); - } - return len; -} - static bool tbf_peak_present(const struct tbf_sched_data *q) { return q->peak.rate_bytes_ps; @@ -555,7 +543,6 @@ static struct Qdisc_ops tbf_qdisc_ops __read_mostly = { .enqueue = tbf_enqueue, .dequeue = tbf_dequeue, .peek = qdisc_peek_dequeued, - .drop = tbf_drop, .init = tbf_init, .reset = tbf_reset, .destroy = tbf_destroy, -- cgit v1.2.3 From 56ab364f17637a6e5a67623ff1d6ed4a505025c2 Mon Sep 17 00:00:00 2001 From: Kirtika Ruchandani Date: Sun, 29 May 2016 19:54:10 -0700 Subject: nl80211: Fix spelling Fix 'implementation' spelling, reported by checkpatch.pl Signed-off-by: Kirtika Ruchandani Reviewed-by: Julian Calaby Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d12044996a0e..cf588f1a4c9f 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6552,7 +6552,7 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, nla_data(ssid), nla_len(ssid)); request->match_sets[i].ssid.ssid_len = nla_len(ssid); - /* special attribute - old implemenation w/a */ + /* special attribute - old implementation w/a */ request->match_sets[i].rssi_thold = default_match_rssi; rssi = tb[NL80211_SCHED_SCAN_MATCH_ATTR_RSSI]; -- cgit v1.2.3 From 7a087e7484c9e40203c89b11aa1508ea8d5d649f Mon Sep 17 00:00:00 2001 From: Kirtika Ruchandani Date: Sun, 29 May 2016 19:51:23 -0700 Subject: nl80211: Fix checkpatch warnings about blank lines This patch fixes the following checkpatch.pl issues - - Please don't use multiple blank lines - Blank lines aren't necessary before a close brace - Missing a blank line after declarations Reviewed-by: Julian Calaby Signed-off-by: Kirtika Ruchandani Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index cf588f1a4c9f..0d7db10c782f 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -167,6 +167,7 @@ __cfg80211_rdev_from_attrs(struct net *netns, struct nlattr **attrs) if (attrs[NL80211_ATTR_IFINDEX]) { int ifindex = nla_get_u32(attrs[NL80211_ATTR_IFINDEX]); + netdev = __dev_get_by_index(netns, ifindex); if (netdev) { if (netdev->ieee80211_ptr) @@ -731,6 +732,7 @@ static int nl80211_parse_key_new(struct nlattr *key, struct key_parse *k) if (tb[NL80211_KEY_DEFAULT_TYPES]) { struct nlattr *kdt[NUM_NL80211_KEY_DEFAULT_TYPES]; + err = nla_parse_nested(kdt, NUM_NL80211_KEY_DEFAULT_TYPES - 1, tb[NL80211_KEY_DEFAULT_TYPES], nl80211_key_default_policy); @@ -1382,6 +1384,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, rdev->ops->get_antenna) { u32 tx_ant = 0, rx_ant = 0; int res; + res = rdev_get_antenna(rdev, &tx_ant, &rx_ant); if (!res) { if (nla_put_u32(msg, @@ -2157,7 +2160,6 @@ static int nl80211_set_wds_peer(struct sk_buff *skb, struct genl_info *info) return rdev_set_wds_peer(rdev, dev, bssid); } - static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev; @@ -2292,6 +2294,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_WIPHY_ANTENNA_TX] && info->attrs[NL80211_ATTR_WIPHY_ANTENNA_RX]) { u32 tx_ant, rx_ant; + if ((!rdev->wiphy.available_antennas_tx && !rdev->wiphy.available_antennas_rx) || !rdev->ops->set_antenna) @@ -2960,6 +2963,7 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) pairwise = !!mac_addr; if (info->attrs[NL80211_ATTR_KEY_TYPE]) { u32 kt = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]); + if (kt >= NUM_NL80211_KEYTYPES) return -EINVAL; if (kt != NL80211_KEYTYPE_GROUP && @@ -4003,7 +4007,6 @@ static int nl80211_dump_station(struct sk_buff *skb, sta_idx++; } - out: cb->args[2] = sta_idx; err = skb->len; @@ -4804,7 +4807,6 @@ static int nl80211_dump_mpath(struct sk_buff *skb, path_idx++; } - out: cb->args[2] = path_idx; err = skb->len; @@ -5094,7 +5096,6 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) enum nl80211_user_reg_hint_type user_reg_hint_type; u32 owner_nlportid; - /* * You should only get this when cfg80211 hasn't yet initialized * completely when built-in to the kernel right between the time @@ -5303,7 +5304,6 @@ do { \ } \ } while (0) - if (!info->attrs[NL80211_ATTR_MESH_CONFIG]) return -EINVAL; if (nla_parse_nested(tb, NL80211_MESHCONF_ATTR_MAX, @@ -5450,7 +5450,6 @@ static int nl80211_parse_mesh_setup(struct genl_info *info, IEEE80211_PATH_METRIC_VENDOR : IEEE80211_PATH_METRIC_AIRTIME; - if (tb[NL80211_MESH_SETUP_IE]) { struct nlattr *ieattr = tb[NL80211_MESH_SETUP_IE]; @@ -6074,6 +6073,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) /* all channels */ for (band = 0; band < NUM_NL80211_BANDS; band++) { int j; + if (!wiphy->bands[band]) continue; for (j = 0; j < wiphy->bands[band]->n_channels; j++) { @@ -6483,6 +6483,7 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, /* all channels */ for (band = 0; band < NUM_NL80211_BANDS; band++) { int j; + if (!wiphy->bands[band]) continue; for (j = 0; j < wiphy->bands[band]->n_channels; j++) { @@ -7245,6 +7246,7 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) if (key.idx >= 0) { int i; bool ok = false; + for (i = 0; i < rdev->wiphy.n_cipher_suites; i++) { if (key.p.cipher == rdev->wiphy.cipher_suites[i]) { ok = true; @@ -7323,6 +7325,7 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, if (info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]) { u16 proto; + proto = nla_get_u16( info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]); settings->control_port_ethertype = cpu_to_be16(proto); @@ -8476,6 +8479,7 @@ static u32 rateset_to_mask(struct ieee80211_supported_band *sband, for (i = 0; i < rates_len; i++) { int rate = (rates[i] & 0x7f) * 5; int ridx; + for (ridx = 0; ridx < sband->n_bitrates; ridx++) { struct ieee80211_rate *srate = &sband->bitrates[ridx]; @@ -8784,7 +8788,6 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) if (params.wait < NL80211_MIN_REMAIN_ON_CHANNEL_TIME || params.wait > rdev->wiphy.max_remain_on_channel_duration) return -EINVAL; - } params.offchan = info->attrs[NL80211_ATTR_OFFCHANNEL_TX_OK]; @@ -10631,7 +10634,6 @@ int cfg80211_vendor_cmd_reply(struct sk_buff *skb) } EXPORT_SYMBOL_GPL(cfg80211_vendor_cmd_reply); - static int nl80211_set_qos_map(struct sk_buff *skb, struct genl_info *info) { @@ -12170,7 +12172,6 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, nla_put_failure: genlmsg_cancel(msg, hdr); nlmsg_free(msg); - } void nl80211_send_roamed(struct cfg80211_registered_device *rdev, @@ -12209,7 +12210,6 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, nla_put_failure: genlmsg_cancel(msg, hdr); nlmsg_free(msg); - } void nl80211_send_disconnected(struct cfg80211_registered_device *rdev, @@ -12247,7 +12247,6 @@ void nl80211_send_disconnected(struct cfg80211_registered_device *rdev, nla_put_failure: genlmsg_cancel(msg, hdr); nlmsg_free(msg); - } void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev, @@ -13589,7 +13588,6 @@ void cfg80211_crit_proto_stopped(struct wireless_dev *wdev, gfp_t gfp) if (hdr) genlmsg_cancel(msg, hdr); nlmsg_free(msg); - } EXPORT_SYMBOL(cfg80211_crit_proto_stopped); -- cgit v1.2.3 From 0662799023de4be686263b0a4f4b7910999172b9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 9 Jun 2016 10:40:09 +0200 Subject: nl80211: clarify nl80211_set_reg() success path Setting rd to NULL to avoid freeing it, just to be able to return from the function in a single place, doesn't make much sense. Return the set_regdom() return value directly. Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 0d7db10c782f..c503e96bfd5a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5836,10 +5836,8 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) } } - r = set_regdom(rd, REGD_SOURCE_CRDA); - /* set_regdom took ownership */ - rd = NULL; - + /* set_regdom takes ownership of rd */ + return set_regdom(rd, REGD_SOURCE_CRDA); bad_reg: kfree(rd); return r; -- cgit v1.2.3 From 80a83cfc434b1e3afe38974570b460db4898bec6 Mon Sep 17 00:00:00 2001 From: Michal Kazior Date: Thu, 19 May 2016 10:37:48 +0200 Subject: mac80211: skip netdev queue control with software queuing Qdiscs are designed with no regard to 802.11 aggregation requirements and hand out packet-by-packet with no guarantee they are destined to the same tid. This does more bad than good no matter how fairly a given qdisc may behave on an ethernet interface. Software queuing used per-AC netdev subqueue congestion control whenever a global AC limit was hit. This meant in practice a single station or tid queue could starve others rather easily. This could resonate with qdiscs in a bad way or could just end up with poor aggregation performance. Increasing the AC limit would increase induced latency which is also bad. Disabling qdiscs by default and performing taildrop instead of netdev subqueue congestion control on the other hand makes it possible for tid queues to fill up "in the meantime" while preventing stations starving each other. This increases aggregation opportunities and should allow software queuing based drivers achieve better performance by utilizing airtime more efficiently with big aggregates. Signed-off-by: Michal Kazior Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/iface.c | 18 +++++++++-- net/mac80211/main.c | 3 -- net/mac80211/sta_info.c | 2 +- net/mac80211/tx.c | 75 ++++++++++++++++++++++++++-------------------- net/mac80211/util.c | 11 +++---- 6 files changed, 66 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 9438c9406687..634603320374 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -856,7 +856,7 @@ struct ieee80211_sub_if_data { bool control_port_no_encrypt; int encrypt_headroom; - atomic_t txqs_len[IEEE80211_NUM_ACS]; + atomic_t num_tx_queued; struct ieee80211_tx_queue_params tx_conf[IEEE80211_NUM_ACS]; struct mac80211_qos_map __rcu *qos_map; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index c59af3eb9fa4..609c5174d798 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -976,13 +976,13 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, if (sdata->vif.txq) { struct txq_info *txqi = to_txq_info(sdata->vif.txq); + int n = skb_queue_len(&txqi->queue); spin_lock_bh(&txqi->queue.lock); ieee80211_purge_tx_queue(&local->hw, &txqi->queue); + atomic_sub(n, &sdata->num_tx_queued); txqi->byte_cnt = 0; spin_unlock_bh(&txqi->queue.lock); - - atomic_set(&sdata->txqs_len[txqi->txq.ac], 0); } if (local->open_count == 0) @@ -1198,6 +1198,12 @@ static void ieee80211_if_setup(struct net_device *dev) dev->destructor = ieee80211_if_free; } +static void ieee80211_if_setup_no_queue(struct net_device *dev) +{ + ieee80211_if_setup(dev); + dev->priv_flags |= IFF_NO_QUEUE; +} + static void ieee80211_iface_work(struct work_struct *work) { struct ieee80211_sub_if_data *sdata = @@ -1707,6 +1713,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, struct net_device *ndev = NULL; struct ieee80211_sub_if_data *sdata = NULL; struct txq_info *txqi; + void (*if_setup)(struct net_device *dev); int ret, i; int txqs = 1; @@ -1734,12 +1741,17 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, txq_size += sizeof(struct txq_info) + local->hw.txq_data_size; + if (local->ops->wake_tx_queue) + if_setup = ieee80211_if_setup_no_queue; + else + if_setup = ieee80211_if_setup; + if (local->hw.queues >= IEEE80211_NUM_ACS) txqs = IEEE80211_NUM_ACS; ndev = alloc_netdev_mqs(size + txq_size, name, name_assign_type, - ieee80211_if_setup, txqs, 1); + if_setup, txqs, 1); if (!ndev) return -ENOMEM; dev_net_set(ndev, wiphy_net(local->hw.wiphy)); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 7ee91d6151d1..160ac6b8b9a1 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1055,9 +1055,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) local->dynamic_ps_forced_timeout = -1; - if (!local->hw.txq_ac_max_pending) - local->hw.txq_ac_max_pending = 64; - result = ieee80211_wep_init(local); if (result < 0) wiphy_debug(local->hw.wiphy, "Failed to initialize wep: %d\n", diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 5ccfdbd406bd..177cc6cd6416 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -116,7 +116,7 @@ static void __cleanup_single_sta(struct sta_info *sta) int n = skb_queue_len(&txqi->queue); ieee80211_purge_tx_queue(&local->hw, &txqi->queue); - atomic_sub(n, &sdata->txqs_len[txqi->txq.ac]); + atomic_sub(n, &sdata->num_tx_queued); txqi->byte_cnt = 0; } } diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 203044379ce0..3e77da195ce8 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1236,27 +1236,21 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, return TX_CONTINUE; } -static void ieee80211_drv_tx(struct ieee80211_local *local, - struct ieee80211_vif *vif, - struct ieee80211_sta *pubsta, - struct sk_buff *skb) +static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, + struct ieee80211_vif *vif, + struct ieee80211_sta *pubsta, + struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; - struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - struct ieee80211_tx_control control = { - .sta = pubsta, - }; struct ieee80211_txq *txq = NULL; - struct txq_info *txqi; - u8 ac; if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) || (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)) - goto tx_normal; + return NULL; if (!ieee80211_is_data(hdr->frame_control)) - goto tx_normal; + return NULL; if (pubsta) { u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; @@ -1267,25 +1261,28 @@ static void ieee80211_drv_tx(struct ieee80211_local *local, } if (!txq) - goto tx_normal; + return NULL; - ac = txq->ac; - txqi = to_txq_info(txq); - atomic_inc(&sdata->txqs_len[ac]); - if (atomic_read(&sdata->txqs_len[ac]) >= local->hw.txq_ac_max_pending) - netif_stop_subqueue(sdata->dev, ac); + return to_txq_info(txq); +} - spin_lock_bh(&txqi->queue.lock); - txqi->byte_cnt += skb->len; - __skb_queue_tail(&txqi->queue, skb); - spin_unlock_bh(&txqi->queue.lock); +static void ieee80211_txq_enqueue(struct ieee80211_local *local, + struct txq_info *txqi, + struct sk_buff *skb) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(txqi->txq.vif); - drv_wake_tx_queue(local, txqi); + lockdep_assert_held(&txqi->queue.lock); - return; + if (atomic_read(&sdata->num_tx_queued) >= TOTAL_MAX_TX_BUFFER || + txqi->queue.qlen >= STA_MAX_TX_BUFFER) { + ieee80211_free_txskb(&local->hw, skb); + return; + } -tx_normal: - drv_tx(local, &control, skb); + atomic_inc(&sdata->num_tx_queued); + txqi->byte_cnt += skb->len; + __skb_queue_tail(&txqi->queue, skb); } struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, @@ -1296,7 +1293,6 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, struct txq_info *txqi = container_of(txq, struct txq_info, txq); struct ieee80211_hdr *hdr; struct sk_buff *skb = NULL; - u8 ac = txq->ac; spin_lock_bh(&txqi->queue.lock); @@ -1307,12 +1303,9 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, if (!skb) goto out; + atomic_dec(&sdata->num_tx_queued); txqi->byte_cnt -= skb->len; - atomic_dec(&sdata->txqs_len[ac]); - if (__netif_subqueue_stopped(sdata->dev, ac)) - ieee80211_propagate_queue_wake(local, sdata->vif.hw_queue[ac]); - hdr = (struct ieee80211_hdr *)skb->data; if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) { struct sta_info *sta = container_of(txq->sta, struct sta_info, @@ -1343,7 +1336,9 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local, struct sk_buff_head *skbs, bool txpending) { + struct ieee80211_tx_control control = {}; struct sk_buff *skb, *tmp; + struct txq_info *txqi; unsigned long flags; skb_queue_walk_safe(skbs, skb, tmp) { @@ -1358,6 +1353,21 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local, } #endif + txqi = ieee80211_get_txq(local, vif, sta, skb); + if (txqi) { + info->control.vif = vif; + + __skb_unlink(skb, skbs); + + spin_lock_bh(&txqi->queue.lock); + ieee80211_txq_enqueue(local, txqi, skb); + spin_unlock_bh(&txqi->queue.lock); + + drv_wake_tx_queue(local, txqi); + + continue; + } + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); if (local->queue_stop_reasons[q] || (!txpending && !skb_queue_empty(&local->pending[q]))) { @@ -1400,9 +1410,10 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local, spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); info->control.vif = vif; + control.sta = sta; __skb_unlink(skb, skbs); - ieee80211_drv_tx(local, vif, sta, skb); + drv_tx(local, &control, skb); } return true; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 905003f75c4d..0db46442bdcf 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -244,6 +244,9 @@ void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue) struct ieee80211_sub_if_data *sdata; int n_acs = IEEE80211_NUM_ACS; + if (local->ops->wake_tx_queue) + return; + if (local->hw.queues < IEEE80211_NUM_ACS) n_acs = 1; @@ -260,11 +263,6 @@ void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue) for (ac = 0; ac < n_acs; ac++) { int ac_queue = sdata->vif.hw_queue[ac]; - if (local->ops->wake_tx_queue && - (atomic_read(&sdata->txqs_len[ac]) > - local->hw.txq_ac_max_pending)) - continue; - if (ac_queue == queue || (sdata->vif.cab_queue == queue && local->queue_stop_reasons[ac_queue] == 0 && @@ -352,6 +350,9 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue, if (__test_and_set_bit(reason, &local->queue_stop_reasons[queue])) return; + if (local->ops->wake_tx_queue) + return; + if (local->hw.queues < IEEE80211_NUM_ACS) n_acs = 1; -- cgit v1.2.3 From fa962b92120bb70693a4db545f89067eb3373294 Mon Sep 17 00:00:00 2001 From: Michal Kazior Date: Thu, 19 May 2016 10:37:49 +0200 Subject: mac80211: implement fair queueing per txq mac80211's software queues were designed to work very closely with device tx queues. They are required to make use of 802.11 packet aggregation easily and efficiently. Due to the way 802.11 aggregation is designed it only makes sense to keep fair queuing as close to hardware as possible to reduce induced latency and inertia and provide the best flow responsiveness. This change doesn't translate directly to immediate and significant gains. End result depends on driver's induced latency. Best results can be achieved if driver keeps its own tx queue/fifo fill level to a minimum. Signed-off-by: Michal Kazior Signed-off-by: Johannes Berg --- net/mac80211/agg-tx.c | 8 ++- net/mac80211/ieee80211_i.h | 24 ++++++-- net/mac80211/iface.c | 12 ++-- net/mac80211/main.c | 7 +++ net/mac80211/rx.c | 2 +- net/mac80211/sta_info.c | 14 ++--- net/mac80211/tx.c | 136 ++++++++++++++++++++++++++++++++++++++------- net/mac80211/util.c | 23 +------- 8 files changed, 162 insertions(+), 64 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 42fa81031dfa..5650c46bf91a 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -194,17 +194,21 @@ static void ieee80211_agg_stop_txq(struct sta_info *sta, int tid) { struct ieee80211_txq *txq = sta->sta.txq[tid]; + struct ieee80211_sub_if_data *sdata; + struct fq *fq; struct txq_info *txqi; if (!txq) return; txqi = to_txq_info(txq); + sdata = vif_to_sdata(txq->vif); + fq = &sdata->local->fq; /* Lock here to protect against further seqno updates on dequeue */ - spin_lock_bh(&txqi->queue.lock); + spin_lock_bh(&fq->lock); set_bit(IEEE80211_TXQ_STOP, &txqi->flags); - spin_unlock_bh(&txqi->queue.lock); + spin_unlock_bh(&fq->lock); } static void diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 634603320374..6f8375f1df88 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -30,6 +30,7 @@ #include #include #include +#include #include "key.h" #include "sta_info.h" #include "debug.h" @@ -805,10 +806,17 @@ enum txq_info_flags { IEEE80211_TXQ_NO_AMSDU, }; +/** + * struct txq_info - per tid queue + * + * @tin: contains packets split into multiple flows + * @def_flow: used as a fallback flow when a packet destined to @tin hashes to + * a fq_flow which is already owned by a different tin + */ struct txq_info { - struct sk_buff_head queue; + struct fq_tin tin; + struct fq_flow def_flow; unsigned long flags; - unsigned long byte_cnt; /* keep last! */ struct ieee80211_txq txq; @@ -1099,6 +1107,8 @@ struct ieee80211_local { * it first anyway so they become a no-op */ struct ieee80211_hw hw; + struct fq fq; + const struct ieee80211_ops *ops; /* @@ -1931,9 +1941,13 @@ static inline bool ieee80211_can_run_worker(struct ieee80211_local *local) return true; } -void ieee80211_init_tx_queue(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, - struct txq_info *txq, int tid); +int ieee80211_txq_setup_flows(struct ieee80211_local *local); +void ieee80211_txq_teardown_flows(struct ieee80211_local *local); +void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct txq_info *txq, int tid); +void ieee80211_txq_purge(struct ieee80211_local *local, + struct txq_info *txqi); void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, u16 transaction, u16 auth_alg, u16 status, const u8 *extra, size_t extra_len, const u8 *bssid, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 609c5174d798..b123a9e325b3 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -779,6 +779,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_down) { struct ieee80211_local *local = sdata->local; + struct fq *fq = &local->fq; unsigned long flags; struct sk_buff *skb, *tmp; u32 hw_reconf_flags = 0; @@ -976,13 +977,10 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, if (sdata->vif.txq) { struct txq_info *txqi = to_txq_info(sdata->vif.txq); - int n = skb_queue_len(&txqi->queue); - spin_lock_bh(&txqi->queue.lock); - ieee80211_purge_tx_queue(&local->hw, &txqi->queue); - atomic_sub(n, &sdata->num_tx_queued); - txqi->byte_cnt = 0; - spin_unlock_bh(&txqi->queue.lock); + spin_lock_bh(&fq->lock); + ieee80211_txq_purge(local, txqi); + spin_unlock_bh(&fq->lock); } if (local->open_count == 0) @@ -1792,7 +1790,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, if (txq_size) { txqi = netdev_priv(ndev) + size; - ieee80211_init_tx_queue(sdata, NULL, txqi, 0); + ieee80211_txq_init(sdata, NULL, txqi, 0); } sdata->dev = ndev; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 160ac6b8b9a1..d00ea9b13f49 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1086,6 +1086,10 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) rtnl_unlock(); + result = ieee80211_txq_setup_flows(local); + if (result) + goto fail_flows; + #ifdef CONFIG_INET local->ifa_notifier.notifier_call = ieee80211_ifa_changed; result = register_inetaddr_notifier(&local->ifa_notifier); @@ -1111,6 +1115,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) #if defined(CONFIG_INET) || defined(CONFIG_IPV6) fail_ifa: #endif + ieee80211_txq_teardown_flows(local); + fail_flows: rtnl_lock(); rate_control_deinitialize(local); ieee80211_remove_interfaces(local); @@ -1169,6 +1175,7 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) skb_queue_purge(&local->skb_queue); skb_queue_purge(&local->skb_queue_unreliable); skb_queue_purge(&local->skb_queue_tdls_chsw); + ieee80211_txq_teardown_flows(local); destroy_workqueue(local->workqueue); wiphy_unregister(local->hw.wiphy); diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 5e65e838992a..9a1eb70cb120 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1268,7 +1268,7 @@ static void sta_ps_start(struct sta_info *sta) for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]); - if (!skb_queue_len(&txqi->queue)) + if (!txqi->tin.backlog_packets) set_bit(tid, &sta->txq_buffered_tids); else clear_bit(tid, &sta->txq_buffered_tids); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 177cc6cd6416..76b737dcc36f 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -90,6 +90,7 @@ static void __cleanup_single_sta(struct sta_info *sta) struct tid_ampdu_tx *tid_tx; struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; + struct fq *fq = &local->fq; struct ps_data *ps; if (test_sta_flag(sta, WLAN_STA_PS_STA) || @@ -113,11 +114,10 @@ static void __cleanup_single_sta(struct sta_info *sta) if (sta->sta.txq[0]) { for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { struct txq_info *txqi = to_txq_info(sta->sta.txq[i]); - int n = skb_queue_len(&txqi->queue); - ieee80211_purge_tx_queue(&local->hw, &txqi->queue); - atomic_sub(n, &sdata->num_tx_queued); - txqi->byte_cnt = 0; + spin_lock_bh(&fq->lock); + ieee80211_txq_purge(local, txqi); + spin_unlock_bh(&fq->lock); } } @@ -368,7 +368,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { struct txq_info *txq = txq_data + i * size; - ieee80211_init_tx_queue(sdata, sta, txq, i); + ieee80211_txq_init(sdata, sta, txq, i); } } @@ -1211,7 +1211,7 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { struct txq_info *txqi = to_txq_info(sta->sta.txq[i]); - if (!skb_queue_len(&txqi->queue)) + if (!txqi->tin.backlog_packets) continue; drv_wake_tx_queue(local, txqi); @@ -1648,7 +1648,7 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]); - if (!(tids & BIT(tid)) || skb_queue_len(&txqi->queue)) + if (!(tids & BIT(tid)) || txqi->tin.backlog_packets) continue; sta_info_recalc_tim(sta); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 3e77da195ce8..1d8343fca6d4 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "ieee80211_i.h" #include "driver-ops.h" @@ -1266,46 +1267,121 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, return to_txq_info(txq); } +static struct sk_buff *fq_tin_dequeue_func(struct fq *fq, + struct fq_tin *tin, + struct fq_flow *flow) +{ + return fq_flow_dequeue(fq, flow); +} + +static void fq_skb_free_func(struct fq *fq, + struct fq_tin *tin, + struct fq_flow *flow, + struct sk_buff *skb) +{ + struct ieee80211_local *local; + + local = container_of(fq, struct ieee80211_local, fq); + ieee80211_free_txskb(&local->hw, skb); +} + +static struct fq_flow *fq_flow_get_default_func(struct fq *fq, + struct fq_tin *tin, + int idx, + struct sk_buff *skb) +{ + struct txq_info *txqi; + + txqi = container_of(tin, struct txq_info, tin); + return &txqi->def_flow; +} + static void ieee80211_txq_enqueue(struct ieee80211_local *local, struct txq_info *txqi, struct sk_buff *skb) { - struct ieee80211_sub_if_data *sdata = vif_to_sdata(txqi->txq.vif); + struct fq *fq = &local->fq; + struct fq_tin *tin = &txqi->tin; - lockdep_assert_held(&txqi->queue.lock); + fq_tin_enqueue(fq, tin, skb, + fq_skb_free_func, + fq_flow_get_default_func); +} - if (atomic_read(&sdata->num_tx_queued) >= TOTAL_MAX_TX_BUFFER || - txqi->queue.qlen >= STA_MAX_TX_BUFFER) { - ieee80211_free_txskb(&local->hw, skb); - return; +void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct txq_info *txqi, int tid) +{ + fq_tin_init(&txqi->tin); + fq_flow_init(&txqi->def_flow); + + txqi->txq.vif = &sdata->vif; + + if (sta) { + txqi->txq.sta = &sta->sta; + sta->sta.txq[tid] = &txqi->txq; + txqi->txq.tid = tid; + txqi->txq.ac = ieee802_1d_to_ac[tid & 7]; + } else { + sdata->vif.txq = &txqi->txq; + txqi->txq.tid = 0; + txqi->txq.ac = IEEE80211_AC_BE; } +} - atomic_inc(&sdata->num_tx_queued); - txqi->byte_cnt += skb->len; - __skb_queue_tail(&txqi->queue, skb); +void ieee80211_txq_purge(struct ieee80211_local *local, + struct txq_info *txqi) +{ + struct fq *fq = &local->fq; + struct fq_tin *tin = &txqi->tin; + + fq_tin_reset(fq, tin, fq_skb_free_func); +} + +int ieee80211_txq_setup_flows(struct ieee80211_local *local) +{ + struct fq *fq = &local->fq; + int ret; + + if (!local->ops->wake_tx_queue) + return 0; + + ret = fq_init(fq, 4096); + if (ret) + return ret; + + return 0; +} + +void ieee80211_txq_teardown_flows(struct ieee80211_local *local) +{ + struct fq *fq = &local->fq; + + if (!local->ops->wake_tx_queue) + return; + + fq_reset(fq, fq_skb_free_func); } struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, struct ieee80211_txq *txq) { struct ieee80211_local *local = hw_to_local(hw); - struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->vif); struct txq_info *txqi = container_of(txq, struct txq_info, txq); struct ieee80211_hdr *hdr; struct sk_buff *skb = NULL; + struct fq *fq = &local->fq; + struct fq_tin *tin = &txqi->tin; - spin_lock_bh(&txqi->queue.lock); + spin_lock_bh(&fq->lock); if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags)) goto out; - skb = __skb_dequeue(&txqi->queue); + skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func); if (!skb) goto out; - atomic_dec(&sdata->num_tx_queued); - txqi->byte_cnt -= skb->len; - hdr = (struct ieee80211_hdr *)skb->data; if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) { struct sta_info *sta = container_of(txq->sta, struct sta_info, @@ -1320,7 +1396,7 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, } out: - spin_unlock_bh(&txqi->queue.lock); + spin_unlock_bh(&fq->lock); if (skb && skb_has_frag_list(skb) && !ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) @@ -1337,6 +1413,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local, bool txpending) { struct ieee80211_tx_control control = {}; + struct fq *fq = &local->fq; struct sk_buff *skb, *tmp; struct txq_info *txqi; unsigned long flags; @@ -1359,9 +1436,9 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local, __skb_unlink(skb, skbs); - spin_lock_bh(&txqi->queue.lock); + spin_lock_bh(&fq->lock); ieee80211_txq_enqueue(local, txqi, skb); - spin_unlock_bh(&txqi->queue.lock); + spin_unlock_bh(&fq->lock); drv_wake_tx_queue(local, txqi); @@ -2893,6 +2970,9 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_local *local = sdata->local; + struct fq *fq = &local->fq; + struct fq_tin *tin; + struct fq_flow *flow; u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; struct ieee80211_txq *txq = sta->sta.txq[tid]; struct txq_info *txqi; @@ -2904,6 +2984,7 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, __be16 len; void *data; bool ret = false; + unsigned int orig_len; int n = 1, nfrags; if (!ieee80211_hw_check(&local->hw, TX_AMSDU)) @@ -2920,12 +3001,20 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, max_amsdu_len = min_t(int, max_amsdu_len, sta->sta.max_rc_amsdu_len); - spin_lock_bh(&txqi->queue.lock); + spin_lock_bh(&fq->lock); - head = skb_peek_tail(&txqi->queue); + /* TODO: Ideally aggregation should be done on dequeue to remain + * responsive to environment changes. + */ + + tin = &txqi->tin; + flow = fq_flow_classify(fq, tin, skb, fq_flow_get_default_func); + head = skb_peek_tail(&flow->queue); if (!head) goto out; + orig_len = head->len; + if (skb->len + head->len > max_amsdu_len) goto out; @@ -2964,8 +3053,13 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, head->data_len += skb->len; *frag_tail = skb; + flow->backlog += head->len - orig_len; + tin->backlog_bytes += head->len - orig_len; + + fq_recalc_backlog(fq, tin, flow); + out: - spin_unlock_bh(&txqi->queue.lock); + spin_unlock_bh(&fq->lock); return ret; } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 0db46442bdcf..42bf0b6685e8 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3389,25 +3389,6 @@ u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo) return buf; } -void ieee80211_init_tx_queue(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, - struct txq_info *txqi, int tid) -{ - skb_queue_head_init(&txqi->queue); - txqi->txq.vif = &sdata->vif; - - if (sta) { - txqi->txq.sta = &sta->sta; - sta->sta.txq[tid] = &txqi->txq; - txqi->txq.tid = tid; - txqi->txq.ac = ieee802_1d_to_ac[tid & 7]; - } else { - sdata->vif.txq = &txqi->txq; - txqi->txq.tid = 0; - txqi->txq.ac = IEEE80211_AC_BE; - } -} - void ieee80211_txq_get_depth(struct ieee80211_txq *txq, unsigned long *frame_cnt, unsigned long *byte_cnt) @@ -3415,9 +3396,9 @@ void ieee80211_txq_get_depth(struct ieee80211_txq *txq, struct txq_info *txqi = to_txq_info(txq); if (frame_cnt) - *frame_cnt = txqi->queue.qlen; + *frame_cnt = txqi->tin.backlog_packets; if (byte_cnt) - *byte_cnt = txqi->byte_cnt; + *byte_cnt = txqi->tin.backlog_bytes; } EXPORT_SYMBOL(ieee80211_txq_get_depth); -- cgit v1.2.3 From 9399b86c0e9a058928e8b5f1e695056714814873 Mon Sep 17 00:00:00 2001 From: Michal Kazior Date: Thu, 19 May 2016 10:37:50 +0200 Subject: mac80211: add debug knobs for fair queuing This adds a debugfs entry to read and modify some fq parameters. This makes it easy to debug, test and experiment. Signed-off-by: Michal Kazior [remove module parameter for now] Signed-off-by: Johannes Berg --- net/mac80211/debugfs.c | 173 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) (limited to 'net') diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index b251b2f7f8dd..2906c1004e1a 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -10,6 +10,7 @@ #include #include +#include #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" @@ -70,6 +71,177 @@ DEBUGFS_READONLY_FILE(wep_iv, "%#08x", DEBUGFS_READONLY_FILE(rate_ctrl_alg, "%s", local->rate_ctrl ? local->rate_ctrl->ops->name : "hw/driver"); +struct aqm_info { + struct ieee80211_local *local; + size_t size; + size_t len; + unsigned char buf[0]; +}; + +#define AQM_HDR_LEN 200 +#define AQM_HW_ENTRY_LEN 40 +#define AQM_TXQ_ENTRY_LEN 110 + +static int aqm_open(struct inode *inode, struct file *file) +{ + struct ieee80211_local *local = inode->i_private; + struct ieee80211_sub_if_data *sdata; + struct sta_info *sta; + struct txq_info *txqi; + struct fq *fq = &local->fq; + struct aqm_info *info = NULL; + int len = 0; + int i; + + if (!local->ops->wake_tx_queue) + return -EOPNOTSUPP; + + len += AQM_HDR_LEN; + len += 6 * AQM_HW_ENTRY_LEN; + + rcu_read_lock(); + list_for_each_entry_rcu(sdata, &local->interfaces, list) + len += AQM_TXQ_ENTRY_LEN; + list_for_each_entry_rcu(sta, &local->sta_list, list) + len += AQM_TXQ_ENTRY_LEN * ARRAY_SIZE(sta->sta.txq); + rcu_read_unlock(); + + info = vmalloc(len); + if (!info) + return -ENOMEM; + + spin_lock_bh(&local->fq.lock); + rcu_read_lock(); + + file->private_data = info; + info->local = local; + info->size = len; + len = 0; + + len += scnprintf(info->buf + len, info->size - len, + "* hw\n" + "access name value\n" + "R fq_flows_cnt %u\n" + "R fq_backlog %u\n" + "R fq_overlimit %u\n" + "R fq_collisions %u\n" + "RW fq_limit %u\n" + "RW fq_quantum %u\n", + fq->flows_cnt, + fq->backlog, + fq->overlimit, + fq->collisions, + fq->limit, + fq->quantum); + + len += scnprintf(info->buf + len, + info->size - len, + "* vif\n" + "ifname addr ac backlog-bytes backlog-packets flows overlimit collisions tx-bytes tx-packets\n"); + + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + txqi = to_txq_info(sdata->vif.txq); + len += scnprintf(info->buf + len, info->size - len, + "%s %pM %u %u %u %u %u %u %u %u\n", + sdata->name, + sdata->vif.addr, + txqi->txq.ac, + txqi->tin.backlog_bytes, + txqi->tin.backlog_packets, + txqi->tin.flows, + txqi->tin.overlimit, + txqi->tin.collisions, + txqi->tin.tx_bytes, + txqi->tin.tx_packets); + } + + len += scnprintf(info->buf + len, + info->size - len, + "* sta\n" + "ifname addr tid ac backlog-bytes backlog-packets flows overlimit collisions tx-bytes tx-packets\n"); + + list_for_each_entry_rcu(sta, &local->sta_list, list) { + sdata = sta->sdata; + for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { + txqi = to_txq_info(sta->sta.txq[i]); + len += scnprintf(info->buf + len, info->size - len, + "%s %pM %d %d %u %u %u %u %u %u %u\n", + sdata->name, + sta->sta.addr, + txqi->txq.tid, + txqi->txq.ac, + txqi->tin.backlog_bytes, + txqi->tin.backlog_packets, + txqi->tin.flows, + txqi->tin.overlimit, + txqi->tin.collisions, + txqi->tin.tx_bytes, + txqi->tin.tx_packets); + } + } + + info->len = len; + + rcu_read_unlock(); + spin_unlock_bh(&local->fq.lock); + + return 0; +} + +static int aqm_release(struct inode *inode, struct file *file) +{ + vfree(file->private_data); + return 0; +} + +static ssize_t aqm_read(struct file *file, + char __user *user_buf, + size_t count, + loff_t *ppos) +{ + struct aqm_info *info = file->private_data; + + return simple_read_from_buffer(user_buf, count, ppos, + info->buf, info->len); +} + +static ssize_t aqm_write(struct file *file, + const char __user *user_buf, + size_t count, + loff_t *ppos) +{ + struct aqm_info *info = file->private_data; + struct ieee80211_local *local = info->local; + char buf[100]; + size_t len; + + if (count > sizeof(buf)) + return -EINVAL; + + if (copy_from_user(buf, user_buf, count)) + return -EFAULT; + + buf[sizeof(buf) - 1] = '\0'; + len = strlen(buf); + if (len > 0 && buf[len-1] == '\n') + buf[len-1] = 0; + + if (sscanf(buf, "fq_limit %u", &local->fq.limit) == 1) + return count; + else if (sscanf(buf, "fq_quantum %u", &local->fq.quantum) == 1) + return count; + + return -EINVAL; +} + +static const struct file_operations aqm_ops = { + .write = aqm_write, + .read = aqm_read, + .open = aqm_open, + .release = aqm_release, + .llseek = default_llseek, +}; + #ifdef CONFIG_PM static ssize_t reset_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) @@ -256,6 +428,7 @@ void debugfs_hw_add(struct ieee80211_local *local) DEBUGFS_ADD(hwflags); DEBUGFS_ADD(user_power); DEBUGFS_ADD(power); + DEBUGFS_ADD_MODE(aqm, 0600); statsd = debugfs_create_dir("statistics", phyd); -- cgit v1.2.3 From 5caa328e3811b7cfa33fd02c93280ffa622deb0e Mon Sep 17 00:00:00 2001 From: Michal Kazior Date: Thu, 19 May 2016 10:37:51 +0200 Subject: mac80211: implement codel on fair queuing flows There is no other limit other than a global packet count limit when using software queuing. This means a single flow queue can grow insanely long. This is particularly bad for TCP congestion algorithms which requires a little more sophisticated frame dropping scheme than a mere headdrop on limit overflow. Hence apply (a slighly modified, to fit the knobs) CoDel5 on flow queues. This improves TCP convergence and stability when combined with wireless driver which keeps its own tx queue/fifo at a minimum fill level for given link conditions. Signed-off-by: Michal Kazior Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 5 +++ net/mac80211/tx.c | 109 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 113 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 6f8375f1df88..54edfb6fc1d1 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -812,10 +812,12 @@ enum txq_info_flags { * @tin: contains packets split into multiple flows * @def_flow: used as a fallback flow when a packet destined to @tin hashes to * a fq_flow which is already owned by a different tin + * @def_cvars: codel vars for @def_flow */ struct txq_info { struct fq_tin tin; struct fq_flow def_flow; + struct codel_vars def_cvars; unsigned long flags; /* keep last! */ @@ -1108,6 +1110,9 @@ struct ieee80211_local { struct ieee80211_hw hw; struct fq fq; + struct codel_vars *cvars; + struct codel_params cparams; + struct codel_stats cstats; const struct ieee80211_ops *ops; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 1d8343fca6d4..44ec605a5682 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #include #include @@ -1267,11 +1269,92 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, return to_txq_info(txq); } +static void ieee80211_set_skb_enqueue_time(struct sk_buff *skb) +{ + IEEE80211_SKB_CB(skb)->control.enqueue_time = codel_get_time(); +} + +static void ieee80211_set_skb_vif(struct sk_buff *skb, struct txq_info *txqi) +{ + IEEE80211_SKB_CB(skb)->control.vif = txqi->txq.vif; +} + +static u32 codel_skb_len_func(const struct sk_buff *skb) +{ + return skb->len; +} + +static codel_time_t codel_skb_time_func(const struct sk_buff *skb) +{ + const struct ieee80211_tx_info *info; + + info = (const struct ieee80211_tx_info *)skb->cb; + return info->control.enqueue_time; +} + +static struct sk_buff *codel_dequeue_func(struct codel_vars *cvars, + void *ctx) +{ + struct ieee80211_local *local; + struct txq_info *txqi; + struct fq *fq; + struct fq_flow *flow; + + txqi = ctx; + local = vif_to_sdata(txqi->txq.vif)->local; + fq = &local->fq; + + if (cvars == &txqi->def_cvars) + flow = &txqi->def_flow; + else + flow = &fq->flows[cvars - local->cvars]; + + return fq_flow_dequeue(fq, flow); +} + +static void codel_drop_func(struct sk_buff *skb, + void *ctx) +{ + struct ieee80211_local *local; + struct ieee80211_hw *hw; + struct txq_info *txqi; + + txqi = ctx; + local = vif_to_sdata(txqi->txq.vif)->local; + hw = &local->hw; + + ieee80211_free_txskb(hw, skb); +} + static struct sk_buff *fq_tin_dequeue_func(struct fq *fq, struct fq_tin *tin, struct fq_flow *flow) { - return fq_flow_dequeue(fq, flow); + struct ieee80211_local *local; + struct txq_info *txqi; + struct codel_vars *cvars; + struct codel_params *cparams; + struct codel_stats *cstats; + + local = container_of(fq, struct ieee80211_local, fq); + txqi = container_of(tin, struct txq_info, tin); + cparams = &local->cparams; + cstats = &local->cstats; + + if (flow == &txqi->def_flow) + cvars = &txqi->def_cvars; + else + cvars = &local->cvars[flow - fq->flows]; + + return codel_dequeue(txqi, + &flow->backlog, + cparams, + cvars, + cstats, + codel_skb_len_func, + codel_skb_time_func, + codel_drop_func, + codel_dequeue_func); } static void fq_skb_free_func(struct fq *fq, @@ -1303,6 +1386,7 @@ static void ieee80211_txq_enqueue(struct ieee80211_local *local, struct fq *fq = &local->fq; struct fq_tin *tin = &txqi->tin; + ieee80211_set_skb_enqueue_time(skb); fq_tin_enqueue(fq, tin, skb, fq_skb_free_func, fq_flow_get_default_func); @@ -1314,6 +1398,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, { fq_tin_init(&txqi->tin); fq_flow_init(&txqi->def_flow); + codel_vars_init(&txqi->def_cvars); txqi->txq.vif = &sdata->vif; @@ -1342,6 +1427,7 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local) { struct fq *fq = &local->fq; int ret; + int i; if (!local->ops->wake_tx_queue) return 0; @@ -1350,6 +1436,22 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local) if (ret) return ret; + codel_params_init(&local->cparams); + codel_stats_init(&local->cstats); + local->cparams.interval = MS2TIME(100); + local->cparams.target = MS2TIME(20); + local->cparams.ecn = true; + + local->cvars = kcalloc(fq->flows_cnt, sizeof(local->cvars[0]), + GFP_KERNEL); + if (!local->cvars) { + fq_reset(fq, fq_skb_free_func); + return -ENOMEM; + } + + for (i = 0; i < fq->flows_cnt; i++) + codel_vars_init(&local->cvars[i]); + return 0; } @@ -1360,6 +1462,9 @@ void ieee80211_txq_teardown_flows(struct ieee80211_local *local) if (!local->ops->wake_tx_queue) return; + kfree(local->cvars); + local->cvars = NULL; + fq_reset(fq, fq_skb_free_func); } @@ -1382,6 +1487,8 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, if (!skb) goto out; + ieee80211_set_skb_vif(skb, txqi); + hdr = (struct ieee80211_hdr *)skb->data; if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) { struct sta_info *sta = container_of(txq->sta, struct sta_info, -- cgit v1.2.3 From adba931fbc825efca7c821f0d76baed0a8dc9189 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 9 Jun 2016 18:03:34 +0900 Subject: sit: remove unnecessary protocol check in ipip6_tunnel_xmit() ipip6_tunnel_xmit() is called immediately after checking that skb->protocol is htons(ETH_P_IPV6) so there is no need to check it a second time. Found by inspection. Signed-off-by: Simon Horman Reviewed-by: Dinan Gunawardena Signed-off-by: David S. Miller --- net/ipv6/sit.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 0a5a255277e5..d9f2bd6ef72d 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -825,9 +825,6 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, u8 protocol = IPPROTO_IPV6; int t_hlen = tunnel->hlen + sizeof(struct iphdr); - if (skb->protocol != htons(ETH_P_IPV6)) - goto tx_error; - if (tos == 1) tos = ipv6_get_dsfield(iph6); -- cgit v1.2.3 From 52fbb2907988aa0583c6d9d53a56aee090b2df7e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Jun 2016 07:45:11 -0700 Subject: net: sched: fix qdisc->running lockdep annotations 1) qdisc_run_begin() is really using the equivalent of a trylock. Instead of using write_seqcount_begin(), use a combination of raw_write_seqcount_begin() and correct lockdep annotation. 2) sch_direct_xmit() should use regular spin_lock(root_lock) Fixes: f9eb8aea2a1e ("net_sched: transform qdisc running bit into a seqcount") Signed-off-by: Eric Dumazet Reported-by: David Ahern Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index cebea73e70ac..cad810b45e31 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -137,10 +137,10 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, HARD_TX_UNLOCK(dev, txq); } else { - spin_lock_nested(root_lock, SINGLE_DEPTH_NESTING); + spin_lock(root_lock); return qdisc_qlen(q); } - spin_lock_nested(root_lock, SINGLE_DEPTH_NESTING); + spin_lock(root_lock); if (dev_xmit_complete(ret)) { /* Driver sent out skb successfully or skb was consumed */ -- cgit v1.2.3 From d3fff6c443fe8f8a5ef2bdcea45e2ff39db948c7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Jun 2016 07:45:12 -0700 Subject: net: add netdev_lockdep_set_classes() helper It is time to add netdev_lockdep_set_classes() helper so that lockdep annotations per device type are easier to manage. This removes a lot of copies and missing annotations. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/bluetooth/6lowpan.c | 15 +-------------- net/ieee802154/6lowpan/core.c | 16 +--------------- net/l2tp/l2tp_eth.c | 6 +----- 3 files changed, 3 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 977a11e418d0..d020299baba4 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -627,22 +627,9 @@ static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) return err < 0 ? NET_XMIT_DROP : err; } -static struct lock_class_key bt_tx_busylock; -static struct lock_class_key bt_netdev_xmit_lock_key; -static struct lock_class_key bt_qdisc_running_key; - -static void bt_set_lockdep_class_one(struct net_device *dev, - struct netdev_queue *txq, - void *_unused) -{ - lockdep_set_class(&txq->_xmit_lock, &bt_netdev_xmit_lock_key); -} - static int bt_dev_init(struct net_device *dev) { - netdev_for_each_tx_queue(dev, bt_set_lockdep_class_one, NULL); - dev->qdisc_tx_busylock = &bt_tx_busylock; - dev->qdisc_running_key = &bt_qdisc_running_key; + netdev_lockdep_set_classes(dev); return 0; } diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 14aa5effd29a..4e2b30894224 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -58,23 +58,9 @@ static struct header_ops lowpan_header_ops = { .create = lowpan_header_create, }; -static struct lock_class_key lowpan_tx_busylock; -static struct lock_class_key lowpan_netdev_xmit_lock_key; -static struct lock_class_key lowpan_qdisc_running_key; - -static void lowpan_set_lockdep_class_one(struct net_device *ldev, - struct netdev_queue *txq, - void *_unused) -{ - lockdep_set_class(&txq->_xmit_lock, - &lowpan_netdev_xmit_lock_key); -} - static int lowpan_dev_init(struct net_device *ldev) { - netdev_for_each_tx_queue(ldev, lowpan_set_lockdep_class_one, NULL); - ldev->qdisc_tx_busylock = &lowpan_tx_busylock; - ldev->qdisc_running_key = &lowpan_qdisc_running_key; + netdev_lockdep_set_classes(ldev); return 0; } diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index c00d72d182fa..57fc5a46ce06 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -67,9 +67,6 @@ static inline struct l2tp_eth_net *l2tp_eth_pernet(struct net *net) return net_generic(net, l2tp_eth_net_id); } -static struct lock_class_key l2tp_eth_tx_busylock; -static struct lock_class_key l2tp_qdisc_running_key; - static int l2tp_eth_dev_init(struct net_device *dev) { struct l2tp_eth *priv = netdev_priv(dev); @@ -77,8 +74,7 @@ static int l2tp_eth_dev_init(struct net_device *dev) priv->dev = dev; eth_hw_addr_random(dev); eth_broadcast_addr(dev->broadcast); - dev->qdisc_tx_busylock = &l2tp_eth_tx_busylock; - dev->qdisc_running_key = &l2tp_qdisc_running_key; + netdev_lockdep_set_classes(dev); return 0; } -- cgit v1.2.3 From 21aff3b905ad9e5e52b18a755c13fe755bd6ab3d Mon Sep 17 00:00:00 2001 From: Fabien Siron Date: Tue, 7 Jun 2016 13:02:04 +0000 Subject: net/netlink/af_netlink.h: Remove unused structure. Signed-off-by: Fabien Siron Signed-off-by: David S. Miller --- net/netlink/af_netlink.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index e68ef9ccd703..3cfd6cc60504 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -8,20 +8,6 @@ #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) #define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) -struct netlink_ring { - void **pg_vec; - unsigned int head; - unsigned int frames_per_block; - unsigned int frame_size; - unsigned int frame_max; - - unsigned int pg_vec_order; - unsigned int pg_vec_pages; - unsigned int pg_vec_len; - - atomic_t pending; -}; - struct netlink_sock { /* struct sock has to be the first member of netlink_sock */ struct sock sk; -- cgit v1.2.3 From 2341e0775747864b684abe8627f3d45b167f2940 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 9 Jun 2016 23:02:51 +0100 Subject: rxrpc: Simplify connect() implementation and simplify sendmsg() op Simplify the RxRPC connect() implementation. It will just note the destination address it is given, and if a sendmsg() comes along with no address, this will be assigned as the address. No transport struct will be held internally, which will allow us to remove this later. Simplify sendmsg() also. Whilst a call is active, userspace refers to it by a private unique user ID specified in a control message. When sendmsg() sees a user ID that doesn't map to an extant call, it creates a new call for that user ID and attempts to add it. If, when we try to add it, the user ID is now registered, we now reject the message with -EEXIST. We should never see this situation unless two threads are racing, trying to create a call with the same ID - which would be an error. It also isn't required to provide sendmsg() with an address - provided the control message data holds a user ID that maps to a currently active call. Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/af_rxrpc.c | 177 ++++++++++++++++--------------------------- net/rxrpc/ar-call.c | 158 ++++++++++++++++----------------------- net/rxrpc/ar-connection.c | 17 ----- net/rxrpc/ar-internal.h | 22 +++--- net/rxrpc/ar-output.c | 186 ++++++++++++++++++++++------------------------ 5 files changed, 224 insertions(+), 336 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 7840b8e7da80..38512a200db6 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -139,33 +139,33 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) lock_sock(&rx->sk); - if (rx->sk.sk_state != RXRPC_UNCONNECTED) { + if (rx->sk.sk_state != RXRPC_UNBOUND) { ret = -EINVAL; goto error_unlock; } memcpy(&rx->srx, srx, sizeof(rx->srx)); - /* Find or create a local transport endpoint to use */ local = rxrpc_lookup_local(&rx->srx); if (IS_ERR(local)) { ret = PTR_ERR(local); goto error_unlock; } - rx->local = local; - if (srx->srx_service) { + if (rx->srx.srx_service) { write_lock_bh(&local->services_lock); list_for_each_entry(prx, &local->services, listen_link) { - if (prx->srx.srx_service == srx->srx_service) + if (prx->srx.srx_service == rx->srx.srx_service) goto service_in_use; } + rx->local = local; list_add_tail(&rx->listen_link, &local->services); write_unlock_bh(&local->services_lock); rx->sk.sk_state = RXRPC_SERVER_BOUND; } else { + rx->local = local; rx->sk.sk_state = RXRPC_CLIENT_BOUND; } @@ -174,8 +174,9 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) return 0; service_in_use: - ret = -EADDRINUSE; write_unlock_bh(&local->services_lock); + rxrpc_put_local(local); + ret = -EADDRINUSE; error_unlock: release_sock(&rx->sk); error: @@ -197,11 +198,11 @@ static int rxrpc_listen(struct socket *sock, int backlog) lock_sock(&rx->sk); switch (rx->sk.sk_state) { - case RXRPC_UNCONNECTED: + case RXRPC_UNBOUND: ret = -EADDRNOTAVAIL; break; + case RXRPC_CLIENT_UNBOUND: case RXRPC_CLIENT_BOUND: - case RXRPC_CLIENT_CONNECTED: default: ret = -EBUSY; break; @@ -221,20 +222,18 @@ static int rxrpc_listen(struct socket *sock, int backlog) /* * find a transport by address */ -static struct rxrpc_transport *rxrpc_name_to_transport(struct socket *sock, - struct sockaddr *addr, - int addr_len, int flags, - gfp_t gfp) +struct rxrpc_transport *rxrpc_name_to_transport(struct rxrpc_sock *rx, + struct sockaddr *addr, + int addr_len, int flags, + gfp_t gfp) { struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *) addr; struct rxrpc_transport *trans; - struct rxrpc_sock *rx = rxrpc_sk(sock->sk); struct rxrpc_peer *peer; _enter("%p,%p,%d,%d", rx, addr, addr_len, flags); ASSERT(rx->local != NULL); - ASSERT(rx->sk.sk_state > RXRPC_UNCONNECTED); if (rx->srx.transport_type != srx->transport_type) return ERR_PTR(-ESOCKTNOSUPPORT); @@ -256,7 +255,7 @@ static struct rxrpc_transport *rxrpc_name_to_transport(struct socket *sock, /** * rxrpc_kernel_begin_call - Allow a kernel service to begin a call * @sock: The socket on which to make the call - * @srx: The address of the peer to contact (defaults to socket setting) + * @srx: The address of the peer to contact * @key: The security context to use (defaults to socket setting) * @user_call_ID: The ID to use * @@ -282,25 +281,14 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, lock_sock(&rx->sk); - if (srx) { - trans = rxrpc_name_to_transport(sock, (struct sockaddr *) srx, - sizeof(*srx), 0, gfp); - if (IS_ERR(trans)) { - call = ERR_CAST(trans); - trans = NULL; - goto out_notrans; - } - } else { - trans = rx->trans; - if (!trans) { - call = ERR_PTR(-ENOTCONN); - goto out_notrans; - } - atomic_inc(&trans->usage); + trans = rxrpc_name_to_transport(rx, (struct sockaddr *)srx, + sizeof(*srx), 0, gfp); + if (IS_ERR(trans)) { + call = ERR_CAST(trans); + trans = NULL; + goto out_notrans; } - if (!srx) - srx = &rx->srx; if (!key) key = rx->key; if (key && !key->payload.data[0]) @@ -312,8 +300,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, goto out; } - call = rxrpc_get_client_call(rx, trans, bundle, user_call_ID, true, - gfp); + call = rxrpc_new_client_call(rx, trans, bundle, user_call_ID, gfp); rxrpc_put_bundle(trans, bundle); out: rxrpc_put_transport(trans); @@ -369,11 +356,8 @@ EXPORT_SYMBOL(rxrpc_kernel_intercept_rx_messages); static int rxrpc_connect(struct socket *sock, struct sockaddr *addr, int addr_len, int flags) { - struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *) addr; - struct sock *sk = sock->sk; - struct rxrpc_transport *trans; - struct rxrpc_local *local; - struct rxrpc_sock *rx = rxrpc_sk(sk); + struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)addr; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); int ret; _enter("%p,%p,%d,%d", rx, addr, addr_len, flags); @@ -386,45 +370,28 @@ static int rxrpc_connect(struct socket *sock, struct sockaddr *addr, lock_sock(&rx->sk); + ret = -EISCONN; + if (test_bit(RXRPC_SOCK_CONNECTED, &rx->flags)) + goto error; + switch (rx->sk.sk_state) { - case RXRPC_UNCONNECTED: - /* find a local transport endpoint if we don't have one already */ - ASSERTCMP(rx->local, ==, NULL); - rx->srx.srx_family = AF_RXRPC; - rx->srx.srx_service = 0; - rx->srx.transport_type = srx->transport_type; - rx->srx.transport_len = sizeof(sa_family_t); - rx->srx.transport.family = srx->transport.family; - local = rxrpc_lookup_local(&rx->srx); - if (IS_ERR(local)) { - release_sock(&rx->sk); - return PTR_ERR(local); - } - rx->local = local; - rx->sk.sk_state = RXRPC_CLIENT_BOUND; + case RXRPC_UNBOUND: + rx->sk.sk_state = RXRPC_CLIENT_UNBOUND; + case RXRPC_CLIENT_UNBOUND: case RXRPC_CLIENT_BOUND: break; - case RXRPC_CLIENT_CONNECTED: - release_sock(&rx->sk); - return -EISCONN; default: - release_sock(&rx->sk); - return -EBUSY; /* server sockets can't connect as well */ - } - - trans = rxrpc_name_to_transport(sock, addr, addr_len, flags, - GFP_KERNEL); - if (IS_ERR(trans)) { - release_sock(&rx->sk); - _leave(" = %ld", PTR_ERR(trans)); - return PTR_ERR(trans); + ret = -EBUSY; + goto error; } - rx->trans = trans; - rx->sk.sk_state = RXRPC_CLIENT_CONNECTED; + rx->connect_srx = *srx; + set_bit(RXRPC_SOCK_CONNECTED, &rx->flags); + ret = 0; +error: release_sock(&rx->sk); - return 0; + return ret; } /* @@ -438,7 +405,7 @@ static int rxrpc_connect(struct socket *sock, struct sockaddr *addr, */ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len) { - struct rxrpc_transport *trans; + struct rxrpc_local *local; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); int ret; @@ -455,48 +422,38 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len) } } - trans = NULL; lock_sock(&rx->sk); - if (m->msg_name) { - ret = -EISCONN; - trans = rxrpc_name_to_transport(sock, m->msg_name, - m->msg_namelen, 0, GFP_KERNEL); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - } else { - trans = rx->trans; - if (trans) - atomic_inc(&trans->usage); - } - switch (rx->sk.sk_state) { - case RXRPC_SERVER_LISTENING: - if (!m->msg_name) { - ret = rxrpc_server_sendmsg(rx, m, len); - break; + case RXRPC_UNBOUND: + local = rxrpc_lookup_local(&rx->srx); + if (IS_ERR(local)) { + ret = PTR_ERR(local); + goto error_unlock; } - case RXRPC_SERVER_BOUND: + + rx->local = local; + rx->sk.sk_state = RXRPC_CLIENT_UNBOUND; + /* Fall through */ + + case RXRPC_CLIENT_UNBOUND: case RXRPC_CLIENT_BOUND: - if (!m->msg_name) { - ret = -ENOTCONN; - break; + if (!m->msg_name && + test_bit(RXRPC_SOCK_CONNECTED, &rx->flags)) { + m->msg_name = &rx->connect_srx; + m->msg_namelen = sizeof(rx->connect_srx); } - case RXRPC_CLIENT_CONNECTED: - ret = rxrpc_client_sendmsg(rx, trans, m, len); + case RXRPC_SERVER_BOUND: + case RXRPC_SERVER_LISTENING: + ret = rxrpc_do_sendmsg(rx, m, len); break; default: - ret = -ENOTCONN; + ret = -EINVAL; break; } -out: +error_unlock: release_sock(&rx->sk); - if (trans) - rxrpc_put_transport(trans); _leave(" = %d", ret); return ret; } @@ -523,7 +480,7 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, if (optlen != 0) goto error; ret = -EISCONN; - if (rx->sk.sk_state != RXRPC_UNCONNECTED) + if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; set_bit(RXRPC_SOCK_EXCLUSIVE_CONN, &rx->flags); goto success; @@ -533,7 +490,7 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, if (rx->key) goto error; ret = -EISCONN; - if (rx->sk.sk_state != RXRPC_UNCONNECTED) + if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; ret = rxrpc_request_key(rx, optval, optlen); goto error; @@ -543,7 +500,7 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, if (rx->key) goto error; ret = -EISCONN; - if (rx->sk.sk_state != RXRPC_UNCONNECTED) + if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; ret = rxrpc_server_keyring(rx, optval, optlen); goto error; @@ -553,7 +510,7 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, if (optlen != sizeof(unsigned int)) goto error; ret = -EISCONN; - if (rx->sk.sk_state != RXRPC_UNCONNECTED) + if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; ret = get_user(min_sec_level, (unsigned int __user *) optval); @@ -632,7 +589,7 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, return -ENOMEM; sock_init_data(sock, sk); - sk->sk_state = RXRPC_UNCONNECTED; + sk->sk_state = RXRPC_UNBOUND; sk->sk_write_space = rxrpc_write_space; sk->sk_max_ack_backlog = sysctl_rxrpc_max_qlen; sk->sk_destruct = rxrpc_sock_destructor; @@ -705,14 +662,6 @@ static int rxrpc_release_sock(struct sock *sk) rx->conn = NULL; } - if (rx->bundle) { - rxrpc_put_bundle(rx->trans, rx->bundle); - rx->bundle = NULL; - } - if (rx->trans) { - rxrpc_put_transport(rx->trans); - rx->trans = NULL; - } if (rx->local) { rxrpc_put_local(rx->local); rx->local = NULL; diff --git a/net/rxrpc/ar-call.c b/net/rxrpc/ar-call.c index 1fbaae1cba5f..68125dc4cb7c 100644 --- a/net/rxrpc/ar-call.c +++ b/net/rxrpc/ar-call.c @@ -195,6 +195,43 @@ struct rxrpc_call *rxrpc_find_call_hash( return ret; } +/* + * find an extant server call + * - called in process context with IRQs enabled + */ +struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *rx, + unsigned long user_call_ID) +{ + struct rxrpc_call *call; + struct rb_node *p; + + _enter("%p,%lx", rx, user_call_ID); + + read_lock(&rx->call_lock); + + p = rx->calls.rb_node; + while (p) { + call = rb_entry(p, struct rxrpc_call, sock_node); + + if (user_call_ID < call->user_call_ID) + p = p->rb_left; + else if (user_call_ID > call->user_call_ID) + p = p->rb_right; + else + goto found_extant_call; + } + + read_unlock(&rx->call_lock); + _leave(" = NULL"); + return NULL; + +found_extant_call: + rxrpc_get_call(call); + read_unlock(&rx->call_lock); + _leave(" = %p [%d]", call, atomic_read(&call->usage)); + return call; +} + /* * allocate a new call */ @@ -311,51 +348,27 @@ static struct rxrpc_call *rxrpc_alloc_client_call( * set up a call for the given data * - called in process context with IRQs enabled */ -struct rxrpc_call *rxrpc_get_client_call(struct rxrpc_sock *rx, +struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, struct rxrpc_transport *trans, struct rxrpc_conn_bundle *bundle, unsigned long user_call_ID, - int create, gfp_t gfp) { - struct rxrpc_call *call, *candidate; - struct rb_node *p, *parent, **pp; + struct rxrpc_call *call, *xcall; + struct rb_node *parent, **pp; - _enter("%p,%d,%d,%lx,%d", - rx, trans ? trans->debug_id : -1, bundle ? bundle->debug_id : -1, - user_call_ID, create); + _enter("%p,%d,%d,%lx", + rx, trans->debug_id, bundle ? bundle->debug_id : -1, + user_call_ID); - /* search the extant calls first for one that matches the specified - * user ID */ - read_lock(&rx->call_lock); - - p = rx->calls.rb_node; - while (p) { - call = rb_entry(p, struct rxrpc_call, sock_node); - - if (user_call_ID < call->user_call_ID) - p = p->rb_left; - else if (user_call_ID > call->user_call_ID) - p = p->rb_right; - else - goto found_extant_call; + call = rxrpc_alloc_client_call(rx, trans, bundle, gfp); + if (IS_ERR(call)) { + _leave(" = %ld", PTR_ERR(call)); + return call; } - read_unlock(&rx->call_lock); - - if (!create || !trans) - return ERR_PTR(-EBADSLT); - - /* not yet present - create a candidate for a new record and then - * redo the search */ - candidate = rxrpc_alloc_client_call(rx, trans, bundle, gfp); - if (IS_ERR(candidate)) { - _leave(" = %ld", PTR_ERR(candidate)); - return candidate; - } - - candidate->user_call_ID = user_call_ID; - __set_bit(RXRPC_CALL_HAS_USERID, &candidate->flags); + call->user_call_ID = user_call_ID; + __set_bit(RXRPC_CALL_HAS_USERID, &call->flags); write_lock(&rx->call_lock); @@ -363,19 +376,16 @@ struct rxrpc_call *rxrpc_get_client_call(struct rxrpc_sock *rx, parent = NULL; while (*pp) { parent = *pp; - call = rb_entry(parent, struct rxrpc_call, sock_node); + xcall = rb_entry(parent, struct rxrpc_call, sock_node); - if (user_call_ID < call->user_call_ID) + if (user_call_ID < xcall->user_call_ID) pp = &(*pp)->rb_left; - else if (user_call_ID > call->user_call_ID) + else if (user_call_ID > xcall->user_call_ID) pp = &(*pp)->rb_right; else - goto found_extant_second; + goto found_user_ID_now_present; } - /* second search also failed; add the new call */ - call = candidate; - candidate = NULL; rxrpc_get_call(call); rb_link_node(&call->sock_node, parent, pp); @@ -391,20 +401,16 @@ struct rxrpc_call *rxrpc_get_client_call(struct rxrpc_sock *rx, _leave(" = %p [new]", call); return call; - /* we found the call in the list immediately */ -found_extant_call: - rxrpc_get_call(call); - read_unlock(&rx->call_lock); - _leave(" = %p [extant %d]", call, atomic_read(&call->usage)); - return call; - - /* we found the call on the second time through the list */ -found_extant_second: - rxrpc_get_call(call); + /* We unexpectedly found the user ID in the list after taking + * the call_lock. This shouldn't happen unless the user races + * with itself and tries to add the same user ID twice at the + * same time in different threads. + */ +found_user_ID_now_present: write_unlock(&rx->call_lock); - rxrpc_put_call(candidate); - _leave(" = %p [second %d]", call, atomic_read(&call->usage)); - return call; + rxrpc_put_call(call); + _leave(" = -EEXIST [%p]", call); + return ERR_PTR(-EEXIST); } /* @@ -565,46 +571,6 @@ old_call: return ERR_PTR(-ECONNRESET); } -/* - * find an extant server call - * - called in process context with IRQs enabled - */ -struct rxrpc_call *rxrpc_find_server_call(struct rxrpc_sock *rx, - unsigned long user_call_ID) -{ - struct rxrpc_call *call; - struct rb_node *p; - - _enter("%p,%lx", rx, user_call_ID); - - /* search the extant calls for one that matches the specified user - * ID */ - read_lock(&rx->call_lock); - - p = rx->calls.rb_node; - while (p) { - call = rb_entry(p, struct rxrpc_call, sock_node); - - if (user_call_ID < call->user_call_ID) - p = p->rb_left; - else if (user_call_ID > call->user_call_ID) - p = p->rb_right; - else - goto found_extant_call; - } - - read_unlock(&rx->call_lock); - _leave(" = NULL"); - return NULL; - - /* we found the call in the list immediately */ -found_extant_call: - rxrpc_get_call(call); - read_unlock(&rx->call_lock); - _leave(" = %p [%d]", call, atomic_read(&call->usage)); - return call; -} - /* * detach a call from a socket and set up for release */ diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c index d67b1f1b5001..8ecde4b77b55 100644 --- a/net/rxrpc/ar-connection.c +++ b/net/rxrpc/ar-connection.c @@ -80,11 +80,6 @@ struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *rx, _enter("%p{%x},%x,%hx,", rx, key_serial(key), trans->debug_id, service_id); - if (rx->trans == trans && rx->bundle) { - atomic_inc(&rx->bundle->usage); - return rx->bundle; - } - /* search the extant bundles first for one that matches the specified * user ID */ spin_lock(&trans->client_lock); @@ -138,10 +133,6 @@ struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *rx, rb_insert_color(&bundle->node, &trans->bundles); spin_unlock(&trans->client_lock); _net("BUNDLE new on trans %d", trans->debug_id); - if (!rx->bundle && rx->sk.sk_state == RXRPC_CLIENT_CONNECTED) { - atomic_inc(&bundle->usage); - rx->bundle = bundle; - } _leave(" = %p [new]", bundle); return bundle; @@ -150,10 +141,6 @@ found_extant_bundle: atomic_inc(&bundle->usage); spin_unlock(&trans->client_lock); _net("BUNDLE old on trans %d", trans->debug_id); - if (!rx->bundle && rx->sk.sk_state == RXRPC_CLIENT_CONNECTED) { - atomic_inc(&bundle->usage); - rx->bundle = bundle; - } _leave(" = %p [extant %d]", bundle, atomic_read(&bundle->usage)); return bundle; @@ -163,10 +150,6 @@ found_extant_second: spin_unlock(&trans->client_lock); kfree(candidate); _net("BUNDLE old2 on trans %d", trans->debug_id); - if (!rx->bundle && rx->sk.sk_state == RXRPC_CLIENT_CONNECTED) { - atomic_inc(&bundle->usage); - rx->bundle = bundle; - } _leave(" = %p [second %d]", bundle, atomic_read(&bundle->usage)); return bundle; } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 18ab5c50ba87..b89dcdcbc65a 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -39,9 +39,9 @@ struct rxrpc_crypt { * sk_state for RxRPC sockets */ enum { - RXRPC_UNCONNECTED = 0, + RXRPC_UNBOUND = 0, + RXRPC_CLIENT_UNBOUND, /* Unbound socket used as client */ RXRPC_CLIENT_BOUND, /* client local address bound */ - RXRPC_CLIENT_CONNECTED, /* client is connected */ RXRPC_SERVER_BOUND, /* server local address bound */ RXRPC_SERVER_LISTENING, /* server listening for connections */ RXRPC_CLOSE, /* socket is being closed */ @@ -55,8 +55,6 @@ struct rxrpc_sock { struct sock sk; rxrpc_interceptor_t interceptor; /* kernel service Rx interceptor function */ struct rxrpc_local *local; /* local endpoint */ - struct rxrpc_transport *trans; /* transport handler */ - struct rxrpc_conn_bundle *bundle; /* virtual connection bundle */ struct rxrpc_connection *conn; /* exclusive virtual connection */ struct list_head listen_link; /* link in the local endpoint's listen list */ struct list_head secureq; /* calls awaiting connection security clearance */ @@ -65,11 +63,13 @@ struct rxrpc_sock { struct key *securities; /* list of server security descriptors */ struct rb_root calls; /* outstanding calls on this socket */ unsigned long flags; +#define RXRPC_SOCK_CONNECTED 0 /* connect_srx is set */ #define RXRPC_SOCK_EXCLUSIVE_CONN 1 /* exclusive connection for a client socket */ rwlock_t call_lock; /* lock for calls */ u32 min_sec_level; /* minimum security level */ #define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT struct sockaddr_rxrpc srx; /* local address */ + struct sockaddr_rxrpc connect_srx; /* Default client address from connect() */ sa_family_t proto; /* protocol created with */ }; @@ -477,6 +477,10 @@ extern u32 rxrpc_epoch; extern atomic_t rxrpc_debug_id; extern struct workqueue_struct *rxrpc_workqueue; +extern struct rxrpc_transport *rxrpc_name_to_transport(struct rxrpc_sock *, + struct sockaddr *, + int, int, gfp_t); + /* * ar-accept.c */ @@ -502,14 +506,14 @@ extern rwlock_t rxrpc_call_lock; struct rxrpc_call *rxrpc_find_call_hash(struct rxrpc_host_header *, void *, sa_family_t, const void *); -struct rxrpc_call *rxrpc_get_client_call(struct rxrpc_sock *, +struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long); +struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_transport *, struct rxrpc_conn_bundle *, - unsigned long, int, gfp_t); + unsigned long, gfp_t); struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_connection *, struct rxrpc_host_header *); -struct rxrpc_call *rxrpc_find_server_call(struct rxrpc_sock *, unsigned long); void rxrpc_release_call(struct rxrpc_call *); void rxrpc_release_calls_on_socket(struct rxrpc_sock *); void __rxrpc_put_call(struct rxrpc_call *); @@ -581,9 +585,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time_t, extern unsigned int rxrpc_resend_timeout; int rxrpc_send_packet(struct rxrpc_transport *, struct sk_buff *); -int rxrpc_client_sendmsg(struct rxrpc_sock *, struct rxrpc_transport *, - struct msghdr *, size_t); -int rxrpc_server_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t); +int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t); /* * ar-peer.c diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c index ea619535f0ed..2e3c4064e29c 100644 --- a/net/rxrpc/ar-output.c +++ b/net/rxrpc/ar-output.c @@ -32,13 +32,13 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, /* * extract control messages from the sendmsg() control buffer */ -static int rxrpc_sendmsg_cmsg(struct rxrpc_sock *rx, struct msghdr *msg, +static int rxrpc_sendmsg_cmsg(struct msghdr *msg, unsigned long *user_call_ID, enum rxrpc_command *command, - u32 *abort_code, - bool server) + u32 *abort_code) { struct cmsghdr *cmsg; + bool got_user_ID = false; int len; *command = RXRPC_CMD_SEND_DATA; @@ -70,6 +70,7 @@ static int rxrpc_sendmsg_cmsg(struct rxrpc_sock *rx, struct msghdr *msg, CMSG_DATA(cmsg); } _debug("User Call ID %lx", *user_call_ID); + got_user_ID = true; break; case RXRPC_ABORT: @@ -90,8 +91,6 @@ static int rxrpc_sendmsg_cmsg(struct rxrpc_sock *rx, struct msghdr *msg, *command = RXRPC_CMD_ACCEPT; if (len != 0) return -EINVAL; - if (!server) - return -EISCONN; break; default: @@ -99,6 +98,8 @@ static int rxrpc_sendmsg_cmsg(struct rxrpc_sock *rx, struct msghdr *msg, } } + if (!got_user_ID) + return -EINVAL; _leave(" = 0"); return 0; } @@ -125,56 +126,97 @@ static void rxrpc_send_abort(struct rxrpc_call *call, u32 abort_code) write_unlock_bh(&call->state_lock); } +/* + * Create a new client call for sendmsg(). + */ +static struct rxrpc_call * +rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, + unsigned long user_call_ID) +{ + struct rxrpc_conn_bundle *bundle; + struct rxrpc_transport *trans; + struct rxrpc_call *call; + struct key *key; + long ret; + + DECLARE_SOCKADDR(struct sockaddr_rxrpc *, srx, msg->msg_name); + + _enter(""); + + if (!msg->msg_name) + return ERR_PTR(-EDESTADDRREQ); + + trans = rxrpc_name_to_transport(rx, msg->msg_name, msg->msg_namelen, 0, + GFP_KERNEL); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + key = rx->key; + if (key && !rx->key->payload.data[0]) + key = NULL; + bundle = rxrpc_get_bundle(rx, trans, key, srx->srx_service, GFP_KERNEL); + if (IS_ERR(bundle)) { + ret = PTR_ERR(bundle); + goto out_trans; + } + + call = rxrpc_new_client_call(rx, trans, bundle, user_call_ID, + GFP_KERNEL); + rxrpc_put_bundle(trans, bundle); + rxrpc_put_transport(trans); + if (IS_ERR(call)) { + ret = PTR_ERR(call); + goto out_trans; + } + + _leave(" = %p\n", call); + return call; + +out_trans: + rxrpc_put_transport(trans); +out: + _leave(" = %ld", ret); + return ERR_PTR(ret); +} + /* * send a message forming part of a client call through an RxRPC socket * - caller holds the socket locked * - the socket may be either a client socket or a server socket */ -int rxrpc_client_sendmsg(struct rxrpc_sock *rx, struct rxrpc_transport *trans, - struct msghdr *msg, size_t len) +int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) { - struct rxrpc_conn_bundle *bundle; enum rxrpc_command cmd; struct rxrpc_call *call; unsigned long user_call_ID = 0; - struct key *key; - u16 service_id; u32 abort_code = 0; int ret; _enter(""); - ASSERT(trans != NULL); - - ret = rxrpc_sendmsg_cmsg(rx, msg, &user_call_ID, &cmd, &abort_code, - false); + ret = rxrpc_sendmsg_cmsg(msg, &user_call_ID, &cmd, &abort_code); if (ret < 0) return ret; - bundle = NULL; - if (trans) { - service_id = rx->srx.srx_service; - if (msg->msg_name) { - DECLARE_SOCKADDR(struct sockaddr_rxrpc *, srx, - msg->msg_name); - service_id = srx->srx_service; - } - key = rx->key; - if (key && !rx->key->payload.data[0]) - key = NULL; - bundle = rxrpc_get_bundle(rx, trans, key, service_id, - GFP_KERNEL); - if (IS_ERR(bundle)) - return PTR_ERR(bundle); + if (cmd == RXRPC_CMD_ACCEPT) { + if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) + return -EINVAL; + call = rxrpc_accept_call(rx, user_call_ID); + if (IS_ERR(call)) + return PTR_ERR(call); + rxrpc_put_call(call); + return 0; } - call = rxrpc_get_client_call(rx, trans, bundle, user_call_ID, - abort_code == 0, GFP_KERNEL); - if (trans) - rxrpc_put_bundle(trans, bundle); - if (IS_ERR(call)) { - _leave(" = %ld", PTR_ERR(call)); - return PTR_ERR(call); + call = rxrpc_find_call_by_user_ID(rx, user_call_ID); + if (!call) { + if (cmd != RXRPC_CMD_SEND_DATA) + return -EBADSLT; + call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID); + if (IS_ERR(call)) + return PTR_ERR(call); } _debug("CALL %d USR %lx ST %d on CONN %p", @@ -182,14 +224,21 @@ int rxrpc_client_sendmsg(struct rxrpc_sock *rx, struct rxrpc_transport *trans, if (call->state >= RXRPC_CALL_COMPLETE) { /* it's too late for this call */ - ret = -ESHUTDOWN; + ret = -ECONNRESET; } else if (cmd == RXRPC_CMD_SEND_ABORT) { rxrpc_send_abort(call, abort_code); + ret = 0; } else if (cmd != RXRPC_CMD_SEND_DATA) { ret = -EINVAL; - } else if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST) { + } else if (!call->in_clientflag && + call->state != RXRPC_CALL_CLIENT_SEND_REQUEST) { /* request phase complete for this client call */ ret = -EPROTO; + } else if (call->in_clientflag && + call->state != RXRPC_CALL_SERVER_ACK_REQUEST && + call->state != RXRPC_CALL_SERVER_SEND_REPLY) { + /* Reply phase not begun or not complete for service call. */ + ret = -EPROTO; } else { ret = rxrpc_send_data(rx, call, msg, len); } @@ -267,67 +316,6 @@ void rxrpc_kernel_abort_call(struct rxrpc_call *call, u32 abort_code) EXPORT_SYMBOL(rxrpc_kernel_abort_call); -/* - * send a message through a server socket - * - caller holds the socket locked - */ -int rxrpc_server_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) -{ - enum rxrpc_command cmd; - struct rxrpc_call *call; - unsigned long user_call_ID = 0; - u32 abort_code = 0; - int ret; - - _enter(""); - - ret = rxrpc_sendmsg_cmsg(rx, msg, &user_call_ID, &cmd, &abort_code, - true); - if (ret < 0) - return ret; - - if (cmd == RXRPC_CMD_ACCEPT) { - call = rxrpc_accept_call(rx, user_call_ID); - if (IS_ERR(call)) - return PTR_ERR(call); - rxrpc_put_call(call); - return 0; - } - - call = rxrpc_find_server_call(rx, user_call_ID); - if (!call) - return -EBADSLT; - if (call->state >= RXRPC_CALL_COMPLETE) { - ret = -ESHUTDOWN; - goto out; - } - - switch (cmd) { - case RXRPC_CMD_SEND_DATA: - if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST && - call->state != RXRPC_CALL_SERVER_ACK_REQUEST && - call->state != RXRPC_CALL_SERVER_SEND_REPLY) { - /* Tx phase not yet begun for this call */ - ret = -EPROTO; - break; - } - - ret = rxrpc_send_data(rx, call, msg, len); - break; - - case RXRPC_CMD_SEND_ABORT: - rxrpc_send_abort(call, abort_code); - break; - default: - BUG(); - } - - out: - rxrpc_put_call(call); - _leave(" = %d", ret); - return ret; -} - /* * send a packet through the transport endpoint */ -- cgit v1.2.3 From e434863718d4b99dd0d6e0cefd3c5e79e4fa2083 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 9 Jun 2016 10:21:00 -0700 Subject: net: vrf: Fix crash when IPv6 is disabled at boot time Frank Kellermann reported a kernel crash with 4.5.0 when IPv6 is disabled at boot using the kernel option ipv6.disable=1. Using current net-next with the boot option: $ ip link add red type vrf table 1001 Generates: [12210.919584] BUG: unable to handle kernel NULL pointer dereference at 0000000000000748 [12210.921341] IP: [] fib6_get_table+0x2c/0x5a [12210.922537] PGD b79e3067 PUD bb32b067 PMD 0 [12210.923479] Oops: 0000 [#1] SMP [12210.924001] Modules linked in: ipvlan 8021q garp mrp stp llc [12210.925130] CPU: 3 PID: 1177 Comm: ip Not tainted 4.7.0-rc1+ #235 [12210.926168] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014 [12210.928065] task: ffff8800b9ac4640 ti: ffff8800bacac000 task.ti: ffff8800bacac000 [12210.929328] RIP: 0010:[] [] fib6_get_table+0x2c/0x5a [12210.930697] RSP: 0018:ffff8800bacaf888 EFLAGS: 00010202 [12210.931563] RAX: 0000000000000748 RBX: ffffffff81a9e280 RCX: ffff8800b9ac4e28 [12210.932688] RDX: 00000000000000e9 RSI: 0000000000000002 RDI: 0000000000000286 [12210.933820] RBP: ffff8800bacaf898 R08: ffff8800b9ac4df0 R09: 000000000052001b [12210.934941] R10: 00000000657c0000 R11: 000000000000c649 R12: 00000000000003e9 [12210.936032] R13: 00000000000003e9 R14: ffff8800bace7800 R15: ffff8800bb3ec000 [12210.937103] FS: 00007faa1766c700(0000) GS:ffff88013ac00000(0000) knlGS:0000000000000000 [12210.938321] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [12210.939166] CR2: 0000000000000748 CR3: 00000000b79d6000 CR4: 00000000000406e0 [12210.940278] Stack: [12210.940603] ffff8800bb3ec000 ffffffff81a9e280 ffff8800bacaf8c8 ffffffff814b3135 [12210.941818] ffff8800bb3ec000 ffffffff81a9e280 ffffffff81a9e280 ffff8800bace7800 [12210.943040] ffff8800bacaf8f0 ffffffff81397c88 ffff8800bb3ec000 ffffffff81a9e280 [12210.944288] Call Trace: [12210.944688] [] fib6_new_table+0x24/0x8a [12210.945516] [] vrf_dev_init+0xd4/0x162 [12210.946328] [] register_netdevice+0x100/0x396 [12210.947209] [] vrf_newlink+0x40/0xb3 [12210.948001] [] rtnl_newlink+0x5d3/0x6d5 ... The problem above is due to the fact that the fib hash table is not allocated when IPv6 is disabled at boot. As for the VRF driver it should not do any IPv6 initializations if IPv6 is disabled, so it needs to know if IPv6 is disabled at boot. The disable parameter is private to the IPv6 module, so provide an accessor for modules to determine if IPv6 was disabled at boot time. Fixes: 35402e3136634 ("net: Add IPv6 support to VRF device") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/af_inet6.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index bfa86f040c16..2076c21107d0 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -92,6 +92,12 @@ MODULE_PARM_DESC(disable_ipv6, "Disable IPv6 on all interfaces"); module_param_named(autoconf, ipv6_defaults.autoconf, int, 0444); MODULE_PARM_DESC(autoconf, "Enable IPv6 address autoconfiguration on all interfaces"); +bool ipv6_mod_enabled(void) +{ + return disable_ipv6_mod == 0; +} +EXPORT_SYMBOL_GPL(ipv6_mod_enabled); + static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) { const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo); -- cgit v1.2.3 From f2a4d086ed4c588d32fe9b7aa67fead7280e7bf1 Mon Sep 17 00:00:00 2001 From: William Tu Date: Fri, 10 Jun 2016 11:49:33 -0700 Subject: openvswitch: Add packet truncation support. The patch adds a new OVS action, OVS_ACTION_ATTR_TRUNC, in order to truncate packets. A 'max_len' is added for setting up the maximum packet size, and a 'cutlen' field is to record the number of bytes to trim the packet when the packet is outputting to a port, or when the packet is sent to userspace. Signed-off-by: William Tu Cc: Pravin Shelar Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/actions.c | 40 ++++++++++++++++++++++++++++++++++++---- net/openvswitch/datapath.c | 29 +++++++++++++++++------------ net/openvswitch/datapath.h | 5 ++++- net/openvswitch/flow_netlink.c | 9 +++++++++ net/openvswitch/vport.c | 1 + 5 files changed, 67 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 9a3eb7a0ebf4..1ecbd7715f6d 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -750,6 +750,14 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, if (likely(vport)) { u16 mru = OVS_CB(skb)->mru; + u32 cutlen = OVS_CB(skb)->cutlen; + + if (unlikely(cutlen > 0)) { + if (skb->len - cutlen > ETH_HLEN) + pskb_trim(skb, skb->len - cutlen); + else + pskb_trim(skb, ETH_HLEN); + } if (likely(!mru || (skb->len <= mru + ETH_HLEN))) { ovs_vport_send(vport, skb); @@ -775,7 +783,8 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, static int output_userspace(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, - const struct nlattr *actions, int actions_len) + const struct nlattr *actions, int actions_len, + uint32_t cutlen) { struct dp_upcall_info upcall; const struct nlattr *a; @@ -822,7 +831,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, } /* End of switch. */ } - return ovs_dp_upcall(dp, skb, key, &upcall); + return ovs_dp_upcall(dp, skb, key, &upcall, cutlen); } static int sample(struct datapath *dp, struct sk_buff *skb, @@ -832,6 +841,7 @@ static int sample(struct datapath *dp, struct sk_buff *skb, const struct nlattr *acts_list = NULL; const struct nlattr *a; int rem; + u32 cutlen = 0; for (a = nla_data(attr), rem = nla_len(attr); rem > 0; a = nla_next(a, &rem)) { @@ -858,13 +868,24 @@ static int sample(struct datapath *dp, struct sk_buff *skb, return 0; /* The only known usage of sample action is having a single user-space + * action, or having a truncate action followed by a single user-space * action. Treat this usage as a special case. * The output_userspace() should clone the skb to be sent to the * user space. This skb will be consumed by its caller. */ + if (unlikely(nla_type(a) == OVS_ACTION_ATTR_TRUNC)) { + struct ovs_action_trunc *trunc = nla_data(a); + + if (skb->len > trunc->max_len) + cutlen = skb->len - trunc->max_len; + + a = nla_next(a, &rem); + } + if (likely(nla_type(a) == OVS_ACTION_ATTR_USERSPACE && nla_is_last(a, rem))) - return output_userspace(dp, skb, key, a, actions, actions_len); + return output_userspace(dp, skb, key, a, actions, + actions_len, cutlen); skb = skb_clone(skb, GFP_ATOMIC); if (!skb) @@ -1051,6 +1072,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, if (out_skb) do_output(dp, out_skb, prev_port, key); + OVS_CB(skb)->cutlen = 0; prev_port = -1; } @@ -1059,8 +1081,18 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, prev_port = nla_get_u32(a); break; + case OVS_ACTION_ATTR_TRUNC: { + struct ovs_action_trunc *trunc = nla_data(a); + + if (skb->len > trunc->max_len) + OVS_CB(skb)->cutlen = skb->len - trunc->max_len; + break; + } + case OVS_ACTION_ATTR_USERSPACE: - output_userspace(dp, skb, key, a, attr, len); + output_userspace(dp, skb, key, a, attr, + len, OVS_CB(skb)->cutlen); + OVS_CB(skb)->cutlen = 0; break; case OVS_ACTION_ATTR_HASH: diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 856bd8dba676..673934295333 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -137,10 +137,12 @@ EXPORT_SYMBOL_GPL(lockdep_ovsl_is_held); static struct vport *new_vport(const struct vport_parms *); static int queue_gso_packets(struct datapath *dp, struct sk_buff *, const struct sw_flow_key *, - const struct dp_upcall_info *); + const struct dp_upcall_info *, + uint32_t cutlen); static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, const struct sw_flow_key *, - const struct dp_upcall_info *); + const struct dp_upcall_info *, + uint32_t cutlen); /* Must be called with rcu_read_lock. */ static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex) @@ -275,7 +277,7 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) upcall.cmd = OVS_PACKET_CMD_MISS; upcall.portid = ovs_vport_find_upcall_portid(p, skb); upcall.mru = OVS_CB(skb)->mru; - error = ovs_dp_upcall(dp, skb, key, &upcall); + error = ovs_dp_upcall(dp, skb, key, &upcall, 0); if (unlikely(error)) kfree_skb(skb); else @@ -300,7 +302,8 @@ out: int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, - const struct dp_upcall_info *upcall_info) + const struct dp_upcall_info *upcall_info, + uint32_t cutlen) { struct dp_stats_percpu *stats; int err; @@ -311,9 +314,9 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, } if (!skb_is_gso(skb)) - err = queue_userspace_packet(dp, skb, key, upcall_info); + err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); else - err = queue_gso_packets(dp, skb, key, upcall_info); + err = queue_gso_packets(dp, skb, key, upcall_info, cutlen); if (err) goto err; @@ -331,7 +334,8 @@ err: static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, - const struct dp_upcall_info *upcall_info) + const struct dp_upcall_info *upcall_info, + uint32_t cutlen) { unsigned short gso_type = skb_shinfo(skb)->gso_type; struct sw_flow_key later_key; @@ -360,7 +364,7 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, if (gso_type & SKB_GSO_UDP && skb != segs) key = &later_key; - err = queue_userspace_packet(dp, skb, key, upcall_info); + err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); if (err) break; @@ -416,7 +420,8 @@ static void pad_packet(struct datapath *dp, struct sk_buff *skb) static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, - const struct dp_upcall_info *upcall_info) + const struct dp_upcall_info *upcall_info, + uint32_t cutlen) { struct ovs_header *upcall; struct sk_buff *nskb = NULL; @@ -461,7 +466,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, else hlen = skb->len; - len = upcall_msg_size(upcall_info, hlen); + len = upcall_msg_size(upcall_info, hlen - cutlen); user_skb = genlmsg_new(len, GFP_ATOMIC); if (!user_skb) { err = -ENOMEM; @@ -515,9 +520,9 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, err = -ENOBUFS; goto out; } - nla->nla_len = nla_attr_size(skb->len); + nla->nla_len = nla_attr_size(skb->len - cutlen); - err = skb_zerocopy(user_skb, skb, skb->len, hlen); + err = skb_zerocopy(user_skb, skb, skb->len - cutlen, hlen); if (err) goto out; diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 427e39a045cf..ab85c1cae255 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -100,11 +100,13 @@ struct datapath { * @input_vport: The original vport packet came in on. This value is cached * when a packet is received by OVS. * @mru: The maximum received fragement size; 0 if the packet is not + * @cutlen: The number of bytes from the packet end to be removed. * fragmented. */ struct ovs_skb_cb { struct vport *input_vport; u16 mru; + u32 cutlen; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) @@ -194,7 +196,8 @@ extern struct genl_family dp_vport_genl_family; void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key); void ovs_dp_detach_port(struct vport *); int ovs_dp_upcall(struct datapath *, struct sk_buff *, - const struct sw_flow_key *, const struct dp_upcall_info *); + const struct sw_flow_key *, const struct dp_upcall_info *, + uint32_t cutlen); const char *ovs_dp_name(const struct datapath *dp); struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 0bb650f4f219..c78a6a1476fb 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2229,6 +2229,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash), [OVS_ACTION_ATTR_CT] = (u32)-1, + [OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc), }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -2255,6 +2256,14 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, return -EINVAL; break; + case OVS_ACTION_ATTR_TRUNC: { + const struct ovs_action_trunc *trunc = nla_data(a); + + if (trunc->max_len < ETH_HLEN) + return -EINVAL; + break; + } + case OVS_ACTION_ATTR_HASH: { const struct ovs_action_hash *act_hash = nla_data(a); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 31cbc8c5c7db..6b21fd068d87 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -444,6 +444,7 @@ int ovs_vport_receive(struct vport *vport, struct sk_buff *skb, OVS_CB(skb)->input_vport = vport; OVS_CB(skb)->mru = 0; + OVS_CB(skb)->cutlen = 0; if (unlikely(dev_net(skb->dev) != ovs_dp_get_net(vport->dp))) { u32 mark; -- cgit v1.2.3 From a70b506efe899dc8d650eafcc0b11fc9ee746627 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 10 Jun 2016 21:19:06 +0200 Subject: bpf: enforce recursion limit on redirects Respect the stack's xmit_recursion limit for calls into dev_queue_xmit(). Currently, they are not handeled by the limiter when attached to clsact's egress parent, for example, and a buggy program redirecting it to the same device again could run into stack overflow eventually. It would be good if we could notify an admin to give him a chance to react. We reuse xmit_recursion instead of having one private to eBPF, so that the stack's current recursion depth will be taken into account as well. Follow-up to commit 3896d655f4d4 ("bpf: introduce bpf_clone_redirect() helper") and 27b29f63058d ("bpf: add bpf_redirect() helper"). Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/dev.c | 6 ++---- net/core/filter.c | 55 ++++++++++++++++++++++++++++++++++--------------------- 2 files changed, 36 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index c43c9d2a88cf..b14835757141 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3144,8 +3144,6 @@ static void skb_update_prio(struct sk_buff *skb) DEFINE_PER_CPU(int, xmit_recursion); EXPORT_SYMBOL(xmit_recursion); -#define RECURSION_LIMIT 10 - /** * dev_loopback_xmit - loop back @skb * @net: network namespace this loopback is happening in @@ -3388,8 +3386,8 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) int cpu = smp_processor_id(); /* ok because BHs are off */ if (txq->xmit_lock_owner != cpu) { - - if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) + if (unlikely(__this_cpu_read(xmit_recursion) > + XMIT_RECURSION_LIMIT)) goto recursion_alert; skb = validate_xmit_skb(skb, dev); diff --git a/net/core/filter.c b/net/core/filter.c index 68adb5f52110..d11744d10e00 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1603,9 +1603,36 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .arg5_type = ARG_ANYTHING, }; +static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) +{ + if (skb_at_tc_ingress(skb)) + skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len); + + return dev_forward_skb(dev, skb); +} + +static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) +{ + int ret; + + if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) { + net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); + kfree_skb(skb); + return -ENETDOWN; + } + + skb->dev = dev; + + __this_cpu_inc(xmit_recursion); + ret = dev_queue_xmit(skb); + __this_cpu_dec(xmit_recursion); + + return ret; +} + static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) { - struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2; + struct sk_buff *skb = (struct sk_buff *) (long) r1; struct net_device *dev; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ -1615,19 +1642,12 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) if (unlikely(!dev)) return -EINVAL; - skb2 = skb_clone(skb, GFP_ATOMIC); - if (unlikely(!skb2)) + skb = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!skb)) return -ENOMEM; - if (flags & BPF_F_INGRESS) { - if (skb_at_tc_ingress(skb2)) - skb_postpush_rcsum(skb2, skb_mac_header(skb2), - skb2->mac_len); - return dev_forward_skb(dev, skb2); - } - - skb2->dev = dev; - return dev_queue_xmit(skb2); + return flags & BPF_F_INGRESS ? + __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); } static const struct bpf_func_proto bpf_clone_redirect_proto = { @@ -1671,15 +1691,8 @@ int skb_do_redirect(struct sk_buff *skb) return -EINVAL; } - if (ri->flags & BPF_F_INGRESS) { - if (skb_at_tc_ingress(skb)) - skb_postpush_rcsum(skb, skb_mac_header(skb), - skb->mac_len); - return dev_forward_skb(dev, skb); - } - - skb->dev = dev; - return dev_queue_xmit(skb); + return ri->flags & BPF_F_INGRESS ? + __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); } static const struct bpf_func_proto bpf_redirect_proto = { -- cgit v1.2.3 From f7bd9e36ee4a4ce38e1cddd7effe6c0d9943285b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 10 Jun 2016 21:19:07 +0200 Subject: bpf: reject wrong sized filters earlier Add a bpf_check_basics_ok() and reject filters that are of invalid size much earlier, so we don't do any useless work such as invoking bpf_prog_alloc(). Currently, rejection happens in bpf_check_classic() only, but it's really unnecessarily late and they should be rejected at earliest point. While at it, also clean up one bpf_prog_size() to make it consistent with the remaining invocations. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index d11744d10e00..df6860c85d72 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -748,6 +748,17 @@ static bool chk_code_allowed(u16 code_to_probe) return codes[code_to_probe]; } +static bool bpf_check_basics_ok(const struct sock_filter *filter, + unsigned int flen) +{ + if (filter == NULL) + return false; + if (flen == 0 || flen > BPF_MAXINSNS) + return false; + + return true; +} + /** * bpf_check_classic - verify socket filter code * @filter: filter to verify @@ -768,9 +779,6 @@ static int bpf_check_classic(const struct sock_filter *filter, bool anc_found; int pc; - if (flen == 0 || flen > BPF_MAXINSNS) - return -EINVAL; - /* Check the filter code now */ for (pc = 0; pc < flen; pc++) { const struct sock_filter *ftest = &filter[pc]; @@ -1065,7 +1073,7 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) struct bpf_prog *fp; /* Make sure new filter is there and in the right amounts. */ - if (fprog->filter == NULL) + if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return -EINVAL; fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); @@ -1112,7 +1120,7 @@ int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, int err; /* Make sure new filter is there and in the right amounts. */ - if (fprog->filter == NULL) + if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return -EINVAL; fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); @@ -1207,7 +1215,6 @@ static struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) { unsigned int fsize = bpf_classic_proglen(fprog); - unsigned int bpf_fsize = bpf_prog_size(fprog->len); struct bpf_prog *prog; int err; @@ -1215,10 +1222,10 @@ struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) return ERR_PTR(-EPERM); /* Make sure new filter is there and in the right amounts. */ - if (fprog->filter == NULL) + if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return ERR_PTR(-EINVAL); - prog = bpf_prog_alloc(bpf_fsize, 0); + prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!prog) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From ea7f8277f9076d71ed6a925e2835ef4b85d6f5e1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 10 Jun 2016 23:10:22 +0200 Subject: net, cls: allow for deleting all filters for given parent Add a possibility where the user can just specify the parent and all filters under that parent are then being purged. Currently, for example for scripting, one needs to specify pref/prio to have a well-defined number for 'tc filter del' command for addressing the previously created instance or additionally filter handle in case of priorities being the same. Improve usage by allowing the option for tc to specify the parent and removing the whole chain for that given parent. Example usage after patch, no tc changes required: # tc qdisc replace dev foo clsact # tc filter add dev foo egress bpf da obj ./bpf.o # tc filter add dev foo egress bpf da obj ./bpf.o # tc filter show dev foo egress filter protocol all pref 49151 bpf filter protocol all pref 49151 bpf handle 0x1 bpf.o:[classifier] direct-action filter protocol all pref 49152 bpf filter protocol all pref 49152 bpf handle 0x1 bpf.o:[classifier] direct-action # tc filter del dev foo egress # tc filter show dev foo egress # Previously, RTM_DELTFILTER requests with invalid prio of 0 were rejected, so only netlink requests with RTM_NEWTFILTER and NLM_F_CREATE flag were allowed where the kernel would auto-generate a pref/prio. We can piggyback on that and use prio of 0 as a wildcard for requests of RTM_DELTFILTER. For notifying tc netlink monitoring users (e.g. libnl uses this for caching), there are two options, that is, sending individual tfilter_notify() notifications for each tcf_proto, or sending a single one indicating wildcard removal. I tried both and there are pros and cons for each, eventually I decided for sending individual tfilter_notify(), so that user space can support this seamlessly and there won't be a mess of changing each and every application to make sure expectations from the kernel won't break when they don't understand single notification. Since linear chains don't really scale, I expect only a handful of classifiers to be attached at max for a given parent anyway. Signed-off-by: Daniel Borkmann Acked-by: Jamal Hadi Salim Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/sched/cls_api.c | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index aafa6bce173e..cca1ef5e5476 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -103,6 +103,17 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, unsigned long fh, int event); +static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, + struct tcf_proto __rcu **chain, int event) +{ + struct tcf_proto __rcu **it_chain; + struct tcf_proto *tp; + + for (it_chain = chain; (tp = rtnl_dereference(*it_chain)) != NULL; + it_chain = &tp->next) + tfilter_notify(net, oskb, n, tp, 0, event); +} /* Select new prio value from the range, managed by kernel. */ @@ -156,11 +167,23 @@ replay: cl = 0; if (prio == 0) { - /* If no priority is given, user wants we allocated it. */ - if (n->nlmsg_type != RTM_NEWTFILTER || - !(n->nlmsg_flags & NLM_F_CREATE)) + switch (n->nlmsg_type) { + case RTM_DELTFILTER: + if (protocol || t->tcm_handle) + return -ENOENT; + break; + case RTM_NEWTFILTER: + /* If no priority is provided by the user, + * we allocate one. + */ + if (n->nlmsg_flags & NLM_F_CREATE) { + prio = TC_H_MAKE(0x80000000U, 0U); + break; + } + /* fall-through */ + default: return -ENOENT; - prio = TC_H_MAKE(0x80000000U, 0U); + } } /* Find head of filter chain. */ @@ -200,6 +223,12 @@ replay: err = -EINVAL; if (chain == NULL) goto errout; + if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) { + tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER); + tcf_destroy_chain(chain); + err = 0; + goto errout; + } /* Check the chain for existence of proto-tcf with this priority */ for (back = chain; -- cgit v1.2.3 From bc6e1ea32c26ead06063a882e802fff7ab6535c2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 10 Jun 2016 22:30:27 +0100 Subject: rxrpc: Trim line-terminal whitespace Trim line-terminal whitespace in net/rxrpc/ Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/ar-input.c | 2 +- net/rxrpc/ar-local.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c index d7c2a0bc839e..e0815a033999 100644 --- a/net/rxrpc/ar-input.c +++ b/net/rxrpc/ar-input.c @@ -734,7 +734,7 @@ void rxrpc_data_ready(struct sock *sk) rxrpc_post_packet_to_local(local, skb); goto out; } - + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && (sp->hdr.callNumber == 0 || sp->hdr.seq == 0)) goto bad_message; diff --git a/net/rxrpc/ar-local.c b/net/rxrpc/ar-local.c index 701c42b7050e..111f250b045f 100644 --- a/net/rxrpc/ar-local.c +++ b/net/rxrpc/ar-local.c @@ -388,7 +388,7 @@ static void rxrpc_process_local_events(struct work_struct *work) _enter(""); atomic_inc(&local->usage); - + while ((skb = skb_dequeue(&local->event_queue))) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); -- cgit v1.2.3 From 0e119b41b7f23e08799fa8b1c9c1360d7da75815 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 10 Jun 2016 22:30:37 +0100 Subject: rxrpc: Limit the listening backlog Limit the socket incoming call backlog queue size so that a remote client can't pump in sufficient new calls that the server runs out of memory. Note that this is partially theoretical at the moment since whilst the number of calls is limited, the number of packets trying to set up new calls is not. This will be addressed in a later patch. If the caller of listen() specifies a backlog INT_MAX, then they get the current maximum; anything else greater than max_backlog or anything negative incurs EINVAL. The limit on the maximum queue size can be set by: echo N >/proc/sys/net/rxrpc/max_backlog where 4<=N<=32. Further, set the default backlog to 0, requiring listen() to be called before we start actually queueing new calls. Whilst this kind of is a change in the UAPI, the caller can't actually *accept* new calls anyway unless they've first called listen() to put the socket into the LISTENING state - thus the aforementioned new calls would otherwise just sit there, eating up kernel memory. (Note that sockets that don't have a non-zero service ID bound don't get incoming calls anyway.) Given that the default backlog is now 0, make the AFS filesystem call kernel_listen() to set the maximum backlog for itself. Possible improvements include: (1) Trimming a too-large backlog to max_backlog when listen is called. (2) Trimming the backlog value whenever the value is used so that changes to max_backlog are applied to an open socket automatically. Note that the AFS filesystem opens one socket and keeps it open for extended periods, so would miss out on changes to max_backlog. (3) Having a separate setting for the AFS filesystem. Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/af_rxrpc.c | 19 +++++++++++-------- net/rxrpc/ar-internal.h | 1 + net/rxrpc/misc.c | 6 ++++++ net/rxrpc/sysctl.c | 10 ++++++++++ 4 files changed, 28 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 38512a200db6..a1bcb0e17250 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -33,8 +33,6 @@ unsigned int rxrpc_debug; // = RXRPC_DEBUG_KPROTO; module_param_named(debug, rxrpc_debug, uint, S_IWUSR | S_IRUGO); MODULE_PARM_DESC(debug, "RxRPC debugging mask"); -static int sysctl_rxrpc_max_qlen __read_mostly = 10; - static struct proto rxrpc_proto; static const struct proto_ops rxrpc_rpc_ops; @@ -191,6 +189,7 @@ static int rxrpc_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; struct rxrpc_sock *rx = rxrpc_sk(sk); + unsigned int max; int ret; _enter("%p,%d", rx, backlog); @@ -201,17 +200,21 @@ static int rxrpc_listen(struct socket *sock, int backlog) case RXRPC_UNBOUND: ret = -EADDRNOTAVAIL; break; - case RXRPC_CLIENT_UNBOUND: - case RXRPC_CLIENT_BOUND: - default: - ret = -EBUSY; - break; case RXRPC_SERVER_BOUND: ASSERT(rx->local != NULL); + max = READ_ONCE(rxrpc_max_backlog); + ret = -EINVAL; + if (backlog == INT_MAX) + backlog = max; + else if (backlog < 0 || backlog > max) + break; sk->sk_max_ack_backlog = backlog; rx->sk.sk_state = RXRPC_SERVER_LISTENING; ret = 0; break; + default: + ret = -EBUSY; + break; } release_sock(&rx->sk); @@ -591,7 +594,7 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, sock_init_data(sock, sk); sk->sk_state = RXRPC_UNBOUND; sk->sk_write_space = rxrpc_write_space; - sk->sk_max_ack_backlog = sysctl_rxrpc_max_qlen; + sk->sk_max_ack_backlog = 0; sk->sk_destruct = rxrpc_sock_destructor; rx = rxrpc_sk(sk); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index b89dcdcbc65a..f715cca767cd 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -641,6 +641,7 @@ extern const struct rxrpc_security rxrpc_no_security; /* * misc.c */ +extern unsigned int rxrpc_max_backlog __read_mostly; extern unsigned int rxrpc_requested_ack_delay; extern unsigned int rxrpc_soft_ack_delay; extern unsigned int rxrpc_idle_ack_delay; diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 1afe9876e79f..bdc5e42fe600 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -14,6 +14,12 @@ #include #include "ar-internal.h" +/* + * The maximum listening backlog queue size that may be set on a socket by + * listen(). + */ +unsigned int rxrpc_max_backlog __read_mostly = 10; + /* * How long to wait before scheduling ACK generation after seeing a * packet with RXRPC_REQUEST_ACK set (in jiffies). diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index d20ed575acf4..a99690a8a3da 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -18,6 +18,7 @@ static struct ctl_table_header *rxrpc_sysctl_reg_table; static const unsigned int zero = 0; static const unsigned int one = 1; static const unsigned int four = 4; +static const unsigned int thirtytwo = 32; static const unsigned int n_65535 = 65535; static const unsigned int n_max_acks = RXRPC_MAXACKS; @@ -99,6 +100,15 @@ static struct ctl_table rxrpc_sysctl_table[] = { }, /* Non-time values */ + { + .procname = "max_backlog", + .data = &rxrpc_max_backlog, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&four, + .extra2 = (void *)&thirtytwo, + }, { .procname = "rx_window_size", .data = &rxrpc_rx_window_size, -- cgit v1.2.3 From 231edca97f947f50e18fa2f6d9c8285c3314232b Mon Sep 17 00:00:00 2001 From: Bhaktipriya Shridhar Date: Wed, 8 Jun 2016 01:03:45 +0530 Subject: RDS: IB: Remove deprecated create_workqueue alloc_workqueue replaces deprecated create_workqueue(). Since the driver is infiniband which can be used as block device and the workqueue seems involved in regular operation of the device, so a dedicated workqueue has been used with WQ_MEM_RECLAIM set to guarantee forward progress under memory pressure. Since there are only a fixed number of work items, explicit concurrency limit is unnecessary here. Signed-off-by: Bhaktipriya Shridhar Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index f7164ac1ffc1..a0f21b65a83c 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -618,7 +618,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, int rds_ib_mr_init(void) { - rds_ib_mr_wq = create_workqueue("rds_mr_flushd"); + rds_ib_mr_wq = alloc_workqueue("rds_mr_flushd", WQ_MEM_RECLAIM, 0); if (!rds_ib_mr_wq) return -ENOMEM; return 0; -- cgit v1.2.3 From 1276f24eeef207e5572649d069ba1b531b2e2c3a Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 8 Jun 2016 16:09:22 +0300 Subject: packet: use common code for virtio_net_hdr and skb GSO conversion Replace open coded conversion between virtio_net_hdr to skb GSO info with virtio_net_hdr_from_skb Signed-off-by: Mike Rapoport Signed-off-by: David S. Miller --- net/packet/af_packet.c | 36 ++---------------------------------- 1 file changed, 2 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 9bff6ef16fa7..d1f3b9e977e5 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1979,40 +1979,8 @@ static int __packet_rcv_vnet(const struct sk_buff *skb, { *vnet_hdr = (const struct virtio_net_hdr) { 0 }; - if (skb_is_gso(skb)) { - struct skb_shared_info *sinfo = skb_shinfo(skb); - - /* This is a hint as to how much should be linear. */ - vnet_hdr->hdr_len = - __cpu_to_virtio16(vio_le(), skb_headlen(skb)); - vnet_hdr->gso_size = - __cpu_to_virtio16(vio_le(), sinfo->gso_size); - - if (sinfo->gso_type & SKB_GSO_TCPV4) - vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; - else if (sinfo->gso_type & SKB_GSO_TCPV6) - vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; - else if (sinfo->gso_type & SKB_GSO_UDP) - vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP; - else if (sinfo->gso_type & SKB_GSO_FCOE) - return -EINVAL; - else - BUG(); - - if (sinfo->gso_type & SKB_GSO_TCP_ECN) - vnet_hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN; - } else - vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; - - if (skb->ip_summed == CHECKSUM_PARTIAL) { - vnet_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - vnet_hdr->csum_start = __cpu_to_virtio16(vio_le(), - skb_checksum_start_offset(skb)); - vnet_hdr->csum_offset = __cpu_to_virtio16(vio_le(), - skb->csum_offset); - } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) { - vnet_hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID; - } /* else everything is zero */ + if (virtio_net_hdr_from_skb(skb, vnet_hdr, vio_le())) + BUG(); return 0; } -- cgit v1.2.3 From 6f094b9ec680209c5b7314feee983b2f4c910b1b Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Wed, 8 Jun 2016 21:16:44 -0700 Subject: tcp: add in_flight to tcp_skb_cb Add in_flight (bytes in flight when packet was sent) field to tx component of tcp_skb_cb and make it available to congestion modules' pkts_acked() function through the ack_sample function argument. Signed-off-by: Lawrence Brakmo Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 5 ++++- net/ipv4/tcp_output.c | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 89dd8d82826f..94d4aff97523 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3115,6 +3115,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, long ca_rtt_us = -1L; struct sk_buff *skb; u32 pkts_acked = 0; + u32 last_in_flight = 0; bool rtt_update; int flag = 0; @@ -3154,6 +3155,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (!first_ackt.v64) first_ackt = last_ackt; + last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; reord = min(pkts_acked, reord); if (!after(scb->end_seq, tp->high_seq)) flag |= FLAG_ORIG_SACK_ACKED; @@ -3250,7 +3252,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (icsk->icsk_ca_ops->pkts_acked) { struct ack_sample sample = { .pkts_acked = pkts_acked, - .rtt_us = ca_rtt_us }; + .rtt_us = ca_rtt_us, + .in_flight = last_in_flight }; icsk->icsk_ca_ops->pkts_acked(sk, &sample); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8bd9911fdd16..b1bcba0563f2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -911,9 +911,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, int err; BUG_ON(!skb || !tcp_skb_pcount(skb)); + tp = tcp_sk(sk); if (clone_it) { skb_mstamp_get(&skb->skb_mstamp); + TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq + - tp->snd_una; if (unlikely(skb_cloned(skb))) skb = pskb_copy(skb, gfp_mask); @@ -924,7 +927,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, } inet = inet_sk(sk); - tp = tcp_sk(sk); tcb = TCP_SKB_CB(skb); memset(&opts, 0, sizeof(opts)); -- cgit v1.2.3 From 699fafafab6d765f12367b3ce0816e64ae19d1e8 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Wed, 8 Jun 2016 21:16:45 -0700 Subject: tcp: add NV congestion control TCP-NV (New Vegas) is a major update to TCP-Vegas. An earlier version of NV was presented at 2010's LPC. It is a delayed based congestion avoidance for the data center. This version has been tested within a 10G rack where the HW RTTs are 20-50us and with 1 to 400 flows. A description of TCP-NV, including implementation details as well as experimental results, can be found at: http://www.brakmo.org/networking/tcp-nv/TCPNV.html Signed-off-by: Lawrence Brakmo Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 16 ++ net/ipv4/Makefile | 1 + net/ipv4/tcp_nv.c | 476 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 493 insertions(+) create mode 100644 net/ipv4/tcp_nv.c (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 238225b0c970..50d6a9b49f6c 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -532,6 +532,22 @@ config TCP_CONG_VEGAS window. TCP Vegas should provide less packet loss, but it is not as aggressive as TCP Reno. +config TCP_CONG_NV + tristate "TCP NV" + default n + ---help--- + TCP NV is a follow up to TCP Vegas. It has been modified to deal with + 10G networks, measurement noise introduced by LRO, GRO and interrupt + coalescence. In addition, it will decrease its cwnd multiplicatively + instead of linearly. + + Note that in general congestion avoidance (cwnd decreased when # packets + queued grows) cannot coexist with congestion control (cwnd decreased only + when there is packet loss) due to fairness issues. One scenario when they + can coexist safely is when the CA flows have RTTs << CC flows RTTs. + + For further details see http://www.brakmo.org/networking/tcp-nv/ + config TCP_CONG_SCALABLE tristate "Scalable TCP" default n diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index bfa133691cde..24629b6f57cc 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -50,6 +50,7 @@ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o +obj-$(CONFIG_TCP_CONG_NV) += tcp_nv.o obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c new file mode 100644 index 000000000000..5de82a8d4d87 --- /dev/null +++ b/net/ipv4/tcp_nv.c @@ -0,0 +1,476 @@ +/* + * TCP NV: TCP with Congestion Avoidance + * + * TCP-NV is a successor of TCP-Vegas that has been developed to + * deal with the issues that occur in modern networks. + * Like TCP-Vegas, TCP-NV supports true congestion avoidance, + * the ability to detect congestion before packet losses occur. + * When congestion (queue buildup) starts to occur, TCP-NV + * predicts what the cwnd size should be for the current + * throughput and it reduces the cwnd proportionally to + * the difference between the current cwnd and the predicted cwnd. + * + * NV is only recommeneded for traffic within a data center, and when + * all the flows are NV (at least those within the data center). This + * is due to the inherent unfairness between flows using losses to + * detect congestion (congestion control) and those that use queue + * buildup to detect congestion (congestion avoidance). + * + * Note: High NIC coalescence values may lower the performance of NV + * due to the increased noise in RTT values. In particular, we have + * seen issues with rx-frames values greater than 8. + * + * TODO: + * 1) Add mechanism to deal with reverse congestion. + */ + +#include +#include +#include +#include +#include + +/* TCP NV parameters + * + * nv_pad Max number of queued packets allowed in network + * nv_pad_buffer Do not grow cwnd if this closed to nv_pad + * nv_reset_period How often (in) seconds)to reset min_rtt + * nv_min_cwnd Don't decrease cwnd below this if there are no losses + * nv_cong_dec_mult Decrease cwnd by X% (30%) of congestion when detected + * nv_ssthresh_factor On congestion set ssthresh to this * / 8 + * nv_rtt_factor RTT averaging factor + * nv_loss_dec_factor Decrease cwnd by this (50%) when losses occur + * nv_dec_eval_min_calls Wait this many RTT measurements before dec cwnd + * nv_inc_eval_min_calls Wait this many RTT measurements before inc cwnd + * nv_ssthresh_eval_min_calls Wait this many RTT measurements before stopping + * slow-start due to congestion + * nv_stop_rtt_cnt Only grow cwnd for this many RTTs after non-congestion + * nv_rtt_min_cnt Wait these many RTTs before making congesion decision + * nv_cwnd_growth_rate_neg + * nv_cwnd_growth_rate_pos + * How quickly to double growth rate (not rate) of cwnd when not + * congested. One value (nv_cwnd_growth_rate_neg) for when + * rate < 1 pkt/RTT (after losses). The other (nv_cwnd_growth_rate_pos) + * otherwise. + */ + +static int nv_pad __read_mostly = 10; +static int nv_pad_buffer __read_mostly = 2; +static int nv_reset_period __read_mostly = 5; /* in seconds */ +static int nv_min_cwnd __read_mostly = 2; +static int nv_cong_dec_mult __read_mostly = 30 * 128 / 100; /* = 30% */ +static int nv_ssthresh_factor __read_mostly = 8; /* = 1 */ +static int nv_rtt_factor __read_mostly = 128; /* = 1/2*old + 1/2*new */ +static int nv_loss_dec_factor __read_mostly = 512; /* => 50% */ +static int nv_cwnd_growth_rate_neg __read_mostly = 8; +static int nv_cwnd_growth_rate_pos __read_mostly; /* 0 => fixed like Reno */ +static int nv_dec_eval_min_calls __read_mostly = 60; +static int nv_inc_eval_min_calls __read_mostly = 20; +static int nv_ssthresh_eval_min_calls __read_mostly = 30; +static int nv_stop_rtt_cnt __read_mostly = 10; +static int nv_rtt_min_cnt __read_mostly = 2; + +module_param(nv_pad, int, 0644); +MODULE_PARM_DESC(nv_pad, "max queued packets allowed in network"); +module_param(nv_reset_period, int, 0644); +MODULE_PARM_DESC(nv_reset_period, "nv_min_rtt reset period (secs)"); +module_param(nv_min_cwnd, int, 0644); +MODULE_PARM_DESC(nv_min_cwnd, "NV will not decrease cwnd below this value" + " without losses"); + +/* TCP NV Parameters */ +struct tcpnv { + unsigned long nv_min_rtt_reset_jiffies; /* when to switch to + * nv_min_rtt_new */ + s8 cwnd_growth_factor; /* Current cwnd growth factor, + * < 0 => less than 1 packet/RTT */ + u8 available8; + u16 available16; + u32 loss_cwnd; /* cwnd at last loss */ + u8 nv_allow_cwnd_growth:1, /* whether cwnd can grow */ + nv_reset:1, /* whether to reset values */ + nv_catchup:1; /* whether we are growing because + * of temporary cwnd decrease */ + u8 nv_eval_call_cnt; /* call count since last eval */ + u8 nv_min_cwnd; /* nv won't make a ca decision if cwnd is + * smaller than this. It may grow to handle + * TSO, LRO and interrupt coalescence because + * with these a small cwnd cannot saturate + * the link. Note that this is different from + * the file local nv_min_cwnd */ + u8 nv_rtt_cnt; /* RTTs without making ca decision */; + u32 nv_last_rtt; /* last rtt */ + u32 nv_min_rtt; /* active min rtt. Used to determine slope */ + u32 nv_min_rtt_new; /* min rtt for future use */ + u32 nv_rtt_max_rate; /* max rate seen during current RTT */ + u32 nv_rtt_start_seq; /* current RTT ends when packet arrives + * acking beyond nv_rtt_start_seq */ + u32 nv_last_snd_una; /* Previous value of tp->snd_una. It is + * used to determine bytes acked since last + * call to bictcp_acked */ + u32 nv_no_cong_cnt; /* Consecutive no congestion decisions */ +}; + +#define NV_INIT_RTT U32_MAX +#define NV_MIN_CWND 4 +#define NV_MIN_CWND_GROW 2 +#define NV_TSO_CWND_BOUND 80 + +static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + ca->nv_reset = 0; + ca->loss_cwnd = 0; + ca->nv_no_cong_cnt = 0; + ca->nv_rtt_cnt = 0; + ca->nv_last_rtt = 0; + ca->nv_rtt_max_rate = 0; + ca->nv_rtt_start_seq = tp->snd_una; + ca->nv_eval_call_cnt = 0; + ca->nv_last_snd_una = tp->snd_una; +} + +static void tcpnv_init(struct sock *sk) +{ + struct tcpnv *ca = inet_csk_ca(sk); + + tcpnv_reset(ca, sk); + + ca->nv_allow_cwnd_growth = 1; + ca->nv_min_rtt_reset_jiffies = jiffies + 2 * HZ; + ca->nv_min_rtt = NV_INIT_RTT; + ca->nv_min_rtt_new = NV_INIT_RTT; + ca->nv_min_cwnd = NV_MIN_CWND; + ca->nv_catchup = 0; + ca->cwnd_growth_factor = 0; +} + +static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcpnv *ca = inet_csk_ca(sk); + u32 cnt; + + if (!tcp_is_cwnd_limited(sk)) + return; + + /* Only grow cwnd if NV has not detected congestion */ + if (!ca->nv_allow_cwnd_growth) + return; + + if (tcp_in_slow_start(tp)) { + acked = tcp_slow_start(tp, acked); + if (!acked) + return; + } + + if (ca->cwnd_growth_factor < 0) { + cnt = tp->snd_cwnd << -ca->cwnd_growth_factor; + tcp_cong_avoid_ai(tp, cnt, acked); + } else { + cnt = max(4U, tp->snd_cwnd >> ca->cwnd_growth_factor); + tcp_cong_avoid_ai(tp, cnt, acked); + } +} + +static u32 tcpnv_recalc_ssthresh(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct tcpnv *ca = inet_csk_ca(sk); + + ca->loss_cwnd = tp->snd_cwnd; + return max((tp->snd_cwnd * nv_loss_dec_factor) >> 10, 2U); +} + +static u32 tcpnv_undo_cwnd(struct sock *sk) +{ + struct tcpnv *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); +} + +static void tcpnv_state(struct sock *sk, u8 new_state) +{ + struct tcpnv *ca = inet_csk_ca(sk); + + if (new_state == TCP_CA_Open && ca->nv_reset) { + tcpnv_reset(ca, sk); + } else if (new_state == TCP_CA_Loss || new_state == TCP_CA_CWR || + new_state == TCP_CA_Recovery) { + ca->nv_reset = 1; + ca->nv_allow_cwnd_growth = 0; + if (new_state == TCP_CA_Loss) { + /* Reset cwnd growth factor to Reno value */ + if (ca->cwnd_growth_factor > 0) + ca->cwnd_growth_factor = 0; + /* Decrease growth rate if allowed */ + if (nv_cwnd_growth_rate_neg > 0 && + ca->cwnd_growth_factor > -8) + ca->cwnd_growth_factor--; + } + } +} + +/* Do congestion avoidance calculations for TCP-NV + */ +static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct tcpnv *ca = inet_csk_ca(sk); + unsigned long now = jiffies; + s64 rate64 = 0; + u32 rate, max_win, cwnd_by_slope; + u32 avg_rtt; + u32 bytes_acked = 0; + + /* Some calls are for duplicates without timetamps */ + if (sample->rtt_us < 0) + return; + + /* If not in TCP_CA_Open or TCP_CA_Disorder states, skip. */ + if (icsk->icsk_ca_state != TCP_CA_Open && + icsk->icsk_ca_state != TCP_CA_Disorder) + return; + + /* Stop cwnd growth if we were in catch up mode */ + if (ca->nv_catchup && tp->snd_cwnd >= nv_min_cwnd) { + ca->nv_catchup = 0; + ca->nv_allow_cwnd_growth = 0; + } + + bytes_acked = tp->snd_una - ca->nv_last_snd_una; + ca->nv_last_snd_una = tp->snd_una; + + if (sample->in_flight == 0) + return; + + /* Calculate moving average of RTT */ + if (nv_rtt_factor > 0) { + if (ca->nv_last_rtt > 0) { + avg_rtt = (((u64)sample->rtt_us) * nv_rtt_factor + + ((u64)ca->nv_last_rtt) + * (256 - nv_rtt_factor)) >> 8; + } else { + avg_rtt = sample->rtt_us; + ca->nv_min_rtt = avg_rtt << 1; + } + ca->nv_last_rtt = avg_rtt; + } else { + avg_rtt = sample->rtt_us; + } + + /* rate in 100's bits per second */ + rate64 = ((u64)sample->in_flight) * 8000000; + rate = (u32)div64_u64(rate64, (u64)(avg_rtt * 100)); + + /* Remember the maximum rate seen during this RTT + * Note: It may be more than one RTT. This function should be + * called at least nv_dec_eval_min_calls times. + */ + if (ca->nv_rtt_max_rate < rate) + ca->nv_rtt_max_rate = rate; + + /* We have valid information, increment counter */ + if (ca->nv_eval_call_cnt < 255) + ca->nv_eval_call_cnt++; + + /* update min rtt if necessary */ + if (avg_rtt < ca->nv_min_rtt) + ca->nv_min_rtt = avg_rtt; + + /* update future min_rtt if necessary */ + if (avg_rtt < ca->nv_min_rtt_new) + ca->nv_min_rtt_new = avg_rtt; + + /* nv_min_rtt is updated with the minimum (possibley averaged) rtt + * seen in the last sysctl_tcp_nv_reset_period seconds (i.e. a + * warm reset). This new nv_min_rtt will be continued to be updated + * and be used for another sysctl_tcp_nv_reset_period seconds, + * when it will be updated again. + * In practice we introduce some randomness, so the actual period used + * is chosen randomly from the range: + * [sysctl_tcp_nv_reset_period*3/4, sysctl_tcp_nv_reset_period*5/4) + */ + if (time_after_eq(now, ca->nv_min_rtt_reset_jiffies)) { + unsigned char rand; + + ca->nv_min_rtt = ca->nv_min_rtt_new; + ca->nv_min_rtt_new = NV_INIT_RTT; + get_random_bytes(&rand, 1); + ca->nv_min_rtt_reset_jiffies = + now + ((nv_reset_period * (384 + rand) * HZ) >> 9); + /* Every so often we decrease ca->nv_min_cwnd in case previous + * value is no longer accurate. + */ + ca->nv_min_cwnd = max(ca->nv_min_cwnd / 2, NV_MIN_CWND); + } + + /* Once per RTT check if we need to do congestion avoidance */ + if (before(ca->nv_rtt_start_seq, tp->snd_una)) { + ca->nv_rtt_start_seq = tp->snd_nxt; + if (ca->nv_rtt_cnt < 0xff) + /* Increase counter for RTTs without CA decision */ + ca->nv_rtt_cnt++; + + /* If this function is only called once within an RTT + * the cwnd is probably too small (in some cases due to + * tso, lro or interrupt coalescence), so we increase + * ca->nv_min_cwnd. + */ + if (ca->nv_eval_call_cnt == 1 && + bytes_acked >= (ca->nv_min_cwnd - 1) * tp->mss_cache && + ca->nv_min_cwnd < (NV_TSO_CWND_BOUND + 1)) { + ca->nv_min_cwnd = min(ca->nv_min_cwnd + + NV_MIN_CWND_GROW, + NV_TSO_CWND_BOUND + 1); + ca->nv_rtt_start_seq = tp->snd_nxt + + ca->nv_min_cwnd * tp->mss_cache; + ca->nv_eval_call_cnt = 0; + ca->nv_allow_cwnd_growth = 1; + return; + } + + /* Find the ideal cwnd for current rate from slope + * slope = 80000.0 * mss / nv_min_rtt + * cwnd_by_slope = nv_rtt_max_rate / slope + */ + cwnd_by_slope = (u32) + div64_u64(((u64)ca->nv_rtt_max_rate) * ca->nv_min_rtt, + (u64)(80000 * tp->mss_cache)); + max_win = cwnd_by_slope + nv_pad; + + /* If cwnd > max_win, decrease cwnd + * if cwnd < max_win, grow cwnd + * else leave the same + */ + if (tp->snd_cwnd > max_win) { + /* there is congestion, check that it is ok + * to make a CA decision + * 1. We should have at least nv_dec_eval_min_calls + * data points before making a CA decision + * 2. We only make a congesion decision after + * nv_rtt_min_cnt RTTs + */ + if (ca->nv_rtt_cnt < nv_rtt_min_cnt) { + return; + } else if (tp->snd_ssthresh == TCP_INFINITE_SSTHRESH) { + if (ca->nv_eval_call_cnt < + nv_ssthresh_eval_min_calls) + return; + /* otherwise we will decrease cwnd */ + } else if (ca->nv_eval_call_cnt < + nv_dec_eval_min_calls) { + if (ca->nv_allow_cwnd_growth && + ca->nv_rtt_cnt > nv_stop_rtt_cnt) + ca->nv_allow_cwnd_growth = 0; + return; + } + + /* We have enough data to determine we are congested */ + ca->nv_allow_cwnd_growth = 0; + tp->snd_ssthresh = + (nv_ssthresh_factor * max_win) >> 3; + if (tp->snd_cwnd - max_win > 2) { + /* gap > 2, we do exponential cwnd decrease */ + int dec; + + dec = max(2U, ((tp->snd_cwnd - max_win) * + nv_cong_dec_mult) >> 7); + tp->snd_cwnd -= dec; + } else if (nv_cong_dec_mult > 0) { + tp->snd_cwnd = max_win; + } + if (ca->cwnd_growth_factor > 0) + ca->cwnd_growth_factor = 0; + ca->nv_no_cong_cnt = 0; + } else if (tp->snd_cwnd <= max_win - nv_pad_buffer) { + /* There is no congestion, grow cwnd if allowed*/ + if (ca->nv_eval_call_cnt < nv_inc_eval_min_calls) + return; + + ca->nv_allow_cwnd_growth = 1; + ca->nv_no_cong_cnt++; + if (ca->cwnd_growth_factor < 0 && + nv_cwnd_growth_rate_neg > 0 && + ca->nv_no_cong_cnt > nv_cwnd_growth_rate_neg) { + ca->cwnd_growth_factor++; + ca->nv_no_cong_cnt = 0; + } else if (ca->cwnd_growth_factor >= 0 && + nv_cwnd_growth_rate_pos > 0 && + ca->nv_no_cong_cnt > + nv_cwnd_growth_rate_pos) { + ca->cwnd_growth_factor++; + ca->nv_no_cong_cnt = 0; + } + } else { + /* cwnd is in-between, so do nothing */ + return; + } + + /* update state */ + ca->nv_eval_call_cnt = 0; + ca->nv_rtt_cnt = 0; + ca->nv_rtt_max_rate = 0; + + /* Don't want to make cwnd < nv_min_cwnd + * (it wasn't before, if it is now is because nv + * decreased it). + */ + if (tp->snd_cwnd < nv_min_cwnd) + tp->snd_cwnd = nv_min_cwnd; + } +} + +/* Extract info for Tcp socket info provided via netlink */ +size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) +{ + const struct tcpnv *ca = inet_csk_ca(sk); + + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + info->vegas.tcpv_enabled = 1; + info->vegas.tcpv_rttcnt = ca->nv_rtt_cnt; + info->vegas.tcpv_rtt = ca->nv_last_rtt; + info->vegas.tcpv_minrtt = ca->nv_min_rtt; + + *attr = INET_DIAG_VEGASINFO; + return sizeof(struct tcpvegas_info); + } + return 0; +} +EXPORT_SYMBOL_GPL(tcpnv_get_info); + +static struct tcp_congestion_ops tcpnv __read_mostly = { + .init = tcpnv_init, + .ssthresh = tcpnv_recalc_ssthresh, + .cong_avoid = tcpnv_cong_avoid, + .set_state = tcpnv_state, + .undo_cwnd = tcpnv_undo_cwnd, + .pkts_acked = tcpnv_acked, + .get_info = tcpnv_get_info, + + .owner = THIS_MODULE, + .name = "nv", +}; + +static int __init tcpnv_register(void) +{ + BUILD_BUG_ON(sizeof(struct tcpnv) > ICSK_CA_PRIV_SIZE); + + return tcp_register_congestion_control(&tcpnv); +} + +static void __exit tcpnv_unregister(void) +{ + tcp_unregister_congestion_control(&tcpnv); +} + +module_init(tcpnv_register); +module_exit(tcpnv_unregister); + +MODULE_AUTHOR("Lawrence Brakmo"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP NV"); +MODULE_VERSION("1.0"); -- cgit v1.2.3 From d46e416c11c88ef1deb5c7f19271806a5be597fe Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 9 Jun 2016 22:48:18 +0800 Subject: sctp: sctp should change socket state when shutdown is received Now sctp doesn't change socket state upon shutdown reception. It changes just the assoc state, even though it's a TCP-style socket. For some cases, if we really need to check sk->sk_state, it's necessary to fix this issue, at least when we use ss or netstat to dump, we can get a more exact information. As an improvement, we will change sk->sk_state when we change asoc->state to SHUTDOWN_RECEIVED, and also do it in sctp_shutdown to keep consistent with sctp_close. Signed-off-by: Xin Long Acked-by: Marcelo R. Leitner Signed-off-by: David S. Miller --- net/sctp/sm_sideeffect.c | 4 +++- net/sctp/socket.c | 8 ++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index aa3712259368..12d45193357c 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -806,8 +806,10 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds, /* Set the RCV_SHUTDOWN flag when a SHUTDOWN is received. */ if (sctp_state(asoc, SHUTDOWN_RECEIVED) && - sctp_sstate(sk, ESTABLISHED)) + sctp_sstate(sk, ESTABLISHED)) { + sk->sk_state = SCTP_SS_CLOSING; sk->sk_shutdown |= RCV_SHUTDOWN; + } } if (sctp_state(asoc, COOKIE_WAIT)) { diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 712fb2339baa..6cae4c61ae26 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4195,6 +4195,7 @@ static void sctp_shutdown(struct sock *sk, int how) return; if (how & SEND_SHUTDOWN) { + sk->sk_state = SCTP_SS_CLOSING; ep = sctp_sk(sk)->ep; if (!list_empty(&ep->asocs)) { asoc = list_entry(ep->asocs.next, @@ -7566,10 +7567,13 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, /* If the association on the newsk is already closed before accept() * is called, set RCV_SHUTDOWN flag. */ - if (sctp_state(assoc, CLOSED) && sctp_style(newsk, TCP)) + if (sctp_state(assoc, CLOSED) && sctp_style(newsk, TCP)) { + newsk->sk_state = SCTP_SS_CLOSING; newsk->sk_shutdown |= RCV_SHUTDOWN; + } else { + newsk->sk_state = SCTP_SS_ESTABLISHED; + } - newsk->sk_state = SCTP_SS_ESTABLISHED; release_sock(newsk); } -- cgit v1.2.3 From 8fe6a79fb8088a759b3dc57eb641fc3183ad72b8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 10 Jun 2016 16:41:36 -0700 Subject: net_sched: sch_plug: use a private throttled status We want to get rid of generic qdisc throttled management, so this qdisc has to use a private flag. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_plug.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c index ff0d968750df..a12cd37680f8 100644 --- a/net/sched/sch_plug.c +++ b/net/sched/sch_plug.c @@ -64,6 +64,8 @@ struct plug_sched_data { */ bool unplug_indefinite; + bool throttled; + /* Queue Limit in bytes */ u32 limit; @@ -103,7 +105,7 @@ static struct sk_buff *plug_dequeue(struct Qdisc *sch) { struct plug_sched_data *q = qdisc_priv(sch); - if (qdisc_is_throttled(sch)) + if (q->throttled) return NULL; if (!q->unplug_indefinite) { @@ -111,7 +113,7 @@ static struct sk_buff *plug_dequeue(struct Qdisc *sch) /* No more packets to dequeue. Block the queue * and wait for the next release command. */ - qdisc_throttled(sch); + q->throttled = true; return NULL; } q->pkts_to_release--; @@ -141,7 +143,7 @@ static int plug_init(struct Qdisc *sch, struct nlattr *opt) q->limit = ctl->limit; } - qdisc_throttled(sch); + q->throttled = true; return 0; } @@ -173,7 +175,7 @@ static int plug_change(struct Qdisc *sch, struct nlattr *opt) q->pkts_last_epoch = q->pkts_current_epoch; q->pkts_current_epoch = 0; if (q->unplug_indefinite) - qdisc_throttled(sch); + q->throttled = true; q->unplug_indefinite = false; break; case TCQ_PLUG_RELEASE_ONE: @@ -182,7 +184,7 @@ static int plug_change(struct Qdisc *sch, struct nlattr *opt) */ q->pkts_to_release += q->pkts_last_epoch; q->pkts_last_epoch = 0; - qdisc_unthrottled(sch); + q->throttled = false; netif_schedule_queue(sch->dev_queue); break; case TCQ_PLUG_RELEASE_INDEFINITE: @@ -190,7 +192,7 @@ static int plug_change(struct Qdisc *sch, struct nlattr *opt) q->pkts_to_release = 0; q->pkts_last_epoch = 0; q->pkts_current_epoch = 0; - qdisc_unthrottled(sch); + q->throttled = false; netif_schedule_queue(sch->dev_queue); break; case TCQ_PLUG_LIMIT: -- cgit v1.2.3 From cca605dd4b3b2bfa381250b7dbbe16b124916f24 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 10 Jun 2016 16:41:37 -0700 Subject: net_sched: cbq: remove a flaky use of qdisc_is_throttled() So far no qdisc ever unset the throttled bit at enqueue() time, so CBQ usage of qdisc_is_throttled() was flaky. Since __QDISC_STATE_THROTTLED set/unset is way too expensive considering that only CBQ was eventually caring for this status, it would make sense to implement a Qdisc ops ->is_throttled() if we find that this is needed. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_cbq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index f2af31be6370..6e61f9aa8783 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -345,7 +345,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) { int toplevel = q->toplevel; - if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) { + if (toplevel > cl->level) { psched_time_t now = psched_get_time(); do { -- cgit v1.2.3 From 42117927cab5a13192ecc227bea19da5059ffc6c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 10 Jun 2016 16:41:38 -0700 Subject: net_sched: netem: remove qdisc_is_throttled() use Looks like it is only there as some optimization attempt. Since __QDISC_STATE_THROTTLED set/unset is way too expensive, and netem is the last user, just remove this check. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 9ca7947ab643..2dbe732ca135 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -582,9 +582,6 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) struct sk_buff *skb; struct rb_node *p; - if (qdisc_is_throttled(sch)) - return NULL; - tfifo_dequeue: skb = __skb_dequeue(&sch->q); if (skb) { -- cgit v1.2.3 From 45f50bed1d808794e514e9eed0e579a8756ce2ba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 10 Jun 2016 16:41:39 -0700 Subject: net_sched: remove generic throttled management __QDISC_STATE_THROTTLED bit manipulation is rather expensive for HTB and few others. I already removed it for sch_fq in commit f2600cf02b5b ("net: sched: avoid costly atomic operation in fq_dequeue()") and so far nobody complained. When one ore more packets are stuck in one or more throttled HTB class, a htb dequeue() performs two atomic operations to clear/set __QDISC_STATE_THROTTLED bit, while root qdisc lock is held. Removing this pair of atomic operations bring me a 8 % performance increase on 200 TCP_RR tests, in presence of throttled classes. This patch has no side effect, since nothing actually uses disc_is_throttled() anymore. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_api.c | 7 +------ net/sched/sch_cbq.c | 2 -- net/sched/sch_fq.c | 3 +-- net/sched/sch_hfsc.c | 1 - net/sched/sch_htb.c | 3 +-- net/sched/sch_netem.c | 1 - net/sched/sch_tbf.c | 4 +--- 7 files changed, 4 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index d4a8bbfcc953..401eda6de682 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -583,7 +583,6 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) timer); rcu_read_lock(); - qdisc_unthrottled(wd->qdisc); __netif_schedule(qdisc_root(wd->qdisc)); rcu_read_unlock(); @@ -598,15 +597,12 @@ void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) } EXPORT_SYMBOL(qdisc_watchdog_init); -void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires, bool throttle) +void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) { if (test_bit(__QDISC_STATE_DEACTIVATED, &qdisc_root_sleeping(wd->qdisc)->state)) return; - if (throttle) - qdisc_throttled(wd->qdisc); - if (wd->last_expires == expires) return; @@ -620,7 +616,6 @@ EXPORT_SYMBOL(qdisc_watchdog_schedule_ns); void qdisc_watchdog_cancel(struct qdisc_watchdog *wd) { hrtimer_cancel(&wd->timer); - qdisc_unthrottled(wd->qdisc); } EXPORT_SYMBOL(qdisc_watchdog_cancel); diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 6e61f9aa8783..a29fd811d7b9 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -513,7 +513,6 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer) hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED); } - qdisc_unthrottled(sch); __netif_schedule(qdisc_root(sch)); return HRTIMER_NORESTART; } @@ -819,7 +818,6 @@ cbq_dequeue(struct Qdisc *sch) if (skb) { qdisc_bstats_update(sch, skb); sch->q.qlen--; - qdisc_unthrottled(sch); return skb; } diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 3c6a47d66a04..f49c81e91acd 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -445,8 +445,7 @@ begin: if (!head->first) { if (q->time_next_delayed_flow != ~0ULL) qdisc_watchdog_schedule_ns(&q->watchdog, - q->time_next_delayed_flow, - false); + q->time_next_delayed_flow); return NULL; } } diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index eb3d3f5aba80..bd08c363a26d 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1664,7 +1664,6 @@ hfsc_dequeue(struct Qdisc *sch) set_passive(cl); } - qdisc_unthrottled(sch); qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index b74d06668ab4..07dcd2933f01 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -889,7 +889,6 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch) if (skb != NULL) { ok: qdisc_bstats_update(sch, skb); - qdisc_unthrottled(sch); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; @@ -929,7 +928,7 @@ ok: } qdisc_qstats_overlimit(sch); if (likely(next_event > q->now)) - qdisc_watchdog_schedule_ns(&q->watchdog, next_event, true); + qdisc_watchdog_schedule_ns(&q->watchdog, next_event); else schedule_work(&q->work); fin: diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 2dbe732ca135..876df13c745a 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -587,7 +587,6 @@ tfifo_dequeue: if (skb) { qdisc_qstats_backlog_dec(sch, skb); deliver: - qdisc_unthrottled(sch); qdisc_bstats_update(sch, skb); return skb; } diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 7fa3d6e1291c..c12df84d1078 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -254,14 +254,12 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch) q->ptokens = ptoks; qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; - qdisc_unthrottled(sch); qdisc_bstats_update(sch, skb); return skb; } qdisc_watchdog_schedule_ns(&q->watchdog, - now + max_t(long, -toks, -ptoks), - true); + now + max_t(long, -toks, -ptoks)); /* Maybe we have a shorter packet in the queue, which can be sent now. It sounds cool, -- cgit v1.2.3 From 38b7097b55b6cf30adc5ac07cb1055683224393e Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Sat, 11 Jun 2016 20:08:19 +0200 Subject: ipv6: use TOS marks from sockets for routing decision In IPv6 the ToS values are part of the flowlabel in flowi6 and get extracted during fib rule lookup, but we forgot to correctly initialize the flowlabel before the routing lookup. Reported-by: Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/icmp.c | 4 +++- net/ipv6/ping.c | 4 +++- net/ipv6/raw.c | 8 +++++--- net/ipv6/route.c | 2 ++ net/ipv6/udp.c | 8 +++++--- net/l2tp/l2tp_ip6.c | 8 +++++--- 6 files changed, 23 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 4527285fcaa2..40454bfb534e 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -502,12 +502,14 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; + ipc6.tclass = np->tclass; + fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); + dst = icmpv6_route_lookup(net, skb, sk, &fl6); if (IS_ERR(dst)) goto out; ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - ipc6.tclass = np->tclass; ipc6.dontfrag = np->dontfrag; ipc6.opt = NULL; diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 3ee3e444a66b..fed40d1ec29b 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -116,6 +116,9 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; + ipc6.tclass = np->tclass; + fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); + dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr); if (IS_ERR(dst)) return PTR_ERR(dst); @@ -140,7 +143,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) pfh.family = AF_INET6; ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - ipc6.tclass = np->tclass; ipc6.dontfrag = np->dontfrag; ipc6.opt = NULL; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 896350df6423..590dd1f7746f 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -878,6 +878,11 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (inet->hdrincl) fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH; + if (ipc6.tclass < 0) + ipc6.tclass = np->tclass; + + fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); @@ -886,9 +891,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (ipc6.tclass < 0) - ipc6.tclass = np->tclass; - if (ipc6.dontfrag < 0) ipc6.dontfrag = np->dontfrag; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 969913da494f..c6ae6f9b5fe3 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3306,6 +3306,8 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) err = -EINVAL; memset(&fl6, 0, sizeof(fl6)); + rtm = nlmsg_data(nlh); + fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); if (tb[RTA_SRC]) { if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index f421c9f23c5b..4bb5c13777f1 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1246,6 +1246,11 @@ do_udp_sendmsg: security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + if (ipc6.tclass < 0) + ipc6.tclass = np->tclass; + + fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); + dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); @@ -1256,9 +1261,6 @@ do_udp_sendmsg: if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (ipc6.tclass < 0) - ipc6.tclass = np->tclass; - if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; back_from_confirm: diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 6c54e03fe9c1..ea2ae6664cc8 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -611,6 +611,11 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + if (ipc6.tclass < 0) + ipc6.tclass = np->tclass; + + fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); @@ -620,9 +625,6 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (ipc6.tclass < 0) - ipc6.tclass = np->tclass; - if (ipc6.dontfrag < 0) ipc6.dontfrag = np->dontfrag; -- cgit v1.2.3 From 99860208bc62d8ebd5c57495b84856506fe075bc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 11 Jun 2016 12:46:04 +0200 Subject: sched: remove NET_XMIT_POLICED sch_atm returns this when TC_ACT_SHOT classification occurs. But all other schedulers that use tc_classify (htb, hfsc, drr, fq_codel ...) return NET_XMIT_SUCCESS | __BYPASS in this case so just do that in atm. BATMAN uses it as an intermediate return value to signal forwarding vs. buffering, but it did not return POLICED to callers outside of BATMAN. Reviewed-by: Sven Eckelmann Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/batman-adv/routing.c | 2 +- net/batman-adv/send.c | 4 ++-- net/core/pktgen.c | 1 - net/sched/sch_api.c | 2 -- net/sched/sch_atm.c | 2 +- 5 files changed, 4 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index e3857ed4057f..f75091c983ee 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -653,7 +653,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, len + ETH_HLEN); ret = NET_RX_SUCCESS; - } else if (res == NET_XMIT_POLICED) { + } else if (res == -EINPROGRESS) { /* skb was buffered and consumed */ ret = NET_RX_SUCCESS; } diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index f2f125684ed9..b1a4e8a811c8 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -156,7 +156,7 @@ int batadv_send_unicast_skb(struct sk_buff *skb, * attempted. * * Return: NET_XMIT_SUCCESS on success, NET_XMIT_DROP on failure, or - * NET_XMIT_POLICED if the skb is buffered for later transmit. + * -EINPROGRESS if the skb is buffered for later transmit. */ int batadv_send_skb_to_orig(struct sk_buff *skb, struct batadv_orig_node *orig_node, @@ -188,7 +188,7 @@ int batadv_send_skb_to_orig(struct sk_buff *skb, * network coding fails, then send the packet as usual. */ if (recv_if && batadv_nc_skb_forward(skb, neigh_node)) { - ret = NET_XMIT_POLICED; + ret = -EINPROGRESS; } else { batadv_send_unicast_skb(skb, neigh_node); ret = NET_XMIT_SUCCESS; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 8b02df0d354d..f74ab9c3b38f 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3463,7 +3463,6 @@ xmit_more: break; case NET_XMIT_DROP: case NET_XMIT_CN: - case NET_XMIT_POLICED: /* skb has been consumed */ pkt_dev->errors++; break; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 401eda6de682..12ebde845523 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -95,8 +95,6 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb, Expected action: do not backoff, but wait until queue will clear. NET_XMIT_CN - probably this packet enqueued, but another one dropped. Expected action: backoff or ignore - NET_XMIT_POLICED - dropped by police. - Expected action: backoff or error to real-time apps. Auxiliary routines: diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 7e6c12dfc66a..0785b239ddf9 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -363,7 +363,7 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch) struct atm_flow_data *flow; struct tcf_result res; int result; - int ret = NET_XMIT_POLICED; + int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; pr_debug("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); result = TC_POLICE_OK; /* be nice to gcc */ -- cgit v1.2.3 From 8c3e34a4ff85142ca5dba3f18cbc2061899e2612 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 13 Jun 2016 12:16:05 +0100 Subject: rxrpc: Rename files matching ar-*.c to git rid of the "ar-" prefix Rename files matching net/rxrpc/ar-*.c to get rid of the "ar-" prefix. This will aid splitting those files by making easier to come up with new names. Note that the not all files are simply renamed from ar-X.c to X.c. The following exceptions are made: (*) ar-call.c -> call_object.c ar-ack.c -> call_event.c call_object.c is going to contain the core of the call object handling. Call event handling is all going to be in call_event.c. (*) ar-accept.c -> call_accept.c Incoming call handling is going to be here. (*) ar-connection.c -> conn_object.c ar-connevent.c -> conn_event.c The former file is going to have the basic connection object handling, but there will likely be some differentiation between client connections and service connections in additional files later. The latter file will have all the connection-level event handling. (*) ar-local.c -> local_object.c This will have the local endpoint object handling code. The local endpoint event handling code will later be split out into local_event.c. (*) ar-peer.c -> peer_object.c This will have the peer endpoint object handling code. Peer event handling code will be placed in peer_event.c (for the moment, there is none). (*) ar-error.c -> peer_event.c This will become the peer event handling code, though for the moment it's actually driven from the local endpoint's perspective. Note that I haven't renamed ar-transport.c to transport_object.c as the intention is to delete it when the rxrpc_transport struct is excised. The only file that actually has its contents changed is net/rxrpc/Makefile. net/rxrpc/ar-internal.h will need its section marker comments updating, but I'll do that in a separate patch to make it easier for git to follow the history across the rename. I may also want to rename ar-internal.h at some point - but that would mean updating all the #includes and I'd rather do that in a separate step. Signed-off-by: David Howells -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "ar-internal.h" - -/* - * generate a connection-level abort - */ -static int rxrpc_busy(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, - struct rxrpc_wire_header *whdr) -{ - struct msghdr msg; - struct kvec iov[1]; - size_t len; - int ret; - - _enter("%d,,", local->debug_id); - - whdr->type = RXRPC_PACKET_TYPE_BUSY; - whdr->serial = htonl(1); - - msg.msg_name = &srx->transport.sin; - msg.msg_namelen = sizeof(srx->transport.sin); - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; - - iov[0].iov_base = whdr; - iov[0].iov_len = sizeof(*whdr); - - len = iov[0].iov_len; - - _proto("Tx BUSY %%1"); - - ret = kernel_sendmsg(local->socket, &msg, iov, 1, len); - if (ret < 0) { - _leave(" = -EAGAIN [sendmsg failed: %d]", ret); - return -EAGAIN; - } - - _leave(" = 0"); - return 0; -} - -/* - * accept an incoming call that needs peer, transport and/or connection setting - * up - */ -static int rxrpc_accept_incoming_call(struct rxrpc_local *local, - struct rxrpc_sock *rx, - struct sk_buff *skb, - struct sockaddr_rxrpc *srx) -{ - struct rxrpc_connection *conn; - struct rxrpc_transport *trans; - struct rxrpc_skb_priv *sp, *nsp; - struct rxrpc_peer *peer; - struct rxrpc_call *call; - struct sk_buff *notification; - int ret; - - _enter(""); - - sp = rxrpc_skb(skb); - - /* get a notification message to send to the server app */ - notification = alloc_skb(0, GFP_NOFS); - if (!notification) { - _debug("no memory"); - ret = -ENOMEM; - goto error_nofree; - } - rxrpc_new_skb(notification); - notification->mark = RXRPC_SKB_MARK_NEW_CALL; - - peer = rxrpc_get_peer(srx, GFP_NOIO); - if (IS_ERR(peer)) { - _debug("no peer"); - ret = -EBUSY; - goto error; - } - - trans = rxrpc_get_transport(local, peer, GFP_NOIO); - rxrpc_put_peer(peer); - if (IS_ERR(trans)) { - _debug("no trans"); - ret = -EBUSY; - goto error; - } - - conn = rxrpc_incoming_connection(trans, &sp->hdr); - rxrpc_put_transport(trans); - if (IS_ERR(conn)) { - _debug("no conn"); - ret = PTR_ERR(conn); - goto error; - } - - call = rxrpc_incoming_call(rx, conn, &sp->hdr); - rxrpc_put_connection(conn); - if (IS_ERR(call)) { - _debug("no call"); - ret = PTR_ERR(call); - goto error; - } - - /* attach the call to the socket */ - read_lock_bh(&local->services_lock); - if (rx->sk.sk_state == RXRPC_CLOSE) - goto invalid_service; - - write_lock(&rx->call_lock); - if (!test_and_set_bit(RXRPC_CALL_INIT_ACCEPT, &call->flags)) { - rxrpc_get_call(call); - - spin_lock(&call->conn->state_lock); - if (sp->hdr.securityIndex > 0 && - call->conn->state == RXRPC_CONN_SERVER_UNSECURED) { - _debug("await conn sec"); - list_add_tail(&call->accept_link, &rx->secureq); - call->conn->state = RXRPC_CONN_SERVER_CHALLENGING; - atomic_inc(&call->conn->usage); - set_bit(RXRPC_CONN_CHALLENGE, &call->conn->events); - rxrpc_queue_conn(call->conn); - } else { - _debug("conn ready"); - call->state = RXRPC_CALL_SERVER_ACCEPTING; - list_add_tail(&call->accept_link, &rx->acceptq); - rxrpc_get_call(call); - nsp = rxrpc_skb(notification); - nsp->call = call; - - ASSERTCMP(atomic_read(&call->usage), >=, 3); - - _debug("notify"); - spin_lock(&call->lock); - ret = rxrpc_queue_rcv_skb(call, notification, true, - false); - spin_unlock(&call->lock); - notification = NULL; - BUG_ON(ret < 0); - } - spin_unlock(&call->conn->state_lock); - - _debug("queued"); - } - write_unlock(&rx->call_lock); - - _debug("process"); - rxrpc_fast_process_packet(call, skb); - - _debug("done"); - read_unlock_bh(&local->services_lock); - rxrpc_free_skb(notification); - rxrpc_put_call(call); - _leave(" = 0"); - return 0; - -invalid_service: - _debug("invalid"); - read_unlock_bh(&local->services_lock); - - read_lock_bh(&call->state_lock); - if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { - rxrpc_get_call(call); - rxrpc_queue_call(call); - } - read_unlock_bh(&call->state_lock); - rxrpc_put_call(call); - ret = -ECONNREFUSED; -error: - rxrpc_free_skb(notification); -error_nofree: - _leave(" = %d", ret); - return ret; -} - -/* - * accept incoming calls that need peer, transport and/or connection setting up - * - the packets we get are all incoming client DATA packets that have seq == 1 - */ -void rxrpc_accept_incoming_calls(struct work_struct *work) -{ - struct rxrpc_local *local = - container_of(work, struct rxrpc_local, acceptor); - struct rxrpc_skb_priv *sp; - struct sockaddr_rxrpc srx; - struct rxrpc_sock *rx; - struct rxrpc_wire_header whdr; - struct sk_buff *skb; - int ret; - - _enter("%d", local->debug_id); - - read_lock_bh(&rxrpc_local_lock); - if (atomic_read(&local->usage) > 0) - rxrpc_get_local(local); - else - local = NULL; - read_unlock_bh(&rxrpc_local_lock); - if (!local) { - _leave(" [local dead]"); - return; - } - -process_next_packet: - skb = skb_dequeue(&local->accept_queue); - if (!skb) { - rxrpc_put_local(local); - _leave("\n"); - return; - } - - _net("incoming call skb %p", skb); - - sp = rxrpc_skb(skb); - - /* Set up a response packet header in case we need it */ - whdr.epoch = htonl(sp->hdr.epoch); - whdr.cid = htonl(sp->hdr.cid); - whdr.callNumber = htonl(sp->hdr.callNumber); - whdr.seq = htonl(sp->hdr.seq); - whdr.serial = 0; - whdr.flags = 0; - whdr.type = 0; - whdr.userStatus = 0; - whdr.securityIndex = sp->hdr.securityIndex; - whdr._rsvd = 0; - whdr.serviceId = htons(sp->hdr.serviceId); - - /* determine the remote address */ - memset(&srx, 0, sizeof(srx)); - srx.srx_family = AF_RXRPC; - srx.transport.family = local->srx.transport.family; - srx.transport_type = local->srx.transport_type; - switch (srx.transport.family) { - case AF_INET: - srx.transport_len = sizeof(struct sockaddr_in); - srx.transport.sin.sin_port = udp_hdr(skb)->source; - srx.transport.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; - break; - default: - goto busy; - } - - /* get the socket providing the service */ - read_lock_bh(&local->services_lock); - list_for_each_entry(rx, &local->services, listen_link) { - if (rx->srx.srx_service == sp->hdr.serviceId && - rx->sk.sk_state != RXRPC_CLOSE) - goto found_service; - } - read_unlock_bh(&local->services_lock); - goto invalid_service; - -found_service: - _debug("found service %hd", rx->srx.srx_service); - if (sk_acceptq_is_full(&rx->sk)) - goto backlog_full; - sk_acceptq_added(&rx->sk); - sock_hold(&rx->sk); - read_unlock_bh(&local->services_lock); - - ret = rxrpc_accept_incoming_call(local, rx, skb, &srx); - if (ret < 0) - sk_acceptq_removed(&rx->sk); - sock_put(&rx->sk); - switch (ret) { - case -ECONNRESET: /* old calls are ignored */ - case -ECONNABORTED: /* aborted calls are reaborted or ignored */ - case 0: - goto process_next_packet; - case -ECONNREFUSED: - goto invalid_service; - case -EBUSY: - goto busy; - case -EKEYREJECTED: - goto security_mismatch; - default: - BUG(); - } - -backlog_full: - read_unlock_bh(&local->services_lock); -busy: - rxrpc_busy(local, &srx, &whdr); - rxrpc_free_skb(skb); - goto process_next_packet; - -invalid_service: - skb->priority = RX_INVALID_OPERATION; - rxrpc_reject_packet(local, skb); - goto process_next_packet; - - /* can't change connection security type mid-flow */ -security_mismatch: - skb->priority = RX_PROTOCOL_ERROR; - rxrpc_reject_packet(local, skb); - goto process_next_packet; -} - -/* - * handle acceptance of a call by userspace - * - assign the user call ID to the call at the front of the queue - */ -struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, - unsigned long user_call_ID) -{ - struct rxrpc_call *call; - struct rb_node *parent, **pp; - int ret; - - _enter(",%lx", user_call_ID); - - ASSERT(!irqs_disabled()); - - write_lock(&rx->call_lock); - - ret = -ENODATA; - if (list_empty(&rx->acceptq)) - goto out; - - /* check the user ID isn't already in use */ - ret = -EBADSLT; - pp = &rx->calls.rb_node; - parent = NULL; - while (*pp) { - parent = *pp; - call = rb_entry(parent, struct rxrpc_call, sock_node); - - if (user_call_ID < call->user_call_ID) - pp = &(*pp)->rb_left; - else if (user_call_ID > call->user_call_ID) - pp = &(*pp)->rb_right; - else - goto out; - } - - /* dequeue the first call and check it's still valid */ - call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link); - list_del_init(&call->accept_link); - sk_acceptq_removed(&rx->sk); - - write_lock_bh(&call->state_lock); - switch (call->state) { - case RXRPC_CALL_SERVER_ACCEPTING: - call->state = RXRPC_CALL_SERVER_RECV_REQUEST; - break; - case RXRPC_CALL_REMOTELY_ABORTED: - case RXRPC_CALL_LOCALLY_ABORTED: - ret = -ECONNABORTED; - goto out_release; - case RXRPC_CALL_NETWORK_ERROR: - ret = call->conn->error; - goto out_release; - case RXRPC_CALL_DEAD: - ret = -ETIME; - goto out_discard; - default: - BUG(); - } - - /* formalise the acceptance */ - call->user_call_ID = user_call_ID; - rb_link_node(&call->sock_node, parent, pp); - rb_insert_color(&call->sock_node, &rx->calls); - if (test_and_set_bit(RXRPC_CALL_HAS_USERID, &call->flags)) - BUG(); - if (test_and_set_bit(RXRPC_CALL_EV_ACCEPTED, &call->events)) - BUG(); - rxrpc_queue_call(call); - - rxrpc_get_call(call); - write_unlock_bh(&call->state_lock); - write_unlock(&rx->call_lock); - _leave(" = %p{%d}", call, call->debug_id); - return call; - - /* if the call is already dying or dead, then we leave the socket's ref - * on it to be released by rxrpc_dead_call_expired() as induced by - * rxrpc_release_call() */ -out_release: - _debug("release %p", call); - if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) - rxrpc_queue_call(call); -out_discard: - write_unlock_bh(&call->state_lock); - _debug("discard %p", call); -out: - write_unlock(&rx->call_lock); - _leave(" = %d", ret); - return ERR_PTR(ret); -} - -/* - * Handle rejection of a call by userspace - * - reject the call at the front of the queue - */ -int rxrpc_reject_call(struct rxrpc_sock *rx) -{ - struct rxrpc_call *call; - int ret; - - _enter(""); - - ASSERT(!irqs_disabled()); - - write_lock(&rx->call_lock); - - ret = -ENODATA; - if (list_empty(&rx->acceptq)) - goto out; - - /* dequeue the first call and check it's still valid */ - call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link); - list_del_init(&call->accept_link); - sk_acceptq_removed(&rx->sk); - - write_lock_bh(&call->state_lock); - switch (call->state) { - case RXRPC_CALL_SERVER_ACCEPTING: - call->state = RXRPC_CALL_SERVER_BUSY; - if (test_and_set_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) - rxrpc_queue_call(call); - ret = 0; - goto out_release; - case RXRPC_CALL_REMOTELY_ABORTED: - case RXRPC_CALL_LOCALLY_ABORTED: - ret = -ECONNABORTED; - goto out_release; - case RXRPC_CALL_NETWORK_ERROR: - ret = call->conn->error; - goto out_release; - case RXRPC_CALL_DEAD: - ret = -ETIME; - goto out_discard; - default: - BUG(); - } - - /* if the call is already dying or dead, then we leave the socket's ref - * on it to be released by rxrpc_dead_call_expired() as induced by - * rxrpc_release_call() */ -out_release: - _debug("release %p", call); - if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) - rxrpc_queue_call(call); -out_discard: - write_unlock_bh(&call->state_lock); - _debug("discard %p", call); -out: - write_unlock(&rx->call_lock); - _leave(" = %d", ret); - return ret; -} - -/** - * rxrpc_kernel_accept_call - Allow a kernel service to accept an incoming call - * @sock: The socket on which the impending call is waiting - * @user_call_ID: The tag to attach to the call - * - * Allow a kernel service to accept an incoming call, assuming the incoming - * call is still valid. - */ -struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *sock, - unsigned long user_call_ID) -{ - struct rxrpc_call *call; - - _enter(",%lx", user_call_ID); - call = rxrpc_accept_call(rxrpc_sk(sock->sk), user_call_ID); - _leave(" = %p", call); - return call; -} -EXPORT_SYMBOL(rxrpc_kernel_accept_call); - -/** - * rxrpc_kernel_reject_call - Allow a kernel service to reject an incoming call - * @sock: The socket on which the impending call is waiting - * - * Allow a kernel service to reject an incoming call with a BUSY message, - * assuming the incoming call is still valid. - */ -int rxrpc_kernel_reject_call(struct socket *sock) -{ - int ret; - - _enter(""); - ret = rxrpc_reject_call(rxrpc_sk(sock->sk)); - _leave(" = %d", ret); - return ret; -} -EXPORT_SYMBOL(rxrpc_kernel_reject_call); diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/ar-ack.c deleted file mode 100644 index 18381783c2b1..000000000000 --- a/net/rxrpc/ar-ack.c +++ /dev/null @@ -1,1288 +0,0 @@ -/* Management of Tx window, Tx resend, ACKs and out-of-sequence reception - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include "ar-internal.h" - -/* - * propose an ACK be sent - */ -void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, - u32 serial, bool immediate) -{ - unsigned long expiry; - s8 prior = rxrpc_ack_priority[ack_reason]; - - ASSERTCMP(prior, >, 0); - - _enter("{%d},%s,%%%x,%u", - call->debug_id, rxrpc_acks(ack_reason), serial, immediate); - - if (prior < rxrpc_ack_priority[call->ackr_reason]) { - if (immediate) - goto cancel_timer; - return; - } - - /* update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial - * numbers */ - if (prior == rxrpc_ack_priority[call->ackr_reason]) { - if (prior <= 4) - call->ackr_serial = serial; - if (immediate) - goto cancel_timer; - return; - } - - call->ackr_reason = ack_reason; - call->ackr_serial = serial; - - switch (ack_reason) { - case RXRPC_ACK_DELAY: - _debug("run delay timer"); - expiry = rxrpc_soft_ack_delay; - goto run_timer; - - case RXRPC_ACK_IDLE: - if (!immediate) { - _debug("run defer timer"); - expiry = rxrpc_idle_ack_delay; - goto run_timer; - } - goto cancel_timer; - - case RXRPC_ACK_REQUESTED: - expiry = rxrpc_requested_ack_delay; - if (!expiry) - goto cancel_timer; - if (!immediate || serial == 1) { - _debug("run defer timer"); - goto run_timer; - } - - default: - _debug("immediate ACK"); - goto cancel_timer; - } - -run_timer: - expiry += jiffies; - if (!timer_pending(&call->ack_timer) || - time_after(call->ack_timer.expires, expiry)) - mod_timer(&call->ack_timer, expiry); - return; - -cancel_timer: - _debug("cancel timer %%%u", serial); - try_to_del_timer_sync(&call->ack_timer); - read_lock_bh(&call->state_lock); - if (call->state <= RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events)) - rxrpc_queue_call(call); - read_unlock_bh(&call->state_lock); -} - -/* - * propose an ACK be sent, locking the call structure - */ -void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, - u32 serial, bool immediate) -{ - s8 prior = rxrpc_ack_priority[ack_reason]; - - if (prior > rxrpc_ack_priority[call->ackr_reason]) { - spin_lock_bh(&call->lock); - __rxrpc_propose_ACK(call, ack_reason, serial, immediate); - spin_unlock_bh(&call->lock); - } -} - -/* - * set the resend timer - */ -static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend, - unsigned long resend_at) -{ - read_lock_bh(&call->state_lock); - if (call->state >= RXRPC_CALL_COMPLETE) - resend = 0; - - if (resend & 1) { - _debug("SET RESEND"); - set_bit(RXRPC_CALL_EV_RESEND, &call->events); - } - - if (resend & 2) { - _debug("MODIFY RESEND TIMER"); - set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - mod_timer(&call->resend_timer, resend_at); - } else { - _debug("KILL RESEND TIMER"); - del_timer_sync(&call->resend_timer); - clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - } - read_unlock_bh(&call->state_lock); -} - -/* - * resend packets - */ -static void rxrpc_resend(struct rxrpc_call *call) -{ - struct rxrpc_wire_header *whdr; - struct rxrpc_skb_priv *sp; - struct sk_buff *txb; - unsigned long *p_txb, resend_at; - bool stop; - int loop; - u8 resend; - - _enter("{%d,%d,%d,%d},", - call->acks_hard, call->acks_unacked, - atomic_read(&call->sequence), - CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz)); - - stop = false; - resend = 0; - resend_at = 0; - - for (loop = call->acks_tail; - loop != call->acks_head || stop; - loop = (loop + 1) & (call->acks_winsz - 1) - ) { - p_txb = call->acks_window + loop; - smp_read_barrier_depends(); - if (*p_txb & 1) - continue; - - txb = (struct sk_buff *) *p_txb; - sp = rxrpc_skb(txb); - - if (sp->need_resend) { - sp->need_resend = false; - - /* each Tx packet has a new serial number */ - sp->hdr.serial = atomic_inc_return(&call->conn->serial); - - whdr = (struct rxrpc_wire_header *)txb->head; - whdr->serial = htonl(sp->hdr.serial); - - _proto("Tx DATA %%%u { #%d }", - sp->hdr.serial, sp->hdr.seq); - if (rxrpc_send_packet(call->conn->trans, txb) < 0) { - stop = true; - sp->resend_at = jiffies + 3; - } else { - sp->resend_at = - jiffies + rxrpc_resend_timeout; - } - } - - if (time_after_eq(jiffies + 1, sp->resend_at)) { - sp->need_resend = true; - resend |= 1; - } else if (resend & 2) { - if (time_before(sp->resend_at, resend_at)) - resend_at = sp->resend_at; - } else { - resend_at = sp->resend_at; - resend |= 2; - } - } - - rxrpc_set_resend(call, resend, resend_at); - _leave(""); -} - -/* - * handle resend timer expiry - */ -static void rxrpc_resend_timer(struct rxrpc_call *call) -{ - struct rxrpc_skb_priv *sp; - struct sk_buff *txb; - unsigned long *p_txb, resend_at; - int loop; - u8 resend; - - _enter("%d,%d,%d", - call->acks_tail, call->acks_unacked, call->acks_head); - - if (call->state >= RXRPC_CALL_COMPLETE) - return; - - resend = 0; - resend_at = 0; - - for (loop = call->acks_unacked; - loop != call->acks_head; - loop = (loop + 1) & (call->acks_winsz - 1) - ) { - p_txb = call->acks_window + loop; - smp_read_barrier_depends(); - txb = (struct sk_buff *) (*p_txb & ~1); - sp = rxrpc_skb(txb); - - ASSERT(!(*p_txb & 1)); - - if (sp->need_resend) { - ; - } else if (time_after_eq(jiffies + 1, sp->resend_at)) { - sp->need_resend = true; - resend |= 1; - } else if (resend & 2) { - if (time_before(sp->resend_at, resend_at)) - resend_at = sp->resend_at; - } else { - resend_at = sp->resend_at; - resend |= 2; - } - } - - rxrpc_set_resend(call, resend, resend_at); - _leave(""); -} - -/* - * process soft ACKs of our transmitted packets - * - these indicate packets the peer has or has not received, but hasn't yet - * given to the consumer, and so can still be discarded and re-requested - */ -static int rxrpc_process_soft_ACKs(struct rxrpc_call *call, - struct rxrpc_ackpacket *ack, - struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp; - struct sk_buff *txb; - unsigned long *p_txb, resend_at; - int loop; - u8 sacks[RXRPC_MAXACKS], resend; - - _enter("{%d,%d},{%d},", - call->acks_hard, - CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz), - ack->nAcks); - - if (skb_copy_bits(skb, 0, sacks, ack->nAcks) < 0) - goto protocol_error; - - resend = 0; - resend_at = 0; - for (loop = 0; loop < ack->nAcks; loop++) { - p_txb = call->acks_window; - p_txb += (call->acks_tail + loop) & (call->acks_winsz - 1); - smp_read_barrier_depends(); - txb = (struct sk_buff *) (*p_txb & ~1); - sp = rxrpc_skb(txb); - - switch (sacks[loop]) { - case RXRPC_ACK_TYPE_ACK: - sp->need_resend = false; - *p_txb |= 1; - break; - case RXRPC_ACK_TYPE_NACK: - sp->need_resend = true; - *p_txb &= ~1; - resend = 1; - break; - default: - _debug("Unsupported ACK type %d", sacks[loop]); - goto protocol_error; - } - } - - smp_mb(); - call->acks_unacked = (call->acks_tail + loop) & (call->acks_winsz - 1); - - /* anything not explicitly ACK'd is implicitly NACK'd, but may just not - * have been received or processed yet by the far end */ - for (loop = call->acks_unacked; - loop != call->acks_head; - loop = (loop + 1) & (call->acks_winsz - 1) - ) { - p_txb = call->acks_window + loop; - smp_read_barrier_depends(); - txb = (struct sk_buff *) (*p_txb & ~1); - sp = rxrpc_skb(txb); - - if (*p_txb & 1) { - /* packet must have been discarded */ - sp->need_resend = true; - *p_txb &= ~1; - resend |= 1; - } else if (sp->need_resend) { - ; - } else if (time_after_eq(jiffies + 1, sp->resend_at)) { - sp->need_resend = true; - resend |= 1; - } else if (resend & 2) { - if (time_before(sp->resend_at, resend_at)) - resend_at = sp->resend_at; - } else { - resend_at = sp->resend_at; - resend |= 2; - } - } - - rxrpc_set_resend(call, resend, resend_at); - _leave(" = 0"); - return 0; - -protocol_error: - _leave(" = -EPROTO"); - return -EPROTO; -} - -/* - * discard hard-ACK'd packets from the Tx window - */ -static void rxrpc_rotate_tx_window(struct rxrpc_call *call, u32 hard) -{ - unsigned long _skb; - int tail = call->acks_tail, old_tail; - int win = CIRC_CNT(call->acks_head, tail, call->acks_winsz); - - _enter("{%u,%u},%u", call->acks_hard, win, hard); - - ASSERTCMP(hard - call->acks_hard, <=, win); - - while (call->acks_hard < hard) { - smp_read_barrier_depends(); - _skb = call->acks_window[tail] & ~1; - rxrpc_free_skb((struct sk_buff *) _skb); - old_tail = tail; - tail = (tail + 1) & (call->acks_winsz - 1); - call->acks_tail = tail; - if (call->acks_unacked == old_tail) - call->acks_unacked = tail; - call->acks_hard++; - } - - wake_up(&call->tx_waitq); -} - -/* - * clear the Tx window in the event of a failure - */ -static void rxrpc_clear_tx_window(struct rxrpc_call *call) -{ - rxrpc_rotate_tx_window(call, atomic_read(&call->sequence)); -} - -/* - * drain the out of sequence received packet queue into the packet Rx queue - */ -static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call) -{ - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; - bool terminal; - int ret; - - _enter("{%d,%d}", call->rx_data_post, call->rx_first_oos); - - spin_lock_bh(&call->lock); - - ret = -ECONNRESET; - if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) - goto socket_unavailable; - - skb = skb_dequeue(&call->rx_oos_queue); - if (skb) { - sp = rxrpc_skb(skb); - - _debug("drain OOS packet %d [%d]", - sp->hdr.seq, call->rx_first_oos); - - if (sp->hdr.seq != call->rx_first_oos) { - skb_queue_head(&call->rx_oos_queue, skb); - call->rx_first_oos = rxrpc_skb(skb)->hdr.seq; - _debug("requeue %p {%u}", skb, call->rx_first_oos); - } else { - skb->mark = RXRPC_SKB_MARK_DATA; - terminal = ((sp->hdr.flags & RXRPC_LAST_PACKET) && - !(sp->hdr.flags & RXRPC_CLIENT_INITIATED)); - ret = rxrpc_queue_rcv_skb(call, skb, true, terminal); - BUG_ON(ret < 0); - _debug("drain #%u", call->rx_data_post); - call->rx_data_post++; - - /* find out what the next packet is */ - skb = skb_peek(&call->rx_oos_queue); - if (skb) - call->rx_first_oos = rxrpc_skb(skb)->hdr.seq; - else - call->rx_first_oos = 0; - _debug("peek %p {%u}", skb, call->rx_first_oos); - } - } - - ret = 0; -socket_unavailable: - spin_unlock_bh(&call->lock); - _leave(" = %d", ret); - return ret; -} - -/* - * insert an out of sequence packet into the buffer - */ -static void rxrpc_insert_oos_packet(struct rxrpc_call *call, - struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp, *psp; - struct sk_buff *p; - u32 seq; - - sp = rxrpc_skb(skb); - seq = sp->hdr.seq; - _enter(",,{%u}", seq); - - skb->destructor = rxrpc_packet_destructor; - ASSERTCMP(sp->call, ==, NULL); - sp->call = call; - rxrpc_get_call(call); - - /* insert into the buffer in sequence order */ - spin_lock_bh(&call->lock); - - skb_queue_walk(&call->rx_oos_queue, p) { - psp = rxrpc_skb(p); - if (psp->hdr.seq > seq) { - _debug("insert oos #%u before #%u", seq, psp->hdr.seq); - skb_insert(p, skb, &call->rx_oos_queue); - goto inserted; - } - } - - _debug("append oos #%u", seq); - skb_queue_tail(&call->rx_oos_queue, skb); -inserted: - - /* we might now have a new front to the queue */ - if (call->rx_first_oos == 0 || seq < call->rx_first_oos) - call->rx_first_oos = seq; - - read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE && - call->rx_data_post == call->rx_first_oos) { - _debug("drain rx oos now"); - set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events); - } - read_unlock(&call->state_lock); - - spin_unlock_bh(&call->lock); - _leave(" [stored #%u]", call->rx_first_oos); -} - -/* - * clear the Tx window on final ACK reception - */ -static void rxrpc_zap_tx_window(struct rxrpc_call *call) -{ - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; - unsigned long _skb, *acks_window; - u8 winsz = call->acks_winsz; - int tail; - - acks_window = call->acks_window; - call->acks_window = NULL; - - while (CIRC_CNT(call->acks_head, call->acks_tail, winsz) > 0) { - tail = call->acks_tail; - smp_read_barrier_depends(); - _skb = acks_window[tail] & ~1; - smp_mb(); - call->acks_tail = (call->acks_tail + 1) & (winsz - 1); - - skb = (struct sk_buff *) _skb; - sp = rxrpc_skb(skb); - _debug("+++ clear Tx %u", sp->hdr.seq); - rxrpc_free_skb(skb); - } - - kfree(acks_window); -} - -/* - * process the extra information that may be appended to an ACK packet - */ -static void rxrpc_extract_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, - unsigned int latest, int nAcks) -{ - struct rxrpc_ackinfo ackinfo; - struct rxrpc_peer *peer; - unsigned int mtu; - - if (skb_copy_bits(skb, nAcks + 3, &ackinfo, sizeof(ackinfo)) < 0) { - _leave(" [no ackinfo]"); - return; - } - - _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }", - latest, - ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU), - ntohl(ackinfo.rwind), ntohl(ackinfo.jumbo_max)); - - mtu = min(ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU)); - - peer = call->conn->trans->peer; - if (mtu < peer->maxdata) { - spin_lock_bh(&peer->lock); - peer->maxdata = mtu; - peer->mtu = mtu + peer->hdrsize; - spin_unlock_bh(&peer->lock); - _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata); - } -} - -/* - * process packets in the reception queue - */ -static int rxrpc_process_rx_queue(struct rxrpc_call *call, - u32 *_abort_code) -{ - struct rxrpc_ackpacket ack; - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; - bool post_ACK; - int latest; - u32 hard, tx; - - _enter(""); - -process_further: - skb = skb_dequeue(&call->rx_queue); - if (!skb) - return -EAGAIN; - - _net("deferred skb %p", skb); - - sp = rxrpc_skb(skb); - - _debug("process %s [st %d]", rxrpc_pkts[sp->hdr.type], call->state); - - post_ACK = false; - - switch (sp->hdr.type) { - /* data packets that wind up here have been received out of - * order, need security processing or are jumbo packets */ - case RXRPC_PACKET_TYPE_DATA: - _proto("OOSQ DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); - - /* secured packets must be verified and possibly decrypted */ - if (call->conn->security->verify_packet(call, skb, - _abort_code) < 0) - goto protocol_error; - - rxrpc_insert_oos_packet(call, skb); - goto process_further; - - /* partial ACK to process */ - case RXRPC_PACKET_TYPE_ACK: - if (skb_copy_bits(skb, 0, &ack, sizeof(ack)) < 0) { - _debug("extraction failure"); - goto protocol_error; - } - if (!skb_pull(skb, sizeof(ack))) - BUG(); - - latest = sp->hdr.serial; - hard = ntohl(ack.firstPacket); - tx = atomic_read(&call->sequence); - - _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", - latest, - ntohs(ack.maxSkew), - hard, - ntohl(ack.previousPacket), - ntohl(ack.serial), - rxrpc_acks(ack.reason), - ack.nAcks); - - rxrpc_extract_ackinfo(call, skb, latest, ack.nAcks); - - if (ack.reason == RXRPC_ACK_PING) { - _proto("Rx ACK %%%u PING Request", latest); - rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE, - sp->hdr.serial, true); - } - - /* discard any out-of-order or duplicate ACKs */ - if (latest - call->acks_latest <= 0) { - _debug("discard ACK %d <= %d", - latest, call->acks_latest); - goto discard; - } - call->acks_latest = latest; - - if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST && - call->state != RXRPC_CALL_CLIENT_AWAIT_REPLY && - call->state != RXRPC_CALL_SERVER_SEND_REPLY && - call->state != RXRPC_CALL_SERVER_AWAIT_ACK) - goto discard; - - _debug("Tx=%d H=%u S=%d", tx, call->acks_hard, call->state); - - if (hard > 0) { - if (hard - 1 > tx) { - _debug("hard-ACK'd packet %d not transmitted" - " (%d top)", - hard - 1, tx); - goto protocol_error; - } - - if ((call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY || - call->state == RXRPC_CALL_SERVER_AWAIT_ACK) && - hard > tx) { - call->acks_hard = tx; - goto all_acked; - } - - smp_rmb(); - rxrpc_rotate_tx_window(call, hard - 1); - } - - if (ack.nAcks > 0) { - if (hard - 1 + ack.nAcks > tx) { - _debug("soft-ACK'd packet %d+%d not" - " transmitted (%d top)", - hard - 1, ack.nAcks, tx); - goto protocol_error; - } - - if (rxrpc_process_soft_ACKs(call, &ack, skb) < 0) - goto protocol_error; - } - goto discard; - - /* complete ACK to process */ - case RXRPC_PACKET_TYPE_ACKALL: - goto all_acked; - - /* abort and busy are handled elsewhere */ - case RXRPC_PACKET_TYPE_BUSY: - case RXRPC_PACKET_TYPE_ABORT: - BUG(); - - /* connection level events - also handled elsewhere */ - case RXRPC_PACKET_TYPE_CHALLENGE: - case RXRPC_PACKET_TYPE_RESPONSE: - case RXRPC_PACKET_TYPE_DEBUG: - BUG(); - } - - /* if we've had a hard ACK that covers all the packets we've sent, then - * that ends that phase of the operation */ -all_acked: - write_lock_bh(&call->state_lock); - _debug("ack all %d", call->state); - - switch (call->state) { - case RXRPC_CALL_CLIENT_AWAIT_REPLY: - call->state = RXRPC_CALL_CLIENT_RECV_REPLY; - break; - case RXRPC_CALL_SERVER_AWAIT_ACK: - _debug("srv complete"); - call->state = RXRPC_CALL_COMPLETE; - post_ACK = true; - break; - case RXRPC_CALL_CLIENT_SEND_REQUEST: - case RXRPC_CALL_SERVER_RECV_REQUEST: - goto protocol_error_unlock; /* can't occur yet */ - default: - write_unlock_bh(&call->state_lock); - goto discard; /* assume packet left over from earlier phase */ - } - - write_unlock_bh(&call->state_lock); - - /* if all the packets we sent are hard-ACK'd, then we can discard - * whatever we've got left */ - _debug("clear Tx %d", - CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz)); - - del_timer_sync(&call->resend_timer); - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); - - if (call->acks_window) - rxrpc_zap_tx_window(call); - - if (post_ACK) { - /* post the final ACK message for userspace to pick up */ - _debug("post ACK"); - skb->mark = RXRPC_SKB_MARK_FINAL_ACK; - sp->call = call; - rxrpc_get_call(call); - spin_lock_bh(&call->lock); - if (rxrpc_queue_rcv_skb(call, skb, true, true) < 0) - BUG(); - spin_unlock_bh(&call->lock); - goto process_further; - } - -discard: - rxrpc_free_skb(skb); - goto process_further; - -protocol_error_unlock: - write_unlock_bh(&call->state_lock); -protocol_error: - rxrpc_free_skb(skb); - _leave(" = -EPROTO"); - return -EPROTO; -} - -/* - * post a message to the socket Rx queue for recvmsg() to pick up - */ -static int rxrpc_post_message(struct rxrpc_call *call, u32 mark, u32 error, - bool fatal) -{ - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; - int ret; - - _enter("{%d,%lx},%u,%u,%d", - call->debug_id, call->flags, mark, error, fatal); - - /* remove timers and things for fatal messages */ - if (fatal) { - del_timer_sync(&call->resend_timer); - del_timer_sync(&call->ack_timer); - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - } - - if (mark != RXRPC_SKB_MARK_NEW_CALL && - !test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { - _leave("[no userid]"); - return 0; - } - - if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) { - skb = alloc_skb(0, GFP_NOFS); - if (!skb) - return -ENOMEM; - - rxrpc_new_skb(skb); - - skb->mark = mark; - - sp = rxrpc_skb(skb); - memset(sp, 0, sizeof(*sp)); - sp->error = error; - sp->call = call; - rxrpc_get_call(call); - - spin_lock_bh(&call->lock); - ret = rxrpc_queue_rcv_skb(call, skb, true, fatal); - spin_unlock_bh(&call->lock); - BUG_ON(ret < 0); - } - - return 0; -} - -/* - * handle background processing of incoming call packets and ACK / abort - * generation - */ -void rxrpc_process_call(struct work_struct *work) -{ - struct rxrpc_call *call = - container_of(work, struct rxrpc_call, processor); - struct rxrpc_wire_header whdr; - struct rxrpc_ackpacket ack; - struct rxrpc_ackinfo ackinfo; - struct msghdr msg; - struct kvec iov[5]; - enum rxrpc_call_event genbit; - unsigned long bits; - __be32 data, pad; - size_t len; - int loop, nbit, ioc, ret, mtu; - u32 serial, abort_code = RX_PROTOCOL_ERROR; - u8 *acks = NULL; - - //printk("\n--------------------\n"); - _enter("{%d,%s,%lx} [%lu]", - call->debug_id, rxrpc_call_states[call->state], call->events, - (jiffies - call->creation_jif) / (HZ / 10)); - - if (test_and_set_bit(RXRPC_CALL_PROC_BUSY, &call->flags)) { - _debug("XXXXXXXXXXXXX RUNNING ON MULTIPLE CPUS XXXXXXXXXXXXX"); - return; - } - - /* there's a good chance we're going to have to send a message, so set - * one up in advance */ - msg.msg_name = &call->conn->trans->peer->srx.transport; - msg.msg_namelen = call->conn->trans->peer->srx.transport_len; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; - - whdr.epoch = htonl(call->conn->epoch); - whdr.cid = htonl(call->cid); - whdr.callNumber = htonl(call->call_id); - whdr.seq = 0; - whdr.type = RXRPC_PACKET_TYPE_ACK; - whdr.flags = call->conn->out_clientflag; - whdr.userStatus = 0; - whdr.securityIndex = call->conn->security_ix; - whdr._rsvd = 0; - whdr.serviceId = htons(call->service_id); - - memset(iov, 0, sizeof(iov)); - iov[0].iov_base = &whdr; - iov[0].iov_len = sizeof(whdr); - - /* deal with events of a final nature */ - if (test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { - rxrpc_release_call(call); - clear_bit(RXRPC_CALL_EV_RELEASE, &call->events); - } - - if (test_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events)) { - int error; - - clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); - clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events); - clear_bit(RXRPC_CALL_EV_ABORT, &call->events); - - error = call->conn->trans->peer->net_error; - _debug("post net error %d", error); - - if (rxrpc_post_message(call, RXRPC_SKB_MARK_NET_ERROR, - error, true) < 0) - goto no_mem; - clear_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events); - goto kill_ACKs; - } - - if (test_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events)) { - ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE); - - clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events); - clear_bit(RXRPC_CALL_EV_ABORT, &call->events); - - _debug("post conn abort"); - - if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, - call->conn->error, true) < 0) - goto no_mem; - clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); - goto kill_ACKs; - } - - if (test_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) { - whdr.type = RXRPC_PACKET_TYPE_BUSY; - genbit = RXRPC_CALL_EV_REJECT_BUSY; - goto send_message; - } - - if (test_bit(RXRPC_CALL_EV_ABORT, &call->events)) { - ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE); - - if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, - ECONNABORTED, true) < 0) - goto no_mem; - whdr.type = RXRPC_PACKET_TYPE_ABORT; - data = htonl(call->local_abort); - iov[1].iov_base = &data; - iov[1].iov_len = sizeof(data); - genbit = RXRPC_CALL_EV_ABORT; - goto send_message; - } - - if (test_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events)) { - genbit = RXRPC_CALL_EV_ACK_FINAL; - - ack.bufferSpace = htons(8); - ack.maxSkew = 0; - ack.serial = 0; - ack.reason = RXRPC_ACK_IDLE; - ack.nAcks = 0; - call->ackr_reason = 0; - - spin_lock_bh(&call->lock); - ack.serial = htonl(call->ackr_serial); - ack.previousPacket = htonl(call->ackr_prev_seq); - ack.firstPacket = htonl(call->rx_data_eaten + 1); - spin_unlock_bh(&call->lock); - - pad = 0; - - iov[1].iov_base = &ack; - iov[1].iov_len = sizeof(ack); - iov[2].iov_base = &pad; - iov[2].iov_len = 3; - iov[3].iov_base = &ackinfo; - iov[3].iov_len = sizeof(ackinfo); - goto send_ACK; - } - - if (call->events & ((1 << RXRPC_CALL_EV_RCVD_BUSY) | - (1 << RXRPC_CALL_EV_RCVD_ABORT)) - ) { - u32 mark; - - if (test_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events)) - mark = RXRPC_SKB_MARK_REMOTE_ABORT; - else - mark = RXRPC_SKB_MARK_BUSY; - - _debug("post abort/busy"); - rxrpc_clear_tx_window(call); - if (rxrpc_post_message(call, mark, ECONNABORTED, true) < 0) - goto no_mem; - - clear_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events); - clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); - goto kill_ACKs; - } - - if (test_and_clear_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events)) { - _debug("do implicit ackall"); - rxrpc_clear_tx_window(call); - } - - if (test_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events)) { - write_lock_bh(&call->state_lock); - if (call->state <= RXRPC_CALL_COMPLETE) { - call->state = RXRPC_CALL_LOCALLY_ABORTED; - call->local_abort = RX_CALL_TIMEOUT; - set_bit(RXRPC_CALL_EV_ABORT, &call->events); - } - write_unlock_bh(&call->state_lock); - - _debug("post timeout"); - if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, - ETIME, true) < 0) - goto no_mem; - - clear_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events); - goto kill_ACKs; - } - - /* deal with assorted inbound messages */ - if (!skb_queue_empty(&call->rx_queue)) { - switch (rxrpc_process_rx_queue(call, &abort_code)) { - case 0: - case -EAGAIN: - break; - case -ENOMEM: - goto no_mem; - case -EKEYEXPIRED: - case -EKEYREJECTED: - case -EPROTO: - rxrpc_abort_call(call, abort_code); - goto kill_ACKs; - } - } - - /* handle resending */ - if (test_and_clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) - rxrpc_resend_timer(call); - if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events)) - rxrpc_resend(call); - - /* consider sending an ordinary ACK */ - if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) { - _debug("send ACK: window: %d - %d { %lx }", - call->rx_data_eaten, call->ackr_win_top, - call->ackr_window[0]); - - if (call->state > RXRPC_CALL_SERVER_ACK_REQUEST && - call->ackr_reason != RXRPC_ACK_PING_RESPONSE) { - /* ACK by sending reply DATA packet in this state */ - clear_bit(RXRPC_CALL_EV_ACK, &call->events); - goto maybe_reschedule; - } - - genbit = RXRPC_CALL_EV_ACK; - - acks = kzalloc(call->ackr_win_top - call->rx_data_eaten, - GFP_NOFS); - if (!acks) - goto no_mem; - - //hdr.flags = RXRPC_SLOW_START_OK; - ack.bufferSpace = htons(8); - ack.maxSkew = 0; - - spin_lock_bh(&call->lock); - ack.reason = call->ackr_reason; - ack.serial = htonl(call->ackr_serial); - ack.previousPacket = htonl(call->ackr_prev_seq); - ack.firstPacket = htonl(call->rx_data_eaten + 1); - - ack.nAcks = 0; - for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) { - nbit = loop * BITS_PER_LONG; - for (bits = call->ackr_window[loop]; bits; bits >>= 1 - ) { - _debug("- l=%d n=%d b=%lx", loop, nbit, bits); - if (bits & 1) { - acks[nbit] = RXRPC_ACK_TYPE_ACK; - ack.nAcks = nbit + 1; - } - nbit++; - } - } - call->ackr_reason = 0; - spin_unlock_bh(&call->lock); - - pad = 0; - - iov[1].iov_base = &ack; - iov[1].iov_len = sizeof(ack); - iov[2].iov_base = acks; - iov[2].iov_len = ack.nAcks; - iov[3].iov_base = &pad; - iov[3].iov_len = 3; - iov[4].iov_base = &ackinfo; - iov[4].iov_len = sizeof(ackinfo); - - switch (ack.reason) { - case RXRPC_ACK_REQUESTED: - case RXRPC_ACK_DUPLICATE: - case RXRPC_ACK_OUT_OF_SEQUENCE: - case RXRPC_ACK_EXCEEDS_WINDOW: - case RXRPC_ACK_NOSPACE: - case RXRPC_ACK_PING: - case RXRPC_ACK_PING_RESPONSE: - goto send_ACK_with_skew; - case RXRPC_ACK_DELAY: - case RXRPC_ACK_IDLE: - goto send_ACK; - } - } - - /* handle completion of security negotiations on an incoming - * connection */ - if (test_and_clear_bit(RXRPC_CALL_EV_SECURED, &call->events)) { - _debug("secured"); - spin_lock_bh(&call->lock); - - if (call->state == RXRPC_CALL_SERVER_SECURING) { - _debug("securing"); - write_lock(&call->conn->lock); - if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { - _debug("not released"); - call->state = RXRPC_CALL_SERVER_ACCEPTING; - list_move_tail(&call->accept_link, - &call->socket->acceptq); - } - write_unlock(&call->conn->lock); - read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE) - set_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events); - read_unlock(&call->state_lock); - } - - spin_unlock_bh(&call->lock); - if (!test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events)) - goto maybe_reschedule; - } - - /* post a notification of an acceptable connection to the app */ - if (test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events)) { - _debug("post accept"); - if (rxrpc_post_message(call, RXRPC_SKB_MARK_NEW_CALL, - 0, false) < 0) - goto no_mem; - clear_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events); - goto maybe_reschedule; - } - - /* handle incoming call acceptance */ - if (test_and_clear_bit(RXRPC_CALL_EV_ACCEPTED, &call->events)) { - _debug("accepted"); - ASSERTCMP(call->rx_data_post, ==, 0); - call->rx_data_post = 1; - read_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE) - set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events); - read_unlock_bh(&call->state_lock); - } - - /* drain the out of sequence received packet queue into the packet Rx - * queue */ - if (test_and_clear_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events)) { - while (call->rx_data_post == call->rx_first_oos) - if (rxrpc_drain_rx_oos_queue(call) < 0) - break; - goto maybe_reschedule; - } - - /* other events may have been raised since we started checking */ - goto maybe_reschedule; - -send_ACK_with_skew: - ack.maxSkew = htons(atomic_read(&call->conn->hi_serial) - - ntohl(ack.serial)); -send_ACK: - mtu = call->conn->trans->peer->if_mtu; - mtu -= call->conn->trans->peer->hdrsize; - ackinfo.maxMTU = htonl(mtu); - ackinfo.rwind = htonl(rxrpc_rx_window_size); - - /* permit the peer to send us jumbo packets if it wants to */ - ackinfo.rxMTU = htonl(rxrpc_rx_mtu); - ackinfo.jumbo_max = htonl(rxrpc_rx_jumbo_max); - - serial = atomic_inc_return(&call->conn->serial); - whdr.serial = htonl(serial); - _proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", - serial, - ntohs(ack.maxSkew), - ntohl(ack.firstPacket), - ntohl(ack.previousPacket), - ntohl(ack.serial), - rxrpc_acks(ack.reason), - ack.nAcks); - - del_timer_sync(&call->ack_timer); - if (ack.nAcks > 0) - set_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags); - goto send_message_2; - -send_message: - _debug("send message"); - - serial = atomic_inc_return(&call->conn->serial); - whdr.serial = htonl(serial); - _proto("Tx %s %%%u", rxrpc_pkts[whdr.type], serial); -send_message_2: - - len = iov[0].iov_len; - ioc = 1; - if (iov[4].iov_len) { - ioc = 5; - len += iov[4].iov_len; - len += iov[3].iov_len; - len += iov[2].iov_len; - len += iov[1].iov_len; - } else if (iov[3].iov_len) { - ioc = 4; - len += iov[3].iov_len; - len += iov[2].iov_len; - len += iov[1].iov_len; - } else if (iov[2].iov_len) { - ioc = 3; - len += iov[2].iov_len; - len += iov[1].iov_len; - } else if (iov[1].iov_len) { - ioc = 2; - len += iov[1].iov_len; - } - - ret = kernel_sendmsg(call->conn->trans->local->socket, - &msg, iov, ioc, len); - if (ret < 0) { - _debug("sendmsg failed: %d", ret); - read_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_DEAD) - rxrpc_queue_call(call); - read_unlock_bh(&call->state_lock); - goto error; - } - - switch (genbit) { - case RXRPC_CALL_EV_ABORT: - clear_bit(genbit, &call->events); - clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); - goto kill_ACKs; - - case RXRPC_CALL_EV_ACK_FINAL: - write_lock_bh(&call->state_lock); - if (call->state == RXRPC_CALL_CLIENT_FINAL_ACK) - call->state = RXRPC_CALL_COMPLETE; - write_unlock_bh(&call->state_lock); - goto kill_ACKs; - - default: - clear_bit(genbit, &call->events); - switch (call->state) { - case RXRPC_CALL_CLIENT_AWAIT_REPLY: - case RXRPC_CALL_CLIENT_RECV_REPLY: - case RXRPC_CALL_SERVER_RECV_REQUEST: - case RXRPC_CALL_SERVER_ACK_REQUEST: - _debug("start ACK timer"); - rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, - call->ackr_serial, false); - default: - break; - } - goto maybe_reschedule; - } - -kill_ACKs: - del_timer_sync(&call->ack_timer); - if (test_and_clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events)) - rxrpc_put_call(call); - clear_bit(RXRPC_CALL_EV_ACK, &call->events); - -maybe_reschedule: - if (call->events || !skb_queue_empty(&call->rx_queue)) { - read_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_DEAD) - rxrpc_queue_call(call); - read_unlock_bh(&call->state_lock); - } - - /* don't leave aborted connections on the accept queue */ - if (call->state >= RXRPC_CALL_COMPLETE && - !list_empty(&call->accept_link)) { - _debug("X unlinking once-pending call %p { e=%lx f=%lx c=%x }", - call, call->events, call->flags, call->conn->cid); - - read_lock_bh(&call->state_lock); - if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) - rxrpc_queue_call(call); - read_unlock_bh(&call->state_lock); - } - -error: - clear_bit(RXRPC_CALL_PROC_BUSY, &call->flags); - kfree(acks); - - /* because we don't want two CPUs both processing the work item for one - * call at the same time, we use a flag to note when it's busy; however - * this means there's a race between clearing the flag and setting the - * work pending bit and the work item being processed again */ - if (call->events && !work_pending(&call->processor)) { - _debug("jumpstart %x", call->conn->cid); - rxrpc_queue_call(call); - } - - _leave(""); - return; - -no_mem: - _debug("out of memory"); - goto maybe_reschedule; -} diff --git a/net/rxrpc/ar-call.c b/net/rxrpc/ar-call.c deleted file mode 100644 index 68125dc4cb7c..000000000000 --- a/net/rxrpc/ar-call.c +++ /dev/null @@ -1,980 +0,0 @@ -/* RxRPC individual remote procedure call handling - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include "ar-internal.h" - -/* - * Maximum lifetime of a call (in jiffies). - */ -unsigned int rxrpc_max_call_lifetime = 60 * HZ; - -/* - * Time till dead call expires after last use (in jiffies). - */ -unsigned int rxrpc_dead_call_expiry = 2 * HZ; - -const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { - [RXRPC_CALL_CLIENT_SEND_REQUEST] = "ClSndReq", - [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", - [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", - [RXRPC_CALL_CLIENT_FINAL_ACK] = "ClFnlACK", - [RXRPC_CALL_SERVER_SECURING] = "SvSecure", - [RXRPC_CALL_SERVER_ACCEPTING] = "SvAccept", - [RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq", - [RXRPC_CALL_SERVER_ACK_REQUEST] = "SvAckReq", - [RXRPC_CALL_SERVER_SEND_REPLY] = "SvSndRpl", - [RXRPC_CALL_SERVER_AWAIT_ACK] = "SvAwtACK", - [RXRPC_CALL_COMPLETE] = "Complete", - [RXRPC_CALL_SERVER_BUSY] = "SvBusy ", - [RXRPC_CALL_REMOTELY_ABORTED] = "RmtAbort", - [RXRPC_CALL_LOCALLY_ABORTED] = "LocAbort", - [RXRPC_CALL_NETWORK_ERROR] = "NetError", - [RXRPC_CALL_DEAD] = "Dead ", -}; - -struct kmem_cache *rxrpc_call_jar; -LIST_HEAD(rxrpc_calls); -DEFINE_RWLOCK(rxrpc_call_lock); - -static void rxrpc_destroy_call(struct work_struct *work); -static void rxrpc_call_life_expired(unsigned long _call); -static void rxrpc_dead_call_expired(unsigned long _call); -static void rxrpc_ack_time_expired(unsigned long _call); -static void rxrpc_resend_time_expired(unsigned long _call); - -static DEFINE_SPINLOCK(rxrpc_call_hash_lock); -static DEFINE_HASHTABLE(rxrpc_call_hash, 10); - -/* - * Hash function for rxrpc_call_hash - */ -static unsigned long rxrpc_call_hashfunc( - u8 in_clientflag, - u32 cid, - u32 call_id, - u32 epoch, - u16 service_id, - sa_family_t proto, - void *localptr, - unsigned int addr_size, - const u8 *peer_addr) -{ - const u16 *p; - unsigned int i; - unsigned long key; - - _enter(""); - - key = (unsigned long)localptr; - /* We just want to add up the __be32 values, so forcing the - * cast should be okay. - */ - key += epoch; - key += service_id; - key += call_id; - key += (cid & RXRPC_CIDMASK) >> RXRPC_CIDSHIFT; - key += cid & RXRPC_CHANNELMASK; - key += in_clientflag; - key += proto; - /* Step through the peer address in 16-bit portions for speed */ - for (i = 0, p = (const u16 *)peer_addr; i < addr_size >> 1; i++, p++) - key += *p; - _leave(" key = 0x%lx", key); - return key; -} - -/* - * Add a call to the hashtable - */ -static void rxrpc_call_hash_add(struct rxrpc_call *call) -{ - unsigned long key; - unsigned int addr_size = 0; - - _enter(""); - switch (call->proto) { - case AF_INET: - addr_size = sizeof(call->peer_ip.ipv4_addr); - break; - case AF_INET6: - addr_size = sizeof(call->peer_ip.ipv6_addr); - break; - default: - break; - } - key = rxrpc_call_hashfunc(call->in_clientflag, call->cid, - call->call_id, call->epoch, - call->service_id, call->proto, - call->conn->trans->local, addr_size, - call->peer_ip.ipv6_addr); - /* Store the full key in the call */ - call->hash_key = key; - spin_lock(&rxrpc_call_hash_lock); - hash_add_rcu(rxrpc_call_hash, &call->hash_node, key); - spin_unlock(&rxrpc_call_hash_lock); - _leave(""); -} - -/* - * Remove a call from the hashtable - */ -static void rxrpc_call_hash_del(struct rxrpc_call *call) -{ - _enter(""); - spin_lock(&rxrpc_call_hash_lock); - hash_del_rcu(&call->hash_node); - spin_unlock(&rxrpc_call_hash_lock); - _leave(""); -} - -/* - * Find a call in the hashtable and return it, or NULL if it - * isn't there. - */ -struct rxrpc_call *rxrpc_find_call_hash( - struct rxrpc_host_header *hdr, - void *localptr, - sa_family_t proto, - const void *peer_addr) -{ - unsigned long key; - unsigned int addr_size = 0; - struct rxrpc_call *call = NULL; - struct rxrpc_call *ret = NULL; - u8 in_clientflag = hdr->flags & RXRPC_CLIENT_INITIATED; - - _enter(""); - switch (proto) { - case AF_INET: - addr_size = sizeof(call->peer_ip.ipv4_addr); - break; - case AF_INET6: - addr_size = sizeof(call->peer_ip.ipv6_addr); - break; - default: - break; - } - - key = rxrpc_call_hashfunc(in_clientflag, hdr->cid, hdr->callNumber, - hdr->epoch, hdr->serviceId, - proto, localptr, addr_size, - peer_addr); - hash_for_each_possible_rcu(rxrpc_call_hash, call, hash_node, key) { - if (call->hash_key == key && - call->call_id == hdr->callNumber && - call->cid == hdr->cid && - call->in_clientflag == in_clientflag && - call->service_id == hdr->serviceId && - call->proto == proto && - call->local == localptr && - memcmp(call->peer_ip.ipv6_addr, peer_addr, - addr_size) == 0 && - call->epoch == hdr->epoch) { - ret = call; - break; - } - } - _leave(" = %p", ret); - return ret; -} - -/* - * find an extant server call - * - called in process context with IRQs enabled - */ -struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *rx, - unsigned long user_call_ID) -{ - struct rxrpc_call *call; - struct rb_node *p; - - _enter("%p,%lx", rx, user_call_ID); - - read_lock(&rx->call_lock); - - p = rx->calls.rb_node; - while (p) { - call = rb_entry(p, struct rxrpc_call, sock_node); - - if (user_call_ID < call->user_call_ID) - p = p->rb_left; - else if (user_call_ID > call->user_call_ID) - p = p->rb_right; - else - goto found_extant_call; - } - - read_unlock(&rx->call_lock); - _leave(" = NULL"); - return NULL; - -found_extant_call: - rxrpc_get_call(call); - read_unlock(&rx->call_lock); - _leave(" = %p [%d]", call, atomic_read(&call->usage)); - return call; -} - -/* - * allocate a new call - */ -static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) -{ - struct rxrpc_call *call; - - call = kmem_cache_zalloc(rxrpc_call_jar, gfp); - if (!call) - return NULL; - - call->acks_winsz = 16; - call->acks_window = kmalloc(call->acks_winsz * sizeof(unsigned long), - gfp); - if (!call->acks_window) { - kmem_cache_free(rxrpc_call_jar, call); - return NULL; - } - - setup_timer(&call->lifetimer, &rxrpc_call_life_expired, - (unsigned long) call); - setup_timer(&call->deadspan, &rxrpc_dead_call_expired, - (unsigned long) call); - setup_timer(&call->ack_timer, &rxrpc_ack_time_expired, - (unsigned long) call); - setup_timer(&call->resend_timer, &rxrpc_resend_time_expired, - (unsigned long) call); - INIT_WORK(&call->destroyer, &rxrpc_destroy_call); - INIT_WORK(&call->processor, &rxrpc_process_call); - INIT_LIST_HEAD(&call->accept_link); - skb_queue_head_init(&call->rx_queue); - skb_queue_head_init(&call->rx_oos_queue); - init_waitqueue_head(&call->tx_waitq); - spin_lock_init(&call->lock); - rwlock_init(&call->state_lock); - atomic_set(&call->usage, 1); - call->debug_id = atomic_inc_return(&rxrpc_debug_id); - call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; - - memset(&call->sock_node, 0xed, sizeof(call->sock_node)); - - call->rx_data_expect = 1; - call->rx_data_eaten = 0; - call->rx_first_oos = 0; - call->ackr_win_top = call->rx_data_eaten + 1 + rxrpc_rx_window_size; - call->creation_jif = jiffies; - return call; -} - -/* - * allocate a new client call and attempt to get a connection slot for it - */ -static struct rxrpc_call *rxrpc_alloc_client_call( - struct rxrpc_sock *rx, - struct rxrpc_transport *trans, - struct rxrpc_conn_bundle *bundle, - gfp_t gfp) -{ - struct rxrpc_call *call; - int ret; - - _enter(""); - - ASSERT(rx != NULL); - ASSERT(trans != NULL); - ASSERT(bundle != NULL); - - call = rxrpc_alloc_call(gfp); - if (!call) - return ERR_PTR(-ENOMEM); - - sock_hold(&rx->sk); - call->socket = rx; - call->rx_data_post = 1; - - ret = rxrpc_connect_call(rx, trans, bundle, call, gfp); - if (ret < 0) { - kmem_cache_free(rxrpc_call_jar, call); - return ERR_PTR(ret); - } - - /* Record copies of information for hashtable lookup */ - call->proto = rx->proto; - call->local = trans->local; - switch (call->proto) { - case AF_INET: - call->peer_ip.ipv4_addr = - trans->peer->srx.transport.sin.sin_addr.s_addr; - break; - case AF_INET6: - memcpy(call->peer_ip.ipv6_addr, - trans->peer->srx.transport.sin6.sin6_addr.in6_u.u6_addr8, - sizeof(call->peer_ip.ipv6_addr)); - break; - } - call->epoch = call->conn->epoch; - call->service_id = call->conn->service_id; - call->in_clientflag = call->conn->in_clientflag; - /* Add the new call to the hashtable */ - rxrpc_call_hash_add(call); - - spin_lock(&call->conn->trans->peer->lock); - list_add(&call->error_link, &call->conn->trans->peer->error_targets); - spin_unlock(&call->conn->trans->peer->lock); - - call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime; - add_timer(&call->lifetimer); - - _leave(" = %p", call); - return call; -} - -/* - * set up a call for the given data - * - called in process context with IRQs enabled - */ -struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, - struct rxrpc_transport *trans, - struct rxrpc_conn_bundle *bundle, - unsigned long user_call_ID, - gfp_t gfp) -{ - struct rxrpc_call *call, *xcall; - struct rb_node *parent, **pp; - - _enter("%p,%d,%d,%lx", - rx, trans->debug_id, bundle ? bundle->debug_id : -1, - user_call_ID); - - call = rxrpc_alloc_client_call(rx, trans, bundle, gfp); - if (IS_ERR(call)) { - _leave(" = %ld", PTR_ERR(call)); - return call; - } - - call->user_call_ID = user_call_ID; - __set_bit(RXRPC_CALL_HAS_USERID, &call->flags); - - write_lock(&rx->call_lock); - - pp = &rx->calls.rb_node; - parent = NULL; - while (*pp) { - parent = *pp; - xcall = rb_entry(parent, struct rxrpc_call, sock_node); - - if (user_call_ID < xcall->user_call_ID) - pp = &(*pp)->rb_left; - else if (user_call_ID > xcall->user_call_ID) - pp = &(*pp)->rb_right; - else - goto found_user_ID_now_present; - } - - rxrpc_get_call(call); - - rb_link_node(&call->sock_node, parent, pp); - rb_insert_color(&call->sock_node, &rx->calls); - write_unlock(&rx->call_lock); - - write_lock_bh(&rxrpc_call_lock); - list_add_tail(&call->link, &rxrpc_calls); - write_unlock_bh(&rxrpc_call_lock); - - _net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id); - - _leave(" = %p [new]", call); - return call; - - /* We unexpectedly found the user ID in the list after taking - * the call_lock. This shouldn't happen unless the user races - * with itself and tries to add the same user ID twice at the - * same time in different threads. - */ -found_user_ID_now_present: - write_unlock(&rx->call_lock); - rxrpc_put_call(call); - _leave(" = -EEXIST [%p]", call); - return ERR_PTR(-EEXIST); -} - -/* - * set up an incoming call - * - called in process context with IRQs enabled - */ -struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, - struct rxrpc_connection *conn, - struct rxrpc_host_header *hdr) -{ - struct rxrpc_call *call, *candidate; - struct rb_node **p, *parent; - u32 call_id; - - _enter(",%d", conn->debug_id); - - ASSERT(rx != NULL); - - candidate = rxrpc_alloc_call(GFP_NOIO); - if (!candidate) - return ERR_PTR(-EBUSY); - - candidate->socket = rx; - candidate->conn = conn; - candidate->cid = hdr->cid; - candidate->call_id = hdr->callNumber; - candidate->channel = hdr->cid & RXRPC_CHANNELMASK; - candidate->rx_data_post = 0; - candidate->state = RXRPC_CALL_SERVER_ACCEPTING; - if (conn->security_ix > 0) - candidate->state = RXRPC_CALL_SERVER_SECURING; - - write_lock_bh(&conn->lock); - - /* set the channel for this call */ - call = conn->channels[candidate->channel]; - _debug("channel[%u] is %p", candidate->channel, call); - if (call && call->call_id == hdr->callNumber) { - /* already set; must've been a duplicate packet */ - _debug("extant call [%d]", call->state); - ASSERTCMP(call->conn, ==, conn); - - read_lock(&call->state_lock); - switch (call->state) { - case RXRPC_CALL_LOCALLY_ABORTED: - if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) - rxrpc_queue_call(call); - case RXRPC_CALL_REMOTELY_ABORTED: - read_unlock(&call->state_lock); - goto aborted_call; - default: - rxrpc_get_call(call); - read_unlock(&call->state_lock); - goto extant_call; - } - } - - if (call) { - /* it seems the channel is still in use from the previous call - * - ditch the old binding if its call is now complete */ - _debug("CALL: %u { %s }", - call->debug_id, rxrpc_call_states[call->state]); - - if (call->state >= RXRPC_CALL_COMPLETE) { - conn->channels[call->channel] = NULL; - } else { - write_unlock_bh(&conn->lock); - kmem_cache_free(rxrpc_call_jar, candidate); - _leave(" = -EBUSY"); - return ERR_PTR(-EBUSY); - } - } - - /* check the call number isn't duplicate */ - _debug("check dup"); - call_id = hdr->callNumber; - p = &conn->calls.rb_node; - parent = NULL; - while (*p) { - parent = *p; - call = rb_entry(parent, struct rxrpc_call, conn_node); - - /* The tree is sorted in order of the __be32 value without - * turning it into host order. - */ - if (call_id < call->call_id) - p = &(*p)->rb_left; - else if (call_id > call->call_id) - p = &(*p)->rb_right; - else - goto old_call; - } - - /* make the call available */ - _debug("new call"); - call = candidate; - candidate = NULL; - rb_link_node(&call->conn_node, parent, p); - rb_insert_color(&call->conn_node, &conn->calls); - conn->channels[call->channel] = call; - sock_hold(&rx->sk); - atomic_inc(&conn->usage); - write_unlock_bh(&conn->lock); - - spin_lock(&conn->trans->peer->lock); - list_add(&call->error_link, &conn->trans->peer->error_targets); - spin_unlock(&conn->trans->peer->lock); - - write_lock_bh(&rxrpc_call_lock); - list_add_tail(&call->link, &rxrpc_calls); - write_unlock_bh(&rxrpc_call_lock); - - /* Record copies of information for hashtable lookup */ - call->proto = rx->proto; - call->local = conn->trans->local; - switch (call->proto) { - case AF_INET: - call->peer_ip.ipv4_addr = - conn->trans->peer->srx.transport.sin.sin_addr.s_addr; - break; - case AF_INET6: - memcpy(call->peer_ip.ipv6_addr, - conn->trans->peer->srx.transport.sin6.sin6_addr.in6_u.u6_addr8, - sizeof(call->peer_ip.ipv6_addr)); - break; - default: - break; - } - call->epoch = conn->epoch; - call->service_id = conn->service_id; - call->in_clientflag = conn->in_clientflag; - /* Add the new call to the hashtable */ - rxrpc_call_hash_add(call); - - _net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id); - - call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime; - add_timer(&call->lifetimer); - _leave(" = %p {%d} [new]", call, call->debug_id); - return call; - -extant_call: - write_unlock_bh(&conn->lock); - kmem_cache_free(rxrpc_call_jar, candidate); - _leave(" = %p {%d} [extant]", call, call ? call->debug_id : -1); - return call; - -aborted_call: - write_unlock_bh(&conn->lock); - kmem_cache_free(rxrpc_call_jar, candidate); - _leave(" = -ECONNABORTED"); - return ERR_PTR(-ECONNABORTED); - -old_call: - write_unlock_bh(&conn->lock); - kmem_cache_free(rxrpc_call_jar, candidate); - _leave(" = -ECONNRESET [old]"); - return ERR_PTR(-ECONNRESET); -} - -/* - * detach a call from a socket and set up for release - */ -void rxrpc_release_call(struct rxrpc_call *call) -{ - struct rxrpc_connection *conn = call->conn; - struct rxrpc_sock *rx = call->socket; - - _enter("{%d,%d,%d,%d}", - call->debug_id, atomic_read(&call->usage), - atomic_read(&call->ackr_not_idle), - call->rx_first_oos); - - spin_lock_bh(&call->lock); - if (test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags)) - BUG(); - spin_unlock_bh(&call->lock); - - /* dissociate from the socket - * - the socket's ref on the call is passed to the death timer - */ - _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn); - - write_lock_bh(&rx->call_lock); - if (!list_empty(&call->accept_link)) { - _debug("unlinking once-pending call %p { e=%lx f=%lx }", - call, call->events, call->flags); - ASSERT(!test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); - list_del_init(&call->accept_link); - sk_acceptq_removed(&rx->sk); - } else if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { - rb_erase(&call->sock_node, &rx->calls); - memset(&call->sock_node, 0xdd, sizeof(call->sock_node)); - clear_bit(RXRPC_CALL_HAS_USERID, &call->flags); - } - write_unlock_bh(&rx->call_lock); - - /* free up the channel for reuse */ - spin_lock(&conn->trans->client_lock); - write_lock_bh(&conn->lock); - write_lock(&call->state_lock); - - if (conn->channels[call->channel] == call) - conn->channels[call->channel] = NULL; - - if (conn->out_clientflag && conn->bundle) { - conn->avail_calls++; - switch (conn->avail_calls) { - case 1: - list_move_tail(&conn->bundle_link, - &conn->bundle->avail_conns); - case 2 ... RXRPC_MAXCALLS - 1: - ASSERT(conn->channels[0] == NULL || - conn->channels[1] == NULL || - conn->channels[2] == NULL || - conn->channels[3] == NULL); - break; - case RXRPC_MAXCALLS: - list_move_tail(&conn->bundle_link, - &conn->bundle->unused_conns); - ASSERT(conn->channels[0] == NULL && - conn->channels[1] == NULL && - conn->channels[2] == NULL && - conn->channels[3] == NULL); - break; - default: - pr_err("conn->avail_calls=%d\n", conn->avail_calls); - BUG(); - } - } - - spin_unlock(&conn->trans->client_lock); - - if (call->state < RXRPC_CALL_COMPLETE && - call->state != RXRPC_CALL_CLIENT_FINAL_ACK) { - _debug("+++ ABORTING STATE %d +++\n", call->state); - call->state = RXRPC_CALL_LOCALLY_ABORTED; - call->local_abort = RX_CALL_DEAD; - set_bit(RXRPC_CALL_EV_ABORT, &call->events); - rxrpc_queue_call(call); - } - write_unlock(&call->state_lock); - write_unlock_bh(&conn->lock); - - /* clean up the Rx queue */ - if (!skb_queue_empty(&call->rx_queue) || - !skb_queue_empty(&call->rx_oos_queue)) { - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; - - _debug("purge Rx queues"); - - spin_lock_bh(&call->lock); - while ((skb = skb_dequeue(&call->rx_queue)) || - (skb = skb_dequeue(&call->rx_oos_queue))) { - sp = rxrpc_skb(skb); - if (sp->call) { - ASSERTCMP(sp->call, ==, call); - rxrpc_put_call(call); - sp->call = NULL; - } - skb->destructor = NULL; - spin_unlock_bh(&call->lock); - - _debug("- zap %s %%%u #%u", - rxrpc_pkts[sp->hdr.type], - sp->hdr.serial, sp->hdr.seq); - rxrpc_free_skb(skb); - spin_lock_bh(&call->lock); - } - spin_unlock_bh(&call->lock); - - ASSERTCMP(call->state, !=, RXRPC_CALL_COMPLETE); - } - - del_timer_sync(&call->resend_timer); - del_timer_sync(&call->ack_timer); - del_timer_sync(&call->lifetimer); - call->deadspan.expires = jiffies + rxrpc_dead_call_expiry; - add_timer(&call->deadspan); - - _leave(""); -} - -/* - * handle a dead call being ready for reaping - */ -static void rxrpc_dead_call_expired(unsigned long _call) -{ - struct rxrpc_call *call = (struct rxrpc_call *) _call; - - _enter("{%d}", call->debug_id); - - write_lock_bh(&call->state_lock); - call->state = RXRPC_CALL_DEAD; - write_unlock_bh(&call->state_lock); - rxrpc_put_call(call); -} - -/* - * mark a call as to be released, aborting it if it's still in progress - * - called with softirqs disabled - */ -static void rxrpc_mark_call_released(struct rxrpc_call *call) -{ - bool sched; - - write_lock(&call->state_lock); - if (call->state < RXRPC_CALL_DEAD) { - sched = false; - if (call->state < RXRPC_CALL_COMPLETE) { - _debug("abort call %p", call); - call->state = RXRPC_CALL_LOCALLY_ABORTED; - call->local_abort = RX_CALL_DEAD; - if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) - sched = true; - } - if (!test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) - sched = true; - if (sched) - rxrpc_queue_call(call); - } - write_unlock(&call->state_lock); -} - -/* - * release all the calls associated with a socket - */ -void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) -{ - struct rxrpc_call *call; - struct rb_node *p; - - _enter("%p", rx); - - read_lock_bh(&rx->call_lock); - - /* mark all the calls as no longer wanting incoming packets */ - for (p = rb_first(&rx->calls); p; p = rb_next(p)) { - call = rb_entry(p, struct rxrpc_call, sock_node); - rxrpc_mark_call_released(call); - } - - /* kill the not-yet-accepted incoming calls */ - list_for_each_entry(call, &rx->secureq, accept_link) { - rxrpc_mark_call_released(call); - } - - list_for_each_entry(call, &rx->acceptq, accept_link) { - rxrpc_mark_call_released(call); - } - - read_unlock_bh(&rx->call_lock); - _leave(""); -} - -/* - * release a call - */ -void __rxrpc_put_call(struct rxrpc_call *call) -{ - ASSERT(call != NULL); - - _enter("%p{u=%d}", call, atomic_read(&call->usage)); - - ASSERTCMP(atomic_read(&call->usage), >, 0); - - if (atomic_dec_and_test(&call->usage)) { - _debug("call %d dead", call->debug_id); - ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); - rxrpc_queue_work(&call->destroyer); - } - _leave(""); -} - -/* - * clean up a call - */ -static void rxrpc_cleanup_call(struct rxrpc_call *call) -{ - _net("DESTROY CALL %d", call->debug_id); - - ASSERT(call->socket); - - memset(&call->sock_node, 0xcd, sizeof(call->sock_node)); - - del_timer_sync(&call->lifetimer); - del_timer_sync(&call->deadspan); - del_timer_sync(&call->ack_timer); - del_timer_sync(&call->resend_timer); - - ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags)); - ASSERTCMP(call->events, ==, 0); - if (work_pending(&call->processor)) { - _debug("defer destroy"); - rxrpc_queue_work(&call->destroyer); - return; - } - - if (call->conn) { - spin_lock(&call->conn->trans->peer->lock); - list_del(&call->error_link); - spin_unlock(&call->conn->trans->peer->lock); - - write_lock_bh(&call->conn->lock); - rb_erase(&call->conn_node, &call->conn->calls); - write_unlock_bh(&call->conn->lock); - rxrpc_put_connection(call->conn); - } - - /* Remove the call from the hash */ - rxrpc_call_hash_del(call); - - if (call->acks_window) { - _debug("kill Tx window %d", - CIRC_CNT(call->acks_head, call->acks_tail, - call->acks_winsz)); - smp_mb(); - while (CIRC_CNT(call->acks_head, call->acks_tail, - call->acks_winsz) > 0) { - struct rxrpc_skb_priv *sp; - unsigned long _skb; - - _skb = call->acks_window[call->acks_tail] & ~1; - sp = rxrpc_skb((struct sk_buff *)_skb); - _debug("+++ clear Tx %u", sp->hdr.seq); - rxrpc_free_skb((struct sk_buff *)_skb); - call->acks_tail = - (call->acks_tail + 1) & (call->acks_winsz - 1); - } - - kfree(call->acks_window); - } - - rxrpc_free_skb(call->tx_pending); - - rxrpc_purge_queue(&call->rx_queue); - ASSERT(skb_queue_empty(&call->rx_oos_queue)); - sock_put(&call->socket->sk); - kmem_cache_free(rxrpc_call_jar, call); -} - -/* - * destroy a call - */ -static void rxrpc_destroy_call(struct work_struct *work) -{ - struct rxrpc_call *call = - container_of(work, struct rxrpc_call, destroyer); - - _enter("%p{%d,%d,%p}", - call, atomic_read(&call->usage), call->channel, call->conn); - - ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); - - write_lock_bh(&rxrpc_call_lock); - list_del_init(&call->link); - write_unlock_bh(&rxrpc_call_lock); - - rxrpc_cleanup_call(call); - _leave(""); -} - -/* - * preemptively destroy all the call records from a transport endpoint rather - * than waiting for them to time out - */ -void __exit rxrpc_destroy_all_calls(void) -{ - struct rxrpc_call *call; - - _enter(""); - write_lock_bh(&rxrpc_call_lock); - - while (!list_empty(&rxrpc_calls)) { - call = list_entry(rxrpc_calls.next, struct rxrpc_call, link); - _debug("Zapping call %p", call); - - list_del_init(&call->link); - - switch (atomic_read(&call->usage)) { - case 0: - ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); - break; - case 1: - if (del_timer_sync(&call->deadspan) != 0 && - call->state != RXRPC_CALL_DEAD) - rxrpc_dead_call_expired((unsigned long) call); - if (call->state != RXRPC_CALL_DEAD) - break; - default: - pr_err("Call %p still in use (%d,%d,%s,%lx,%lx)!\n", - call, atomic_read(&call->usage), - atomic_read(&call->ackr_not_idle), - rxrpc_call_states[call->state], - call->flags, call->events); - if (!skb_queue_empty(&call->rx_queue)) - pr_err("Rx queue occupied\n"); - if (!skb_queue_empty(&call->rx_oos_queue)) - pr_err("OOS queue occupied\n"); - break; - } - - write_unlock_bh(&rxrpc_call_lock); - cond_resched(); - write_lock_bh(&rxrpc_call_lock); - } - - write_unlock_bh(&rxrpc_call_lock); - _leave(""); -} - -/* - * handle call lifetime being exceeded - */ -static void rxrpc_call_life_expired(unsigned long _call) -{ - struct rxrpc_call *call = (struct rxrpc_call *) _call; - - if (call->state >= RXRPC_CALL_COMPLETE) - return; - - _enter("{%d}", call->debug_id); - read_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE) { - set_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events); - rxrpc_queue_call(call); - } - read_unlock_bh(&call->state_lock); -} - -/* - * handle resend timer expiry - * - may not take call->state_lock as this can deadlock against del_timer_sync() - */ -static void rxrpc_resend_time_expired(unsigned long _call) -{ - struct rxrpc_call *call = (struct rxrpc_call *) _call; - - _enter("{%d}", call->debug_id); - - if (call->state >= RXRPC_CALL_COMPLETE) - return; - - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - if (!test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) - rxrpc_queue_call(call); -} - -/* - * handle ACK timer expiry - */ -static void rxrpc_ack_time_expired(unsigned long _call) -{ - struct rxrpc_call *call = (struct rxrpc_call *) _call; - - _enter("{%d}", call->debug_id); - - if (call->state >= RXRPC_CALL_COMPLETE) - return; - - read_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events)) - rxrpc_queue_call(call); - read_unlock_bh(&call->state_lock); -} diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c deleted file mode 100644 index 8ecde4b77b55..000000000000 --- a/net/rxrpc/ar-connection.c +++ /dev/null @@ -1,912 +0,0 @@ -/* RxRPC virtual connection handler - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include "ar-internal.h" - -/* - * Time till a connection expires after last use (in seconds). - */ -unsigned int rxrpc_connection_expiry = 10 * 60; - -static void rxrpc_connection_reaper(struct work_struct *work); - -LIST_HEAD(rxrpc_connections); -DEFINE_RWLOCK(rxrpc_connection_lock); -static DECLARE_DELAYED_WORK(rxrpc_connection_reap, rxrpc_connection_reaper); - -/* - * allocate a new client connection bundle - */ -static struct rxrpc_conn_bundle *rxrpc_alloc_bundle(gfp_t gfp) -{ - struct rxrpc_conn_bundle *bundle; - - _enter(""); - - bundle = kzalloc(sizeof(struct rxrpc_conn_bundle), gfp); - if (bundle) { - INIT_LIST_HEAD(&bundle->unused_conns); - INIT_LIST_HEAD(&bundle->avail_conns); - INIT_LIST_HEAD(&bundle->busy_conns); - init_waitqueue_head(&bundle->chanwait); - atomic_set(&bundle->usage, 1); - } - - _leave(" = %p", bundle); - return bundle; -} - -/* - * compare bundle parameters with what we're looking for - * - return -ve, 0 or +ve - */ -static inline -int rxrpc_cmp_bundle(const struct rxrpc_conn_bundle *bundle, - struct key *key, u16 service_id) -{ - return (bundle->service_id - service_id) ?: - ((unsigned long)bundle->key - (unsigned long)key); -} - -/* - * get bundle of client connections that a client socket can make use of - */ -struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *rx, - struct rxrpc_transport *trans, - struct key *key, - u16 service_id, - gfp_t gfp) -{ - struct rxrpc_conn_bundle *bundle, *candidate; - struct rb_node *p, *parent, **pp; - - _enter("%p{%x},%x,%hx,", - rx, key_serial(key), trans->debug_id, service_id); - - /* search the extant bundles first for one that matches the specified - * user ID */ - spin_lock(&trans->client_lock); - - p = trans->bundles.rb_node; - while (p) { - bundle = rb_entry(p, struct rxrpc_conn_bundle, node); - - if (rxrpc_cmp_bundle(bundle, key, service_id) < 0) - p = p->rb_left; - else if (rxrpc_cmp_bundle(bundle, key, service_id) > 0) - p = p->rb_right; - else - goto found_extant_bundle; - } - - spin_unlock(&trans->client_lock); - - /* not yet present - create a candidate for a new record and then - * redo the search */ - candidate = rxrpc_alloc_bundle(gfp); - if (!candidate) { - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); - } - - candidate->key = key_get(key); - candidate->service_id = service_id; - - spin_lock(&trans->client_lock); - - pp = &trans->bundles.rb_node; - parent = NULL; - while (*pp) { - parent = *pp; - bundle = rb_entry(parent, struct rxrpc_conn_bundle, node); - - if (rxrpc_cmp_bundle(bundle, key, service_id) < 0) - pp = &(*pp)->rb_left; - else if (rxrpc_cmp_bundle(bundle, key, service_id) > 0) - pp = &(*pp)->rb_right; - else - goto found_extant_second; - } - - /* second search also failed; add the new bundle */ - bundle = candidate; - candidate = NULL; - - rb_link_node(&bundle->node, parent, pp); - rb_insert_color(&bundle->node, &trans->bundles); - spin_unlock(&trans->client_lock); - _net("BUNDLE new on trans %d", trans->debug_id); - _leave(" = %p [new]", bundle); - return bundle; - - /* we found the bundle in the list immediately */ -found_extant_bundle: - atomic_inc(&bundle->usage); - spin_unlock(&trans->client_lock); - _net("BUNDLE old on trans %d", trans->debug_id); - _leave(" = %p [extant %d]", bundle, atomic_read(&bundle->usage)); - return bundle; - - /* we found the bundle on the second time through the list */ -found_extant_second: - atomic_inc(&bundle->usage); - spin_unlock(&trans->client_lock); - kfree(candidate); - _net("BUNDLE old2 on trans %d", trans->debug_id); - _leave(" = %p [second %d]", bundle, atomic_read(&bundle->usage)); - return bundle; -} - -/* - * release a bundle - */ -void rxrpc_put_bundle(struct rxrpc_transport *trans, - struct rxrpc_conn_bundle *bundle) -{ - _enter("%p,%p{%d}",trans, bundle, atomic_read(&bundle->usage)); - - if (atomic_dec_and_lock(&bundle->usage, &trans->client_lock)) { - _debug("Destroy bundle"); - rb_erase(&bundle->node, &trans->bundles); - spin_unlock(&trans->client_lock); - ASSERT(list_empty(&bundle->unused_conns)); - ASSERT(list_empty(&bundle->avail_conns)); - ASSERT(list_empty(&bundle->busy_conns)); - ASSERTCMP(bundle->num_conns, ==, 0); - key_put(bundle->key); - kfree(bundle); - } - - _leave(""); -} - -/* - * allocate a new connection - */ -static struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) -{ - struct rxrpc_connection *conn; - - _enter(""); - - conn = kzalloc(sizeof(struct rxrpc_connection), gfp); - if (conn) { - INIT_WORK(&conn->processor, &rxrpc_process_connection); - INIT_LIST_HEAD(&conn->bundle_link); - conn->calls = RB_ROOT; - skb_queue_head_init(&conn->rx_queue); - conn->security = &rxrpc_no_security; - rwlock_init(&conn->lock); - spin_lock_init(&conn->state_lock); - atomic_set(&conn->usage, 1); - conn->debug_id = atomic_inc_return(&rxrpc_debug_id); - conn->avail_calls = RXRPC_MAXCALLS; - conn->size_align = 4; - conn->header_size = sizeof(struct rxrpc_wire_header); - } - - _leave(" = %p{%d}", conn, conn ? conn->debug_id : 0); - return conn; -} - -/* - * assign a connection ID to a connection and add it to the transport's - * connection lookup tree - * - called with transport client lock held - */ -static void rxrpc_assign_connection_id(struct rxrpc_connection *conn) -{ - struct rxrpc_connection *xconn; - struct rb_node *parent, **p; - __be32 epoch; - u32 cid; - - _enter(""); - - epoch = conn->epoch; - - write_lock_bh(&conn->trans->conn_lock); - - conn->trans->conn_idcounter += RXRPC_CID_INC; - if (conn->trans->conn_idcounter < RXRPC_CID_INC) - conn->trans->conn_idcounter = RXRPC_CID_INC; - cid = conn->trans->conn_idcounter; - -attempt_insertion: - parent = NULL; - p = &conn->trans->client_conns.rb_node; - - while (*p) { - parent = *p; - xconn = rb_entry(parent, struct rxrpc_connection, node); - - if (epoch < xconn->epoch) - p = &(*p)->rb_left; - else if (epoch > xconn->epoch) - p = &(*p)->rb_right; - else if (cid < xconn->cid) - p = &(*p)->rb_left; - else if (cid > xconn->cid) - p = &(*p)->rb_right; - else - goto id_exists; - } - - /* we've found a suitable hole - arrange for this connection to occupy - * it */ - rb_link_node(&conn->node, parent, p); - rb_insert_color(&conn->node, &conn->trans->client_conns); - - conn->cid = cid; - write_unlock_bh(&conn->trans->conn_lock); - _leave(" [CID %x]", cid); - return; - - /* we found a connection with the proposed ID - walk the tree from that - * point looking for the next unused ID */ -id_exists: - for (;;) { - cid += RXRPC_CID_INC; - if (cid < RXRPC_CID_INC) { - cid = RXRPC_CID_INC; - conn->trans->conn_idcounter = cid; - goto attempt_insertion; - } - - parent = rb_next(parent); - if (!parent) - goto attempt_insertion; - - xconn = rb_entry(parent, struct rxrpc_connection, node); - if (epoch < xconn->epoch || - cid < xconn->cid) - goto attempt_insertion; - } -} - -/* - * add a call to a connection's call-by-ID tree - */ -static void rxrpc_add_call_ID_to_conn(struct rxrpc_connection *conn, - struct rxrpc_call *call) -{ - struct rxrpc_call *xcall; - struct rb_node *parent, **p; - __be32 call_id; - - write_lock_bh(&conn->lock); - - call_id = call->call_id; - p = &conn->calls.rb_node; - parent = NULL; - while (*p) { - parent = *p; - xcall = rb_entry(parent, struct rxrpc_call, conn_node); - - if (call_id < xcall->call_id) - p = &(*p)->rb_left; - else if (call_id > xcall->call_id) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&call->conn_node, parent, p); - rb_insert_color(&call->conn_node, &conn->calls); - - write_unlock_bh(&conn->lock); -} - -/* - * connect a call on an exclusive connection - */ -static int rxrpc_connect_exclusive(struct rxrpc_sock *rx, - struct rxrpc_transport *trans, - u16 service_id, - struct rxrpc_call *call, - gfp_t gfp) -{ - struct rxrpc_connection *conn; - int chan, ret; - - _enter(""); - - conn = rx->conn; - if (!conn) { - /* not yet present - create a candidate for a new connection - * and then redo the check */ - conn = rxrpc_alloc_connection(gfp); - if (!conn) { - _leave(" = -ENOMEM"); - return -ENOMEM; - } - - conn->trans = trans; - conn->bundle = NULL; - conn->service_id = service_id; - conn->epoch = rxrpc_epoch; - conn->in_clientflag = 0; - conn->out_clientflag = RXRPC_CLIENT_INITIATED; - conn->cid = 0; - conn->state = RXRPC_CONN_CLIENT; - conn->avail_calls = RXRPC_MAXCALLS - 1; - conn->security_level = rx->min_sec_level; - conn->key = key_get(rx->key); - - ret = rxrpc_init_client_conn_security(conn); - if (ret < 0) { - key_put(conn->key); - kfree(conn); - _leave(" = %d [key]", ret); - return ret; - } - - write_lock_bh(&rxrpc_connection_lock); - list_add_tail(&conn->link, &rxrpc_connections); - write_unlock_bh(&rxrpc_connection_lock); - - spin_lock(&trans->client_lock); - atomic_inc(&trans->usage); - - _net("CONNECT EXCL new %d on TRANS %d", - conn->debug_id, conn->trans->debug_id); - - rxrpc_assign_connection_id(conn); - rx->conn = conn; - } else { - spin_lock(&trans->client_lock); - } - - /* we've got a connection with a free channel and we can now attach the - * call to it - * - we're holding the transport's client lock - * - we're holding a reference on the connection - */ - for (chan = 0; chan < RXRPC_MAXCALLS; chan++) - if (!conn->channels[chan]) - goto found_channel; - goto no_free_channels; - -found_channel: - atomic_inc(&conn->usage); - conn->channels[chan] = call; - call->conn = conn; - call->channel = chan; - call->cid = conn->cid | chan; - call->call_id = ++conn->call_counter; - - _net("CONNECT client on conn %d chan %d as call %x", - conn->debug_id, chan, call->call_id); - - spin_unlock(&trans->client_lock); - - rxrpc_add_call_ID_to_conn(conn, call); - _leave(" = 0"); - return 0; - -no_free_channels: - spin_unlock(&trans->client_lock); - _leave(" = -ENOSR"); - return -ENOSR; -} - -/* - * find a connection for a call - * - called in process context with IRQs enabled - */ -int rxrpc_connect_call(struct rxrpc_sock *rx, - struct rxrpc_transport *trans, - struct rxrpc_conn_bundle *bundle, - struct rxrpc_call *call, - gfp_t gfp) -{ - struct rxrpc_connection *conn, *candidate; - int chan, ret; - - DECLARE_WAITQUEUE(myself, current); - - _enter("%p,%lx,", rx, call->user_call_ID); - - if (test_bit(RXRPC_SOCK_EXCLUSIVE_CONN, &rx->flags)) - return rxrpc_connect_exclusive(rx, trans, bundle->service_id, - call, gfp); - - spin_lock(&trans->client_lock); - for (;;) { - /* see if the bundle has a call slot available */ - if (!list_empty(&bundle->avail_conns)) { - _debug("avail"); - conn = list_entry(bundle->avail_conns.next, - struct rxrpc_connection, - bundle_link); - if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { - list_del_init(&conn->bundle_link); - bundle->num_conns--; - continue; - } - if (--conn->avail_calls == 0) - list_move(&conn->bundle_link, - &bundle->busy_conns); - ASSERTCMP(conn->avail_calls, <, RXRPC_MAXCALLS); - ASSERT(conn->channels[0] == NULL || - conn->channels[1] == NULL || - conn->channels[2] == NULL || - conn->channels[3] == NULL); - atomic_inc(&conn->usage); - break; - } - - if (!list_empty(&bundle->unused_conns)) { - _debug("unused"); - conn = list_entry(bundle->unused_conns.next, - struct rxrpc_connection, - bundle_link); - if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { - list_del_init(&conn->bundle_link); - bundle->num_conns--; - continue; - } - ASSERTCMP(conn->avail_calls, ==, RXRPC_MAXCALLS); - conn->avail_calls = RXRPC_MAXCALLS - 1; - ASSERT(conn->channels[0] == NULL && - conn->channels[1] == NULL && - conn->channels[2] == NULL && - conn->channels[3] == NULL); - atomic_inc(&conn->usage); - list_move(&conn->bundle_link, &bundle->avail_conns); - break; - } - - /* need to allocate a new connection */ - _debug("get new conn [%d]", bundle->num_conns); - - spin_unlock(&trans->client_lock); - - if (signal_pending(current)) - goto interrupted; - - if (bundle->num_conns >= 20) { - _debug("too many conns"); - - if (!gfpflags_allow_blocking(gfp)) { - _leave(" = -EAGAIN"); - return -EAGAIN; - } - - add_wait_queue(&bundle->chanwait, &myself); - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (bundle->num_conns < 20 || - !list_empty(&bundle->unused_conns) || - !list_empty(&bundle->avail_conns)) - break; - if (signal_pending(current)) - goto interrupted_dequeue; - schedule(); - } - remove_wait_queue(&bundle->chanwait, &myself); - __set_current_state(TASK_RUNNING); - spin_lock(&trans->client_lock); - continue; - } - - /* not yet present - create a candidate for a new connection and then - * redo the check */ - candidate = rxrpc_alloc_connection(gfp); - if (!candidate) { - _leave(" = -ENOMEM"); - return -ENOMEM; - } - - candidate->trans = trans; - candidate->bundle = bundle; - candidate->service_id = bundle->service_id; - candidate->epoch = rxrpc_epoch; - candidate->in_clientflag = 0; - candidate->out_clientflag = RXRPC_CLIENT_INITIATED; - candidate->cid = 0; - candidate->state = RXRPC_CONN_CLIENT; - candidate->avail_calls = RXRPC_MAXCALLS; - candidate->security_level = rx->min_sec_level; - candidate->key = key_get(bundle->key); - - ret = rxrpc_init_client_conn_security(candidate); - if (ret < 0) { - key_put(candidate->key); - kfree(candidate); - _leave(" = %d [key]", ret); - return ret; - } - - write_lock_bh(&rxrpc_connection_lock); - list_add_tail(&candidate->link, &rxrpc_connections); - write_unlock_bh(&rxrpc_connection_lock); - - spin_lock(&trans->client_lock); - - list_add(&candidate->bundle_link, &bundle->unused_conns); - bundle->num_conns++; - atomic_inc(&bundle->usage); - atomic_inc(&trans->usage); - - _net("CONNECT new %d on TRANS %d", - candidate->debug_id, candidate->trans->debug_id); - - rxrpc_assign_connection_id(candidate); - candidate->security->prime_packet_security(candidate); - - /* leave the candidate lurking in zombie mode attached to the - * bundle until we're ready for it */ - rxrpc_put_connection(candidate); - candidate = NULL; - } - - /* we've got a connection with a free channel and we can now attach the - * call to it - * - we're holding the transport's client lock - * - we're holding a reference on the connection - * - we're holding a reference on the bundle - */ - for (chan = 0; chan < RXRPC_MAXCALLS; chan++) - if (!conn->channels[chan]) - goto found_channel; - ASSERT(conn->channels[0] == NULL || - conn->channels[1] == NULL || - conn->channels[2] == NULL || - conn->channels[3] == NULL); - BUG(); - -found_channel: - conn->channels[chan] = call; - call->conn = conn; - call->channel = chan; - call->cid = conn->cid | chan; - call->call_id = ++conn->call_counter; - - _net("CONNECT client on conn %d chan %d as call %x", - conn->debug_id, chan, call->call_id); - - ASSERTCMP(conn->avail_calls, <, RXRPC_MAXCALLS); - spin_unlock(&trans->client_lock); - - rxrpc_add_call_ID_to_conn(conn, call); - - _leave(" = 0"); - return 0; - -interrupted_dequeue: - remove_wait_queue(&bundle->chanwait, &myself); - __set_current_state(TASK_RUNNING); -interrupted: - _leave(" = -ERESTARTSYS"); - return -ERESTARTSYS; -} - -/* - * get a record of an incoming connection - */ -struct rxrpc_connection * -rxrpc_incoming_connection(struct rxrpc_transport *trans, - struct rxrpc_host_header *hdr) -{ - struct rxrpc_connection *conn, *candidate = NULL; - struct rb_node *p, **pp; - const char *new = "old"; - __be32 epoch; - u32 cid; - - _enter(""); - - ASSERT(hdr->flags & RXRPC_CLIENT_INITIATED); - - epoch = hdr->epoch; - cid = hdr->cid & RXRPC_CIDMASK; - - /* search the connection list first */ - read_lock_bh(&trans->conn_lock); - - p = trans->server_conns.rb_node; - while (p) { - conn = rb_entry(p, struct rxrpc_connection, node); - - _debug("maybe %x", conn->cid); - - if (epoch < conn->epoch) - p = p->rb_left; - else if (epoch > conn->epoch) - p = p->rb_right; - else if (cid < conn->cid) - p = p->rb_left; - else if (cid > conn->cid) - p = p->rb_right; - else - goto found_extant_connection; - } - read_unlock_bh(&trans->conn_lock); - - /* not yet present - create a candidate for a new record and then - * redo the search */ - candidate = rxrpc_alloc_connection(GFP_NOIO); - if (!candidate) { - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); - } - - candidate->trans = trans; - candidate->epoch = hdr->epoch; - candidate->cid = hdr->cid & RXRPC_CIDMASK; - candidate->service_id = hdr->serviceId; - candidate->security_ix = hdr->securityIndex; - candidate->in_clientflag = RXRPC_CLIENT_INITIATED; - candidate->out_clientflag = 0; - candidate->state = RXRPC_CONN_SERVER; - if (candidate->service_id) - candidate->state = RXRPC_CONN_SERVER_UNSECURED; - - write_lock_bh(&trans->conn_lock); - - pp = &trans->server_conns.rb_node; - p = NULL; - while (*pp) { - p = *pp; - conn = rb_entry(p, struct rxrpc_connection, node); - - if (epoch < conn->epoch) - pp = &(*pp)->rb_left; - else if (epoch > conn->epoch) - pp = &(*pp)->rb_right; - else if (cid < conn->cid) - pp = &(*pp)->rb_left; - else if (cid > conn->cid) - pp = &(*pp)->rb_right; - else - goto found_extant_second; - } - - /* we can now add the new candidate to the list */ - conn = candidate; - candidate = NULL; - rb_link_node(&conn->node, p, pp); - rb_insert_color(&conn->node, &trans->server_conns); - atomic_inc(&conn->trans->usage); - - write_unlock_bh(&trans->conn_lock); - - write_lock_bh(&rxrpc_connection_lock); - list_add_tail(&conn->link, &rxrpc_connections); - write_unlock_bh(&rxrpc_connection_lock); - - new = "new"; - -success: - _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->cid); - - _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage)); - return conn; - - /* we found the connection in the list immediately */ -found_extant_connection: - if (hdr->securityIndex != conn->security_ix) { - read_unlock_bh(&trans->conn_lock); - goto security_mismatch; - } - atomic_inc(&conn->usage); - read_unlock_bh(&trans->conn_lock); - goto success; - - /* we found the connection on the second time through the list */ -found_extant_second: - if (hdr->securityIndex != conn->security_ix) { - write_unlock_bh(&trans->conn_lock); - goto security_mismatch; - } - atomic_inc(&conn->usage); - write_unlock_bh(&trans->conn_lock); - kfree(candidate); - goto success; - -security_mismatch: - kfree(candidate); - _leave(" = -EKEYREJECTED"); - return ERR_PTR(-EKEYREJECTED); -} - -/* - * find a connection based on transport and RxRPC connection ID for an incoming - * packet - */ -struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *trans, - struct rxrpc_host_header *hdr) -{ - struct rxrpc_connection *conn; - struct rb_node *p; - u32 epoch, cid; - - _enter(",{%x,%x}", hdr->cid, hdr->flags); - - read_lock_bh(&trans->conn_lock); - - cid = hdr->cid & RXRPC_CIDMASK; - epoch = hdr->epoch; - - if (hdr->flags & RXRPC_CLIENT_INITIATED) - p = trans->server_conns.rb_node; - else - p = trans->client_conns.rb_node; - - while (p) { - conn = rb_entry(p, struct rxrpc_connection, node); - - _debug("maybe %x", conn->cid); - - if (epoch < conn->epoch) - p = p->rb_left; - else if (epoch > conn->epoch) - p = p->rb_right; - else if (cid < conn->cid) - p = p->rb_left; - else if (cid > conn->cid) - p = p->rb_right; - else - goto found; - } - - read_unlock_bh(&trans->conn_lock); - _leave(" = NULL"); - return NULL; - -found: - atomic_inc(&conn->usage); - read_unlock_bh(&trans->conn_lock); - _leave(" = %p", conn); - return conn; -} - -/* - * release a virtual connection - */ -void rxrpc_put_connection(struct rxrpc_connection *conn) -{ - _enter("%p{u=%d,d=%d}", - conn, atomic_read(&conn->usage), conn->debug_id); - - ASSERTCMP(atomic_read(&conn->usage), >, 0); - - conn->put_time = ktime_get_seconds(); - if (atomic_dec_and_test(&conn->usage)) { - _debug("zombie"); - rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); - } - - _leave(""); -} - -/* - * destroy a virtual connection - */ -static void rxrpc_destroy_connection(struct rxrpc_connection *conn) -{ - _enter("%p{%d}", conn, atomic_read(&conn->usage)); - - ASSERTCMP(atomic_read(&conn->usage), ==, 0); - - _net("DESTROY CONN %d", conn->debug_id); - - if (conn->bundle) - rxrpc_put_bundle(conn->trans, conn->bundle); - - ASSERT(RB_EMPTY_ROOT(&conn->calls)); - rxrpc_purge_queue(&conn->rx_queue); - - conn->security->clear(conn); - key_put(conn->key); - key_put(conn->server_key); - - rxrpc_put_transport(conn->trans); - kfree(conn); - _leave(""); -} - -/* - * reap dead connections - */ -static void rxrpc_connection_reaper(struct work_struct *work) -{ - struct rxrpc_connection *conn, *_p; - unsigned long now, earliest, reap_time; - - LIST_HEAD(graveyard); - - _enter(""); - - now = ktime_get_seconds(); - earliest = ULONG_MAX; - - write_lock_bh(&rxrpc_connection_lock); - list_for_each_entry_safe(conn, _p, &rxrpc_connections, link) { - _debug("reap CONN %d { u=%d,t=%ld }", - conn->debug_id, atomic_read(&conn->usage), - (long) now - (long) conn->put_time); - - if (likely(atomic_read(&conn->usage) > 0)) - continue; - - spin_lock(&conn->trans->client_lock); - write_lock(&conn->trans->conn_lock); - reap_time = conn->put_time + rxrpc_connection_expiry; - - if (atomic_read(&conn->usage) > 0) { - ; - } else if (reap_time <= now) { - list_move_tail(&conn->link, &graveyard); - if (conn->out_clientflag) - rb_erase(&conn->node, - &conn->trans->client_conns); - else - rb_erase(&conn->node, - &conn->trans->server_conns); - if (conn->bundle) { - list_del_init(&conn->bundle_link); - conn->bundle->num_conns--; - } - - } else if (reap_time < earliest) { - earliest = reap_time; - } - - write_unlock(&conn->trans->conn_lock); - spin_unlock(&conn->trans->client_lock); - } - write_unlock_bh(&rxrpc_connection_lock); - - if (earliest != ULONG_MAX) { - _debug("reschedule reaper %ld", (long) earliest - now); - ASSERTCMP(earliest, >, now); - rxrpc_queue_delayed_work(&rxrpc_connection_reap, - (earliest - now) * HZ); - } - - /* then destroy all those pulled out */ - while (!list_empty(&graveyard)) { - conn = list_entry(graveyard.next, struct rxrpc_connection, - link); - list_del_init(&conn->link); - - ASSERTCMP(atomic_read(&conn->usage), ==, 0); - rxrpc_destroy_connection(conn); - } - - _leave(""); -} - -/* - * preemptively destroy all the connection records rather than waiting for them - * to time out - */ -void __exit rxrpc_destroy_all_connections(void) -{ - _enter(""); - - rxrpc_connection_expiry = 0; - cancel_delayed_work(&rxrpc_connection_reap); - rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); - - _leave(""); -} diff --git a/net/rxrpc/ar-connevent.c b/net/rxrpc/ar-connevent.c deleted file mode 100644 index 8bdd692d4862..000000000000 --- a/net/rxrpc/ar-connevent.c +++ /dev/null @@ -1,403 +0,0 @@ -/* connection-level event handling - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "ar-internal.h" - -/* - * pass a connection-level abort onto all calls on that connection - */ -static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state, - u32 abort_code) -{ - struct rxrpc_call *call; - struct rb_node *p; - - _enter("{%d},%x", conn->debug_id, abort_code); - - read_lock_bh(&conn->lock); - - for (p = rb_first(&conn->calls); p; p = rb_next(p)) { - call = rb_entry(p, struct rxrpc_call, conn_node); - write_lock(&call->state_lock); - if (call->state <= RXRPC_CALL_COMPLETE) { - call->state = state; - if (state == RXRPC_CALL_LOCALLY_ABORTED) { - call->local_abort = conn->local_abort; - set_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); - } else { - call->remote_abort = conn->remote_abort; - set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); - } - rxrpc_queue_call(call); - } - write_unlock(&call->state_lock); - } - - read_unlock_bh(&conn->lock); - _leave(""); -} - -/* - * generate a connection-level abort - */ -static int rxrpc_abort_connection(struct rxrpc_connection *conn, - u32 error, u32 abort_code) -{ - struct rxrpc_wire_header whdr; - struct msghdr msg; - struct kvec iov[2]; - __be32 word; - size_t len; - u32 serial; - int ret; - - _enter("%d,,%u,%u", conn->debug_id, error, abort_code); - - /* generate a connection-level abort */ - spin_lock_bh(&conn->state_lock); - if (conn->state < RXRPC_CONN_REMOTELY_ABORTED) { - conn->state = RXRPC_CONN_LOCALLY_ABORTED; - conn->error = error; - spin_unlock_bh(&conn->state_lock); - } else { - spin_unlock_bh(&conn->state_lock); - _leave(" = 0 [already dead]"); - return 0; - } - - rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, abort_code); - - msg.msg_name = &conn->trans->peer->srx.transport; - msg.msg_namelen = conn->trans->peer->srx.transport_len; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; - - whdr.epoch = htonl(conn->epoch); - whdr.cid = htonl(conn->cid); - whdr.callNumber = 0; - whdr.seq = 0; - whdr.type = RXRPC_PACKET_TYPE_ABORT; - whdr.flags = conn->out_clientflag; - whdr.userStatus = 0; - whdr.securityIndex = conn->security_ix; - whdr._rsvd = 0; - whdr.serviceId = htons(conn->service_id); - - word = htonl(conn->local_abort); - - iov[0].iov_base = &whdr; - iov[0].iov_len = sizeof(whdr); - iov[1].iov_base = &word; - iov[1].iov_len = sizeof(word); - - len = iov[0].iov_len + iov[1].iov_len; - - serial = atomic_inc_return(&conn->serial); - whdr.serial = htonl(serial); - _proto("Tx CONN ABORT %%%u { %d }", serial, conn->local_abort); - - ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len); - if (ret < 0) { - _debug("sendmsg failed: %d", ret); - return -EAGAIN; - } - - _leave(" = 0"); - return 0; -} - -/* - * mark a call as being on a now-secured channel - * - must be called with softirqs disabled - */ -static void rxrpc_call_is_secure(struct rxrpc_call *call) -{ - _enter("%p", call); - if (call) { - read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_EV_SECURED, &call->events)) - rxrpc_queue_call(call); - read_unlock(&call->state_lock); - } -} - -/* - * connection-level Rx packet processor - */ -static int rxrpc_process_event(struct rxrpc_connection *conn, - struct sk_buff *skb, - u32 *_abort_code) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - __be32 wtmp; - u32 abort_code; - int loop, ret; - - if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { - kleave(" = -ECONNABORTED [%u]", conn->state); - return -ECONNABORTED; - } - - _enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, sp->hdr.serial); - - switch (sp->hdr.type) { - case RXRPC_PACKET_TYPE_ABORT: - if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0) - return -EPROTO; - abort_code = ntohl(wtmp); - _proto("Rx ABORT %%%u { ac=%d }", sp->hdr.serial, abort_code); - - conn->state = RXRPC_CONN_REMOTELY_ABORTED; - rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, - abort_code); - return -ECONNABORTED; - - case RXRPC_PACKET_TYPE_CHALLENGE: - return conn->security->respond_to_challenge(conn, skb, - _abort_code); - - case RXRPC_PACKET_TYPE_RESPONSE: - ret = conn->security->verify_response(conn, skb, _abort_code); - if (ret < 0) - return ret; - - ret = conn->security->init_connection_security(conn); - if (ret < 0) - return ret; - - conn->security->prime_packet_security(conn); - read_lock_bh(&conn->lock); - spin_lock(&conn->state_lock); - - if (conn->state == RXRPC_CONN_SERVER_CHALLENGING) { - conn->state = RXRPC_CONN_SERVER; - for (loop = 0; loop < RXRPC_MAXCALLS; loop++) - rxrpc_call_is_secure(conn->channels[loop]); - } - - spin_unlock(&conn->state_lock); - read_unlock_bh(&conn->lock); - return 0; - - default: - _leave(" = -EPROTO [%u]", sp->hdr.type); - return -EPROTO; - } -} - -/* - * set up security and issue a challenge - */ -static void rxrpc_secure_connection(struct rxrpc_connection *conn) -{ - u32 abort_code; - int ret; - - _enter("{%d}", conn->debug_id); - - ASSERT(conn->security_ix != 0); - - if (!conn->key) { - _debug("set up security"); - ret = rxrpc_init_server_conn_security(conn); - switch (ret) { - case 0: - break; - case -ENOENT: - abort_code = RX_CALL_DEAD; - goto abort; - default: - abort_code = RXKADNOAUTH; - goto abort; - } - } - - if (conn->security->issue_challenge(conn) < 0) { - abort_code = RX_CALL_DEAD; - ret = -ENOMEM; - goto abort; - } - - _leave(""); - return; - -abort: - _debug("abort %d, %d", ret, abort_code); - rxrpc_abort_connection(conn, -ret, abort_code); - _leave(" [aborted]"); -} - -/* - * connection-level event processor - */ -void rxrpc_process_connection(struct work_struct *work) -{ - struct rxrpc_connection *conn = - container_of(work, struct rxrpc_connection, processor); - struct sk_buff *skb; - u32 abort_code = RX_PROTOCOL_ERROR; - int ret; - - _enter("{%d}", conn->debug_id); - - atomic_inc(&conn->usage); - - if (test_and_clear_bit(RXRPC_CONN_CHALLENGE, &conn->events)) { - rxrpc_secure_connection(conn); - rxrpc_put_connection(conn); - } - - /* go through the conn-level event packets, releasing the ref on this - * connection that each one has when we've finished with it */ - while ((skb = skb_dequeue(&conn->rx_queue))) { - ret = rxrpc_process_event(conn, skb, &abort_code); - switch (ret) { - case -EPROTO: - case -EKEYEXPIRED: - case -EKEYREJECTED: - goto protocol_error; - case -EAGAIN: - goto requeue_and_leave; - case -ECONNABORTED: - default: - rxrpc_put_connection(conn); - rxrpc_free_skb(skb); - break; - } - } - -out: - rxrpc_put_connection(conn); - _leave(""); - return; - -requeue_and_leave: - skb_queue_head(&conn->rx_queue, skb); - goto out; - -protocol_error: - if (rxrpc_abort_connection(conn, -ret, abort_code) < 0) - goto requeue_and_leave; - rxrpc_put_connection(conn); - rxrpc_free_skb(skb); - _leave(" [EPROTO]"); - goto out; -} - -/* - * put a packet up for transport-level abort - */ -void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) -{ - CHECK_SLAB_OKAY(&local->usage); - - if (!atomic_inc_not_zero(&local->usage)) { - printk("resurrected on reject\n"); - BUG(); - } - - skb_queue_tail(&local->reject_queue, skb); - rxrpc_queue_work(&local->rejecter); -} - -/* - * reject packets through the local endpoint - */ -void rxrpc_reject_packets(struct work_struct *work) -{ - union { - struct sockaddr sa; - struct sockaddr_in sin; - } sa; - struct rxrpc_skb_priv *sp; - struct rxrpc_wire_header whdr; - struct rxrpc_local *local; - struct sk_buff *skb; - struct msghdr msg; - struct kvec iov[2]; - size_t size; - __be32 code; - - local = container_of(work, struct rxrpc_local, rejecter); - rxrpc_get_local(local); - - _enter("%d", local->debug_id); - - iov[0].iov_base = &whdr; - iov[0].iov_len = sizeof(whdr); - iov[1].iov_base = &code; - iov[1].iov_len = sizeof(code); - size = sizeof(whdr) + sizeof(code); - - msg.msg_name = &sa; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; - - memset(&sa, 0, sizeof(sa)); - sa.sa.sa_family = local->srx.transport.family; - switch (sa.sa.sa_family) { - case AF_INET: - msg.msg_namelen = sizeof(sa.sin); - break; - default: - msg.msg_namelen = 0; - break; - } - - memset(&whdr, 0, sizeof(whdr)); - whdr.type = RXRPC_PACKET_TYPE_ABORT; - - while ((skb = skb_dequeue(&local->reject_queue))) { - sp = rxrpc_skb(skb); - switch (sa.sa.sa_family) { - case AF_INET: - sa.sin.sin_port = udp_hdr(skb)->source; - sa.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; - code = htonl(skb->priority); - - whdr.epoch = htonl(sp->hdr.epoch); - whdr.cid = htonl(sp->hdr.cid); - whdr.callNumber = htonl(sp->hdr.callNumber); - whdr.serviceId = htons(sp->hdr.serviceId); - whdr.flags = sp->hdr.flags; - whdr.flags ^= RXRPC_CLIENT_INITIATED; - whdr.flags &= RXRPC_CLIENT_INITIATED; - - kernel_sendmsg(local->socket, &msg, iov, 2, size); - break; - - default: - break; - } - - rxrpc_free_skb(skb); - rxrpc_put_local(local); - } - - rxrpc_put_local(local); - _leave(""); -} diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c deleted file mode 100644 index 3e82d6f0313c..000000000000 --- a/net/rxrpc/ar-error.c +++ /dev/null @@ -1,230 +0,0 @@ -/* Error message handling (ICMP) - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "ar-internal.h" - -/* - * handle an error received on the local endpoint - */ -void rxrpc_UDP_error_report(struct sock *sk) -{ - struct sock_exterr_skb *serr; - struct rxrpc_transport *trans; - struct rxrpc_local *local = sk->sk_user_data; - struct rxrpc_peer *peer; - struct sk_buff *skb; - __be32 addr; - __be16 port; - - _enter("%p{%d}", sk, local->debug_id); - - skb = sock_dequeue_err_skb(sk); - if (!skb) { - _leave("UDP socket errqueue empty"); - return; - } - serr = SKB_EXT_ERR(skb); - if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { - _leave("UDP empty message"); - kfree_skb(skb); - return; - } - - rxrpc_new_skb(skb); - - addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset); - port = serr->port; - - _net("Rx UDP Error from %pI4:%hu", &addr, ntohs(port)); - _debug("Msg l:%d d:%d", skb->len, skb->data_len); - - peer = rxrpc_find_peer(local, addr, port); - if (IS_ERR(peer)) { - rxrpc_free_skb(skb); - _leave(" [no peer]"); - return; - } - - trans = rxrpc_find_transport(local, peer); - if (!trans) { - rxrpc_put_peer(peer); - rxrpc_free_skb(skb); - _leave(" [no trans]"); - return; - } - - if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP && - serr->ee.ee_type == ICMP_DEST_UNREACH && - serr->ee.ee_code == ICMP_FRAG_NEEDED - ) { - u32 mtu = serr->ee.ee_info; - - _net("Rx Received ICMP Fragmentation Needed (%d)", mtu); - - /* wind down the local interface MTU */ - if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu) { - peer->if_mtu = mtu; - _net("I/F MTU %u", mtu); - } - - if (mtu == 0) { - /* they didn't give us a size, estimate one */ - mtu = peer->if_mtu; - if (mtu > 1500) { - mtu >>= 1; - if (mtu < 1500) - mtu = 1500; - } else { - mtu -= 100; - if (mtu < peer->hdrsize) - mtu = peer->hdrsize + 4; - } - } - - if (mtu < peer->mtu) { - spin_lock_bh(&peer->lock); - peer->mtu = mtu; - peer->maxdata = peer->mtu - peer->hdrsize; - spin_unlock_bh(&peer->lock); - _net("Net MTU %u (maxdata %u)", - peer->mtu, peer->maxdata); - } - } - - rxrpc_put_peer(peer); - - /* pass the transport ref to error_handler to release */ - skb_queue_tail(&trans->error_queue, skb); - rxrpc_queue_work(&trans->error_handler); - _leave(""); -} - -/* - * deal with UDP error messages - */ -void rxrpc_UDP_error_handler(struct work_struct *work) -{ - struct sock_extended_err *ee; - struct sock_exterr_skb *serr; - struct rxrpc_transport *trans = - container_of(work, struct rxrpc_transport, error_handler); - struct sk_buff *skb; - int err; - - _enter(""); - - skb = skb_dequeue(&trans->error_queue); - if (!skb) - return; - - serr = SKB_EXT_ERR(skb); - ee = &serr->ee; - - _net("Rx Error o=%d t=%d c=%d e=%d", - ee->ee_origin, ee->ee_type, ee->ee_code, ee->ee_errno); - - err = ee->ee_errno; - - switch (ee->ee_origin) { - case SO_EE_ORIGIN_ICMP: - switch (ee->ee_type) { - case ICMP_DEST_UNREACH: - switch (ee->ee_code) { - case ICMP_NET_UNREACH: - _net("Rx Received ICMP Network Unreachable"); - break; - case ICMP_HOST_UNREACH: - _net("Rx Received ICMP Host Unreachable"); - break; - case ICMP_PORT_UNREACH: - _net("Rx Received ICMP Port Unreachable"); - break; - case ICMP_NET_UNKNOWN: - _net("Rx Received ICMP Unknown Network"); - break; - case ICMP_HOST_UNKNOWN: - _net("Rx Received ICMP Unknown Host"); - break; - default: - _net("Rx Received ICMP DestUnreach code=%u", - ee->ee_code); - break; - } - break; - - case ICMP_TIME_EXCEEDED: - _net("Rx Received ICMP TTL Exceeded"); - break; - - default: - _proto("Rx Received ICMP error { type=%u code=%u }", - ee->ee_type, ee->ee_code); - break; - } - break; - - case SO_EE_ORIGIN_LOCAL: - _proto("Rx Received local error { error=%d }", - ee->ee_errno); - break; - - case SO_EE_ORIGIN_NONE: - case SO_EE_ORIGIN_ICMP6: - default: - _proto("Rx Received error report { orig=%u }", - ee->ee_origin); - break; - } - - /* terminate all the affected calls if there's an unrecoverable - * error */ - if (err) { - struct rxrpc_call *call, *_n; - - _debug("ISSUE ERROR %d", err); - - spin_lock_bh(&trans->peer->lock); - trans->peer->net_error = err; - - list_for_each_entry_safe(call, _n, &trans->peer->error_targets, - error_link) { - write_lock(&call->state_lock); - if (call->state != RXRPC_CALL_COMPLETE && - call->state < RXRPC_CALL_NETWORK_ERROR) { - call->state = RXRPC_CALL_NETWORK_ERROR; - set_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events); - rxrpc_queue_call(call); - } - write_unlock(&call->state_lock); - list_del_init(&call->error_link); - } - - spin_unlock_bh(&trans->peer->lock); - } - - if (!skb_queue_empty(&trans->error_queue)) - rxrpc_queue_work(&trans->error_handler); - - rxrpc_free_skb(skb); - rxrpc_put_transport(trans); - _leave(""); -} diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c deleted file mode 100644 index e0815a033999..000000000000 --- a/net/rxrpc/ar-input.c +++ /dev/null @@ -1,800 +0,0 @@ -/* RxRPC packet reception - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "ar-internal.h" - -/* - * queue a packet for recvmsg to pass to userspace - * - the caller must hold a lock on call->lock - * - must not be called with interrupts disabled (sk_filter() disables BH's) - * - eats the packet whether successful or not - * - there must be just one reference to the packet, which the caller passes to - * this function - */ -int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, - bool force, bool terminal) -{ - struct rxrpc_skb_priv *sp; - struct rxrpc_sock *rx = call->socket; - struct sock *sk; - int ret; - - _enter(",,%d,%d", force, terminal); - - ASSERT(!irqs_disabled()); - - sp = rxrpc_skb(skb); - ASSERTCMP(sp->call, ==, call); - - /* if we've already posted the terminal message for a call, then we - * don't post any more */ - if (test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) { - _debug("already terminated"); - ASSERTCMP(call->state, >=, RXRPC_CALL_COMPLETE); - skb->destructor = NULL; - sp->call = NULL; - rxrpc_put_call(call); - rxrpc_free_skb(skb); - return 0; - } - - sk = &rx->sk; - - if (!force) { - /* cast skb->rcvbuf to unsigned... It's pointless, but - * reduces number of warnings when compiling with -W - * --ANK */ -// ret = -ENOBUFS; -// if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= -// (unsigned int) sk->sk_rcvbuf) -// goto out; - - ret = sk_filter(sk, skb); - if (ret < 0) - goto out; - } - - spin_lock_bh(&sk->sk_receive_queue.lock); - if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags) && - !test_bit(RXRPC_CALL_RELEASED, &call->flags) && - call->socket->sk.sk_state != RXRPC_CLOSE) { - skb->destructor = rxrpc_packet_destructor; - skb->dev = NULL; - skb->sk = sk; - atomic_add(skb->truesize, &sk->sk_rmem_alloc); - - if (terminal) { - _debug("<<<< TERMINAL MESSAGE >>>>"); - set_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags); - } - - /* allow interception by a kernel service */ - if (rx->interceptor) { - rx->interceptor(sk, call->user_call_ID, skb); - spin_unlock_bh(&sk->sk_receive_queue.lock); - } else { - _net("post skb %p", skb); - __skb_queue_tail(&sk->sk_receive_queue, skb); - spin_unlock_bh(&sk->sk_receive_queue.lock); - - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk); - } - skb = NULL; - } else { - spin_unlock_bh(&sk->sk_receive_queue.lock); - } - ret = 0; - -out: - /* release the socket buffer */ - if (skb) { - skb->destructor = NULL; - sp->call = NULL; - rxrpc_put_call(call); - rxrpc_free_skb(skb); - } - - _leave(" = %d", ret); - return ret; -} - -/* - * process a DATA packet, posting the packet to the appropriate queue - * - eats the packet if successful - */ -static int rxrpc_fast_process_data(struct rxrpc_call *call, - struct sk_buff *skb, u32 seq) -{ - struct rxrpc_skb_priv *sp; - bool terminal; - int ret, ackbit, ack; - - _enter("{%u,%u},,{%u}", call->rx_data_post, call->rx_first_oos, seq); - - sp = rxrpc_skb(skb); - ASSERTCMP(sp->call, ==, NULL); - - spin_lock(&call->lock); - - if (call->state > RXRPC_CALL_COMPLETE) - goto discard; - - ASSERTCMP(call->rx_data_expect, >=, call->rx_data_post); - ASSERTCMP(call->rx_data_post, >=, call->rx_data_recv); - ASSERTCMP(call->rx_data_recv, >=, call->rx_data_eaten); - - if (seq < call->rx_data_post) { - _debug("dup #%u [-%u]", seq, call->rx_data_post); - ack = RXRPC_ACK_DUPLICATE; - ret = -ENOBUFS; - goto discard_and_ack; - } - - /* we may already have the packet in the out of sequence queue */ - ackbit = seq - (call->rx_data_eaten + 1); - ASSERTCMP(ackbit, >=, 0); - if (__test_and_set_bit(ackbit, call->ackr_window)) { - _debug("dup oos #%u [%u,%u]", - seq, call->rx_data_eaten, call->rx_data_post); - ack = RXRPC_ACK_DUPLICATE; - goto discard_and_ack; - } - - if (seq >= call->ackr_win_top) { - _debug("exceed #%u [%u]", seq, call->ackr_win_top); - __clear_bit(ackbit, call->ackr_window); - ack = RXRPC_ACK_EXCEEDS_WINDOW; - goto discard_and_ack; - } - - if (seq == call->rx_data_expect) { - clear_bit(RXRPC_CALL_EXPECT_OOS, &call->flags); - call->rx_data_expect++; - } else if (seq > call->rx_data_expect) { - _debug("oos #%u [%u]", seq, call->rx_data_expect); - call->rx_data_expect = seq + 1; - if (test_and_set_bit(RXRPC_CALL_EXPECT_OOS, &call->flags)) { - ack = RXRPC_ACK_OUT_OF_SEQUENCE; - goto enqueue_and_ack; - } - goto enqueue_packet; - } - - if (seq != call->rx_data_post) { - _debug("ahead #%u [%u]", seq, call->rx_data_post); - goto enqueue_packet; - } - - if (test_bit(RXRPC_CALL_RCVD_LAST, &call->flags)) - goto protocol_error; - - /* if the packet need security things doing to it, then it goes down - * the slow path */ - if (call->conn->security_ix) - goto enqueue_packet; - - sp->call = call; - rxrpc_get_call(call); - terminal = ((sp->hdr.flags & RXRPC_LAST_PACKET) && - !(sp->hdr.flags & RXRPC_CLIENT_INITIATED)); - ret = rxrpc_queue_rcv_skb(call, skb, false, terminal); - if (ret < 0) { - if (ret == -ENOMEM || ret == -ENOBUFS) { - __clear_bit(ackbit, call->ackr_window); - ack = RXRPC_ACK_NOSPACE; - goto discard_and_ack; - } - goto out; - } - - skb = NULL; - - _debug("post #%u", seq); - ASSERTCMP(call->rx_data_post, ==, seq); - call->rx_data_post++; - - if (sp->hdr.flags & RXRPC_LAST_PACKET) - set_bit(RXRPC_CALL_RCVD_LAST, &call->flags); - - /* if we've reached an out of sequence packet then we need to drain - * that queue into the socket Rx queue now */ - if (call->rx_data_post == call->rx_first_oos) { - _debug("drain rx oos now"); - read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events)) - rxrpc_queue_call(call); - read_unlock(&call->state_lock); - } - - spin_unlock(&call->lock); - atomic_inc(&call->ackr_not_idle); - rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, sp->hdr.serial, false); - _leave(" = 0 [posted]"); - return 0; - -protocol_error: - ret = -EBADMSG; -out: - spin_unlock(&call->lock); - _leave(" = %d", ret); - return ret; - -discard_and_ack: - _debug("discard and ACK packet %p", skb); - __rxrpc_propose_ACK(call, ack, sp->hdr.serial, true); -discard: - spin_unlock(&call->lock); - rxrpc_free_skb(skb); - _leave(" = 0 [discarded]"); - return 0; - -enqueue_and_ack: - __rxrpc_propose_ACK(call, ack, sp->hdr.serial, true); -enqueue_packet: - _net("defer skb %p", skb); - spin_unlock(&call->lock); - skb_queue_tail(&call->rx_queue, skb); - atomic_inc(&call->ackr_not_idle); - read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_DEAD) - rxrpc_queue_call(call); - read_unlock(&call->state_lock); - _leave(" = 0 [queued]"); - return 0; -} - -/* - * assume an implicit ACKALL of the transmission phase of a client socket upon - * reception of the first reply packet - */ -static void rxrpc_assume_implicit_ackall(struct rxrpc_call *call, u32 serial) -{ - write_lock_bh(&call->state_lock); - - switch (call->state) { - case RXRPC_CALL_CLIENT_AWAIT_REPLY: - call->state = RXRPC_CALL_CLIENT_RECV_REPLY; - call->acks_latest = serial; - - _debug("implicit ACKALL %%%u", call->acks_latest); - set_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events); - write_unlock_bh(&call->state_lock); - - if (try_to_del_timer_sync(&call->resend_timer) >= 0) { - clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); - clear_bit(RXRPC_CALL_EV_RESEND, &call->events); - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - } - break; - - default: - write_unlock_bh(&call->state_lock); - break; - } -} - -/* - * post an incoming packet to the nominated call to deal with - * - must get rid of the sk_buff, either by freeing it or by queuing it - */ -void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - __be32 wtmp; - u32 hi_serial, abort_code; - - _enter("%p,%p", call, skb); - - ASSERT(!irqs_disabled()); - -#if 0 // INJECT RX ERROR - if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) { - static int skip = 0; - if (++skip == 3) { - printk("DROPPED 3RD PACKET!!!!!!!!!!!!!\n"); - skip = 0; - goto free_packet; - } - } -#endif - - /* track the latest serial number on this connection for ACK packet - * information */ - hi_serial = atomic_read(&call->conn->hi_serial); - while (sp->hdr.serial > hi_serial) - hi_serial = atomic_cmpxchg(&call->conn->hi_serial, hi_serial, - sp->hdr.serial); - - /* request ACK generation for any ACK or DATA packet that requests - * it */ - if (sp->hdr.flags & RXRPC_REQUEST_ACK) { - _proto("ACK Requested on %%%u", sp->hdr.serial); - rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, sp->hdr.serial, false); - } - - switch (sp->hdr.type) { - case RXRPC_PACKET_TYPE_ABORT: - _debug("abort"); - - if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0) - goto protocol_error; - - abort_code = ntohl(wtmp); - _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code); - - write_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE) { - call->state = RXRPC_CALL_REMOTELY_ABORTED; - call->remote_abort = abort_code; - set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); - rxrpc_queue_call(call); - } - goto free_packet_unlock; - - case RXRPC_PACKET_TYPE_BUSY: - _proto("Rx BUSY %%%u", sp->hdr.serial); - - if (call->conn->out_clientflag) - goto protocol_error; - - write_lock_bh(&call->state_lock); - switch (call->state) { - case RXRPC_CALL_CLIENT_SEND_REQUEST: - call->state = RXRPC_CALL_SERVER_BUSY; - set_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events); - rxrpc_queue_call(call); - case RXRPC_CALL_SERVER_BUSY: - goto free_packet_unlock; - default: - goto protocol_error_locked; - } - - default: - _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial); - goto protocol_error; - - case RXRPC_PACKET_TYPE_DATA: - _proto("Rx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); - - if (sp->hdr.seq == 0) - goto protocol_error; - - call->ackr_prev_seq = sp->hdr.seq; - - /* received data implicitly ACKs all of the request packets we - * sent when we're acting as a client */ - if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) - rxrpc_assume_implicit_ackall(call, sp->hdr.serial); - - switch (rxrpc_fast_process_data(call, skb, sp->hdr.seq)) { - case 0: - skb = NULL; - goto done; - - default: - BUG(); - - /* data packet received beyond the last packet */ - case -EBADMSG: - goto protocol_error; - } - - case RXRPC_PACKET_TYPE_ACKALL: - case RXRPC_PACKET_TYPE_ACK: - /* ACK processing is done in process context */ - read_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_DEAD) { - skb_queue_tail(&call->rx_queue, skb); - rxrpc_queue_call(call); - skb = NULL; - } - read_unlock_bh(&call->state_lock); - goto free_packet; - } - -protocol_error: - _debug("protocol error"); - write_lock_bh(&call->state_lock); -protocol_error_locked: - if (call->state <= RXRPC_CALL_COMPLETE) { - call->state = RXRPC_CALL_LOCALLY_ABORTED; - call->local_abort = RX_PROTOCOL_ERROR; - set_bit(RXRPC_CALL_EV_ABORT, &call->events); - rxrpc_queue_call(call); - } -free_packet_unlock: - write_unlock_bh(&call->state_lock); -free_packet: - rxrpc_free_skb(skb); -done: - _leave(""); -} - -/* - * split up a jumbo data packet - */ -static void rxrpc_process_jumbo_packet(struct rxrpc_call *call, - struct sk_buff *jumbo) -{ - struct rxrpc_jumbo_header jhdr; - struct rxrpc_skb_priv *sp; - struct sk_buff *part; - - _enter(",{%u,%u}", jumbo->data_len, jumbo->len); - - sp = rxrpc_skb(jumbo); - - do { - sp->hdr.flags &= ~RXRPC_JUMBO_PACKET; - - /* make a clone to represent the first subpacket in what's left - * of the jumbo packet */ - part = skb_clone(jumbo, GFP_ATOMIC); - if (!part) { - /* simply ditch the tail in the event of ENOMEM */ - pskb_trim(jumbo, RXRPC_JUMBO_DATALEN); - break; - } - rxrpc_new_skb(part); - - pskb_trim(part, RXRPC_JUMBO_DATALEN); - - if (!pskb_pull(jumbo, RXRPC_JUMBO_DATALEN)) - goto protocol_error; - - if (skb_copy_bits(jumbo, 0, &jhdr, sizeof(jhdr)) < 0) - goto protocol_error; - if (!pskb_pull(jumbo, sizeof(jhdr))) - BUG(); - - sp->hdr.seq += 1; - sp->hdr.serial += 1; - sp->hdr.flags = jhdr.flags; - sp->hdr._rsvd = jhdr._rsvd; - - _proto("Rx DATA Jumbo %%%u", sp->hdr.serial - 1); - - rxrpc_fast_process_packet(call, part); - part = NULL; - - } while (sp->hdr.flags & RXRPC_JUMBO_PACKET); - - rxrpc_fast_process_packet(call, jumbo); - _leave(""); - return; - -protocol_error: - _debug("protocol error"); - rxrpc_free_skb(part); - rxrpc_free_skb(jumbo); - write_lock_bh(&call->state_lock); - if (call->state <= RXRPC_CALL_COMPLETE) { - call->state = RXRPC_CALL_LOCALLY_ABORTED; - call->local_abort = RX_PROTOCOL_ERROR; - set_bit(RXRPC_CALL_EV_ABORT, &call->events); - rxrpc_queue_call(call); - } - write_unlock_bh(&call->state_lock); - _leave(""); -} - -/* - * post an incoming packet to the appropriate call/socket to deal with - * - must get rid of the sk_buff, either by freeing it or by queuing it - */ -static void rxrpc_post_packet_to_call(struct rxrpc_call *call, - struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp; - - _enter("%p,%p", call, skb); - - sp = rxrpc_skb(skb); - - _debug("extant call [%d]", call->state); - - read_lock(&call->state_lock); - switch (call->state) { - case RXRPC_CALL_LOCALLY_ABORTED: - if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) { - rxrpc_queue_call(call); - goto free_unlock; - } - case RXRPC_CALL_REMOTELY_ABORTED: - case RXRPC_CALL_NETWORK_ERROR: - case RXRPC_CALL_DEAD: - goto dead_call; - case RXRPC_CALL_COMPLETE: - case RXRPC_CALL_CLIENT_FINAL_ACK: - /* complete server call */ - if (call->conn->in_clientflag) - goto dead_call; - /* resend last packet of a completed call */ - _debug("final ack again"); - rxrpc_get_call(call); - set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); - rxrpc_queue_call(call); - goto free_unlock; - default: - break; - } - - read_unlock(&call->state_lock); - rxrpc_get_call(call); - - if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && - sp->hdr.flags & RXRPC_JUMBO_PACKET) - rxrpc_process_jumbo_packet(call, skb); - else - rxrpc_fast_process_packet(call, skb); - - rxrpc_put_call(call); - goto done; - -dead_call: - if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) { - skb->priority = RX_CALL_DEAD; - rxrpc_reject_packet(call->conn->trans->local, skb); - goto unlock; - } -free_unlock: - rxrpc_free_skb(skb); -unlock: - read_unlock(&call->state_lock); -done: - _leave(""); -} - -/* - * post connection-level events to the connection - * - this includes challenges, responses and some aborts - */ -static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, - struct sk_buff *skb) -{ - _enter("%p,%p", conn, skb); - - atomic_inc(&conn->usage); - skb_queue_tail(&conn->rx_queue, skb); - rxrpc_queue_conn(conn); -} - -/* - * post endpoint-level events to the local endpoint - * - this includes debug and version messages - */ -static void rxrpc_post_packet_to_local(struct rxrpc_local *local, - struct sk_buff *skb) -{ - _enter("%p,%p", local, skb); - - atomic_inc(&local->usage); - skb_queue_tail(&local->event_queue, skb); - rxrpc_queue_work(&local->event_processor); -} - -/* - * Extract the wire header from a packet and translate the byte order. - */ -static noinline -int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) -{ - struct rxrpc_wire_header whdr; - - /* dig out the RxRPC connection details */ - if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0) - return -EBADMSG; - if (!pskb_pull(skb, sizeof(whdr))) - BUG(); - - memset(sp, 0, sizeof(*sp)); - sp->hdr.epoch = ntohl(whdr.epoch); - sp->hdr.cid = ntohl(whdr.cid); - sp->hdr.callNumber = ntohl(whdr.callNumber); - sp->hdr.seq = ntohl(whdr.seq); - sp->hdr.serial = ntohl(whdr.serial); - sp->hdr.flags = whdr.flags; - sp->hdr.type = whdr.type; - sp->hdr.userStatus = whdr.userStatus; - sp->hdr.securityIndex = whdr.securityIndex; - sp->hdr._rsvd = ntohs(whdr._rsvd); - sp->hdr.serviceId = ntohs(whdr.serviceId); - return 0; -} - -static struct rxrpc_connection *rxrpc_conn_from_local(struct rxrpc_local *local, - struct sk_buff *skb, - struct rxrpc_skb_priv *sp) -{ - struct rxrpc_peer *peer; - struct rxrpc_transport *trans; - struct rxrpc_connection *conn; - - peer = rxrpc_find_peer(local, ip_hdr(skb)->saddr, - udp_hdr(skb)->source); - if (IS_ERR(peer)) - goto cant_find_conn; - - trans = rxrpc_find_transport(local, peer); - rxrpc_put_peer(peer); - if (!trans) - goto cant_find_conn; - - conn = rxrpc_find_connection(trans, &sp->hdr); - rxrpc_put_transport(trans); - if (!conn) - goto cant_find_conn; - - return conn; -cant_find_conn: - return NULL; -} - -/* - * handle data received on the local endpoint - * - may be called in interrupt context - */ -void rxrpc_data_ready(struct sock *sk) -{ - struct rxrpc_skb_priv *sp; - struct rxrpc_local *local; - struct sk_buff *skb; - int ret; - - _enter("%p", sk); - - ASSERT(!irqs_disabled()); - - read_lock_bh(&rxrpc_local_lock); - local = sk->sk_user_data; - if (local && atomic_read(&local->usage) > 0) - rxrpc_get_local(local); - else - local = NULL; - read_unlock_bh(&rxrpc_local_lock); - if (!local) { - _leave(" [local dead]"); - return; - } - - skb = skb_recv_datagram(sk, 0, 1, &ret); - if (!skb) { - rxrpc_put_local(local); - if (ret == -EAGAIN) - return; - _debug("UDP socket error %d", ret); - return; - } - - rxrpc_new_skb(skb); - - _net("recv skb %p", skb); - - /* we'll probably need to checksum it (didn't call sock_recvmsg) */ - if (skb_checksum_complete(skb)) { - rxrpc_free_skb(skb); - rxrpc_put_local(local); - __UDP_INC_STATS(&init_net, UDP_MIB_INERRORS, 0); - _leave(" [CSUM failed]"); - return; - } - - __UDP_INC_STATS(&init_net, UDP_MIB_INDATAGRAMS, 0); - - /* The socket buffer we have is owned by UDP, with UDP's data all over - * it, but we really want our own data there. - */ - skb_orphan(skb); - sp = rxrpc_skb(skb); - - _net("Rx UDP packet from %08x:%04hu", - ntohl(ip_hdr(skb)->saddr), ntohs(udp_hdr(skb)->source)); - - /* dig out the RxRPC connection details */ - if (rxrpc_extract_header(sp, skb) < 0) - goto bad_message; - - _net("Rx RxRPC %s ep=%x call=%x:%x", - sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient", - sp->hdr.epoch, sp->hdr.cid, sp->hdr.callNumber); - - if (sp->hdr.type >= RXRPC_N_PACKET_TYPES || - !((RXRPC_SUPPORTED_PACKET_TYPES >> sp->hdr.type) & 1)) { - _proto("Rx Bad Packet Type %u", sp->hdr.type); - goto bad_message; - } - - if (sp->hdr.type == RXRPC_PACKET_TYPE_VERSION) { - rxrpc_post_packet_to_local(local, skb); - goto out; - } - - if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && - (sp->hdr.callNumber == 0 || sp->hdr.seq == 0)) - goto bad_message; - - if (sp->hdr.callNumber == 0) { - /* This is a connection-level packet. These should be - * fairly rare, so the extra overhead of looking them up the - * old-fashioned way doesn't really hurt */ - struct rxrpc_connection *conn; - - conn = rxrpc_conn_from_local(local, skb, sp); - if (!conn) - goto cant_route_call; - - _debug("CONN %p {%d}", conn, conn->debug_id); - rxrpc_post_packet_to_conn(conn, skb); - rxrpc_put_connection(conn); - } else { - struct rxrpc_call *call; - - call = rxrpc_find_call_hash(&sp->hdr, local, - AF_INET, &ip_hdr(skb)->saddr); - if (call) - rxrpc_post_packet_to_call(call, skb); - else - goto cant_route_call; - } - -out: - rxrpc_put_local(local); - return; - -cant_route_call: - _debug("can't route call"); - if (sp->hdr.flags & RXRPC_CLIENT_INITIATED && - sp->hdr.type == RXRPC_PACKET_TYPE_DATA) { - if (sp->hdr.seq == 1) { - _debug("first packet"); - skb_queue_tail(&local->accept_queue, skb); - rxrpc_queue_work(&local->acceptor); - rxrpc_put_local(local); - _leave(" [incoming]"); - return; - } - skb->priority = RX_INVALID_OPERATION; - } else { - skb->priority = RX_CALL_DEAD; - } - - if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) { - _debug("reject type %d",sp->hdr.type); - rxrpc_reject_packet(local, skb); - } - rxrpc_put_local(local); - _leave(" [no call]"); - return; - -bad_message: - skb->priority = RX_PROTOCOL_ERROR; - rxrpc_reject_packet(local, skb); - rxrpc_put_local(local); - _leave(" [badmsg]"); -} diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c deleted file mode 100644 index 4ad56fafe3a7..000000000000 --- a/net/rxrpc/ar-key.c +++ /dev/null @@ -1,1237 +0,0 @@ -/* RxRPC key management - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * RxRPC keys should have a description of describing their purpose: - * "afs@CAMBRIDGE.REDHAT.COM> - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "ar-internal.h" - -static int rxrpc_vet_description_s(const char *); -static int rxrpc_preparse(struct key_preparsed_payload *); -static int rxrpc_preparse_s(struct key_preparsed_payload *); -static void rxrpc_free_preparse(struct key_preparsed_payload *); -static void rxrpc_free_preparse_s(struct key_preparsed_payload *); -static void rxrpc_destroy(struct key *); -static void rxrpc_destroy_s(struct key *); -static void rxrpc_describe(const struct key *, struct seq_file *); -static long rxrpc_read(const struct key *, char __user *, size_t); - -/* - * rxrpc defined keys take an arbitrary string as the description and an - * arbitrary blob of data as the payload - */ -struct key_type key_type_rxrpc = { - .name = "rxrpc", - .preparse = rxrpc_preparse, - .free_preparse = rxrpc_free_preparse, - .instantiate = generic_key_instantiate, - .destroy = rxrpc_destroy, - .describe = rxrpc_describe, - .read = rxrpc_read, -}; -EXPORT_SYMBOL(key_type_rxrpc); - -/* - * rxrpc server defined keys take ":" as the - * description and an 8-byte decryption key as the payload - */ -struct key_type key_type_rxrpc_s = { - .name = "rxrpc_s", - .vet_description = rxrpc_vet_description_s, - .preparse = rxrpc_preparse_s, - .free_preparse = rxrpc_free_preparse_s, - .instantiate = generic_key_instantiate, - .destroy = rxrpc_destroy_s, - .describe = rxrpc_describe, -}; - -/* - * Vet the description for an RxRPC server key - */ -static int rxrpc_vet_description_s(const char *desc) -{ - unsigned long num; - char *p; - - num = simple_strtoul(desc, &p, 10); - if (*p != ':' || num > 65535) - return -EINVAL; - num = simple_strtoul(p + 1, &p, 10); - if (*p || num < 1 || num > 255) - return -EINVAL; - return 0; -} - -/* - * parse an RxKAD type XDR format token - * - the caller guarantees we have at least 4 words - */ -static int rxrpc_preparse_xdr_rxkad(struct key_preparsed_payload *prep, - size_t datalen, - const __be32 *xdr, unsigned int toklen) -{ - struct rxrpc_key_token *token, **pptoken; - size_t plen; - u32 tktlen; - - _enter(",{%x,%x,%x,%x},%u", - ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), - toklen); - - if (toklen <= 8 * 4) - return -EKEYREJECTED; - tktlen = ntohl(xdr[7]); - _debug("tktlen: %x", tktlen); - if (tktlen > AFSTOKEN_RK_TIX_MAX) - return -EKEYREJECTED; - if (toklen < 8 * 4 + tktlen) - return -EKEYREJECTED; - - plen = sizeof(*token) + sizeof(*token->kad) + tktlen; - prep->quotalen = datalen + plen; - - plen -= sizeof(*token); - token = kzalloc(sizeof(*token), GFP_KERNEL); - if (!token) - return -ENOMEM; - - token->kad = kzalloc(plen, GFP_KERNEL); - if (!token->kad) { - kfree(token); - return -ENOMEM; - } - - token->security_index = RXRPC_SECURITY_RXKAD; - token->kad->ticket_len = tktlen; - token->kad->vice_id = ntohl(xdr[0]); - token->kad->kvno = ntohl(xdr[1]); - token->kad->start = ntohl(xdr[4]); - token->kad->expiry = ntohl(xdr[5]); - token->kad->primary_flag = ntohl(xdr[6]); - memcpy(&token->kad->session_key, &xdr[2], 8); - memcpy(&token->kad->ticket, &xdr[8], tktlen); - - _debug("SCIX: %u", token->security_index); - _debug("TLEN: %u", token->kad->ticket_len); - _debug("EXPY: %x", token->kad->expiry); - _debug("KVNO: %u", token->kad->kvno); - _debug("PRIM: %u", token->kad->primary_flag); - _debug("SKEY: %02x%02x%02x%02x%02x%02x%02x%02x", - token->kad->session_key[0], token->kad->session_key[1], - token->kad->session_key[2], token->kad->session_key[3], - token->kad->session_key[4], token->kad->session_key[5], - token->kad->session_key[6], token->kad->session_key[7]); - if (token->kad->ticket_len >= 8) - _debug("TCKT: %02x%02x%02x%02x%02x%02x%02x%02x", - token->kad->ticket[0], token->kad->ticket[1], - token->kad->ticket[2], token->kad->ticket[3], - token->kad->ticket[4], token->kad->ticket[5], - token->kad->ticket[6], token->kad->ticket[7]); - - /* count the number of tokens attached */ - prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1); - - /* attach the data */ - for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0]; - *pptoken; - pptoken = &(*pptoken)->next) - continue; - *pptoken = token; - if (token->kad->expiry < prep->expiry) - prep->expiry = token->kad->expiry; - - _leave(" = 0"); - return 0; -} - -static void rxrpc_free_krb5_principal(struct krb5_principal *princ) -{ - int loop; - - if (princ->name_parts) { - for (loop = princ->n_name_parts - 1; loop >= 0; loop--) - kfree(princ->name_parts[loop]); - kfree(princ->name_parts); - } - kfree(princ->realm); -} - -static void rxrpc_free_krb5_tagged(struct krb5_tagged_data *td) -{ - kfree(td->data); -} - -/* - * free up an RxK5 token - */ -static void rxrpc_rxk5_free(struct rxk5_key *rxk5) -{ - int loop; - - rxrpc_free_krb5_principal(&rxk5->client); - rxrpc_free_krb5_principal(&rxk5->server); - rxrpc_free_krb5_tagged(&rxk5->session); - - if (rxk5->addresses) { - for (loop = rxk5->n_addresses - 1; loop >= 0; loop--) - rxrpc_free_krb5_tagged(&rxk5->addresses[loop]); - kfree(rxk5->addresses); - } - if (rxk5->authdata) { - for (loop = rxk5->n_authdata - 1; loop >= 0; loop--) - rxrpc_free_krb5_tagged(&rxk5->authdata[loop]); - kfree(rxk5->authdata); - } - - kfree(rxk5->ticket); - kfree(rxk5->ticket2); - kfree(rxk5); -} - -/* - * extract a krb5 principal - */ -static int rxrpc_krb5_decode_principal(struct krb5_principal *princ, - const __be32 **_xdr, - unsigned int *_toklen) -{ - const __be32 *xdr = *_xdr; - unsigned int toklen = *_toklen, n_parts, loop, tmp; - - /* there must be at least one name, and at least #names+1 length - * words */ - if (toklen <= 12) - return -EINVAL; - - _enter(",{%x,%x,%x},%u", - ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), toklen); - - n_parts = ntohl(*xdr++); - toklen -= 4; - if (n_parts <= 0 || n_parts > AFSTOKEN_K5_COMPONENTS_MAX) - return -EINVAL; - princ->n_name_parts = n_parts; - - if (toklen <= (n_parts + 1) * 4) - return -EINVAL; - - princ->name_parts = kcalloc(n_parts, sizeof(char *), GFP_KERNEL); - if (!princ->name_parts) - return -ENOMEM; - - for (loop = 0; loop < n_parts; loop++) { - if (toklen < 4) - return -EINVAL; - tmp = ntohl(*xdr++); - toklen -= 4; - if (tmp <= 0 || tmp > AFSTOKEN_STRING_MAX) - return -EINVAL; - if (tmp > toklen) - return -EINVAL; - princ->name_parts[loop] = kmalloc(tmp + 1, GFP_KERNEL); - if (!princ->name_parts[loop]) - return -ENOMEM; - memcpy(princ->name_parts[loop], xdr, tmp); - princ->name_parts[loop][tmp] = 0; - tmp = (tmp + 3) & ~3; - toklen -= tmp; - xdr += tmp >> 2; - } - - if (toklen < 4) - return -EINVAL; - tmp = ntohl(*xdr++); - toklen -= 4; - if (tmp <= 0 || tmp > AFSTOKEN_K5_REALM_MAX) - return -EINVAL; - if (tmp > toklen) - return -EINVAL; - princ->realm = kmalloc(tmp + 1, GFP_KERNEL); - if (!princ->realm) - return -ENOMEM; - memcpy(princ->realm, xdr, tmp); - princ->realm[tmp] = 0; - tmp = (tmp + 3) & ~3; - toklen -= tmp; - xdr += tmp >> 2; - - _debug("%s/...@%s", princ->name_parts[0], princ->realm); - - *_xdr = xdr; - *_toklen = toklen; - _leave(" = 0 [toklen=%u]", toklen); - return 0; -} - -/* - * extract a piece of krb5 tagged data - */ -static int rxrpc_krb5_decode_tagged_data(struct krb5_tagged_data *td, - size_t max_data_size, - const __be32 **_xdr, - unsigned int *_toklen) -{ - const __be32 *xdr = *_xdr; - unsigned int toklen = *_toklen, len; - - /* there must be at least one tag and one length word */ - if (toklen <= 8) - return -EINVAL; - - _enter(",%zu,{%x,%x},%u", - max_data_size, ntohl(xdr[0]), ntohl(xdr[1]), toklen); - - td->tag = ntohl(*xdr++); - len = ntohl(*xdr++); - toklen -= 8; - if (len > max_data_size) - return -EINVAL; - td->data_len = len; - - if (len > 0) { - td->data = kmemdup(xdr, len, GFP_KERNEL); - if (!td->data) - return -ENOMEM; - len = (len + 3) & ~3; - toklen -= len; - xdr += len >> 2; - } - - _debug("tag %x len %x", td->tag, td->data_len); - - *_xdr = xdr; - *_toklen = toklen; - _leave(" = 0 [toklen=%u]", toklen); - return 0; -} - -/* - * extract an array of tagged data - */ -static int rxrpc_krb5_decode_tagged_array(struct krb5_tagged_data **_td, - u8 *_n_elem, - u8 max_n_elem, - size_t max_elem_size, - const __be32 **_xdr, - unsigned int *_toklen) -{ - struct krb5_tagged_data *td; - const __be32 *xdr = *_xdr; - unsigned int toklen = *_toklen, n_elem, loop; - int ret; - - /* there must be at least one count */ - if (toklen < 4) - return -EINVAL; - - _enter(",,%u,%zu,{%x},%u", - max_n_elem, max_elem_size, ntohl(xdr[0]), toklen); - - n_elem = ntohl(*xdr++); - toklen -= 4; - if (n_elem > max_n_elem) - return -EINVAL; - *_n_elem = n_elem; - if (n_elem > 0) { - if (toklen <= (n_elem + 1) * 4) - return -EINVAL; - - _debug("n_elem %d", n_elem); - - td = kcalloc(n_elem, sizeof(struct krb5_tagged_data), - GFP_KERNEL); - if (!td) - return -ENOMEM; - *_td = td; - - for (loop = 0; loop < n_elem; loop++) { - ret = rxrpc_krb5_decode_tagged_data(&td[loop], - max_elem_size, - &xdr, &toklen); - if (ret < 0) - return ret; - } - } - - *_xdr = xdr; - *_toklen = toklen; - _leave(" = 0 [toklen=%u]", toklen); - return 0; -} - -/* - * extract a krb5 ticket - */ -static int rxrpc_krb5_decode_ticket(u8 **_ticket, u16 *_tktlen, - const __be32 **_xdr, unsigned int *_toklen) -{ - const __be32 *xdr = *_xdr; - unsigned int toklen = *_toklen, len; - - /* there must be at least one length word */ - if (toklen <= 4) - return -EINVAL; - - _enter(",{%x},%u", ntohl(xdr[0]), toklen); - - len = ntohl(*xdr++); - toklen -= 4; - if (len > AFSTOKEN_K5_TIX_MAX) - return -EINVAL; - *_tktlen = len; - - _debug("ticket len %u", len); - - if (len > 0) { - *_ticket = kmemdup(xdr, len, GFP_KERNEL); - if (!*_ticket) - return -ENOMEM; - len = (len + 3) & ~3; - toklen -= len; - xdr += len >> 2; - } - - *_xdr = xdr; - *_toklen = toklen; - _leave(" = 0 [toklen=%u]", toklen); - return 0; -} - -/* - * parse an RxK5 type XDR format token - * - the caller guarantees we have at least 4 words - */ -static int rxrpc_preparse_xdr_rxk5(struct key_preparsed_payload *prep, - size_t datalen, - const __be32 *xdr, unsigned int toklen) -{ - struct rxrpc_key_token *token, **pptoken; - struct rxk5_key *rxk5; - const __be32 *end_xdr = xdr + (toklen >> 2); - int ret; - - _enter(",{%x,%x,%x,%x},%u", - ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), - toklen); - - /* reserve some payload space for this subkey - the length of the token - * is a reasonable approximation */ - prep->quotalen = datalen + toklen; - - token = kzalloc(sizeof(*token), GFP_KERNEL); - if (!token) - return -ENOMEM; - - rxk5 = kzalloc(sizeof(*rxk5), GFP_KERNEL); - if (!rxk5) { - kfree(token); - return -ENOMEM; - } - - token->security_index = RXRPC_SECURITY_RXK5; - token->k5 = rxk5; - - /* extract the principals */ - ret = rxrpc_krb5_decode_principal(&rxk5->client, &xdr, &toklen); - if (ret < 0) - goto error; - ret = rxrpc_krb5_decode_principal(&rxk5->server, &xdr, &toklen); - if (ret < 0) - goto error; - - /* extract the session key and the encoding type (the tag field -> - * ENCTYPE_xxx) */ - ret = rxrpc_krb5_decode_tagged_data(&rxk5->session, AFSTOKEN_DATA_MAX, - &xdr, &toklen); - if (ret < 0) - goto error; - - if (toklen < 4 * 8 + 2 * 4) - goto inval; - rxk5->authtime = be64_to_cpup((const __be64 *) xdr); - xdr += 2; - rxk5->starttime = be64_to_cpup((const __be64 *) xdr); - xdr += 2; - rxk5->endtime = be64_to_cpup((const __be64 *) xdr); - xdr += 2; - rxk5->renew_till = be64_to_cpup((const __be64 *) xdr); - xdr += 2; - rxk5->is_skey = ntohl(*xdr++); - rxk5->flags = ntohl(*xdr++); - toklen -= 4 * 8 + 2 * 4; - - _debug("times: a=%llx s=%llx e=%llx rt=%llx", - rxk5->authtime, rxk5->starttime, rxk5->endtime, - rxk5->renew_till); - _debug("is_skey=%x flags=%x", rxk5->is_skey, rxk5->flags); - - /* extract the permitted client addresses */ - ret = rxrpc_krb5_decode_tagged_array(&rxk5->addresses, - &rxk5->n_addresses, - AFSTOKEN_K5_ADDRESSES_MAX, - AFSTOKEN_DATA_MAX, - &xdr, &toklen); - if (ret < 0) - goto error; - - ASSERTCMP((end_xdr - xdr) << 2, ==, toklen); - - /* extract the tickets */ - ret = rxrpc_krb5_decode_ticket(&rxk5->ticket, &rxk5->ticket_len, - &xdr, &toklen); - if (ret < 0) - goto error; - ret = rxrpc_krb5_decode_ticket(&rxk5->ticket2, &rxk5->ticket2_len, - &xdr, &toklen); - if (ret < 0) - goto error; - - ASSERTCMP((end_xdr - xdr) << 2, ==, toklen); - - /* extract the typed auth data */ - ret = rxrpc_krb5_decode_tagged_array(&rxk5->authdata, - &rxk5->n_authdata, - AFSTOKEN_K5_AUTHDATA_MAX, - AFSTOKEN_BDATALN_MAX, - &xdr, &toklen); - if (ret < 0) - goto error; - - ASSERTCMP((end_xdr - xdr) << 2, ==, toklen); - - if (toklen != 0) - goto inval; - - /* attach the payload */ - for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0]; - *pptoken; - pptoken = &(*pptoken)->next) - continue; - *pptoken = token; - if (token->kad->expiry < prep->expiry) - prep->expiry = token->kad->expiry; - - _leave(" = 0"); - return 0; - -inval: - ret = -EINVAL; -error: - rxrpc_rxk5_free(rxk5); - kfree(token); - _leave(" = %d", ret); - return ret; -} - -/* - * attempt to parse the data as the XDR format - * - the caller guarantees we have more than 7 words - */ -static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep) -{ - const __be32 *xdr = prep->data, *token; - const char *cp; - unsigned int len, tmp, loop, ntoken, toklen, sec_ix; - size_t datalen = prep->datalen; - int ret; - - _enter(",{%x,%x,%x,%x},%zu", - ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), - prep->datalen); - - if (datalen > AFSTOKEN_LENGTH_MAX) - goto not_xdr; - - /* XDR is an array of __be32's */ - if (datalen & 3) - goto not_xdr; - - /* the flags should be 0 (the setpag bit must be handled by - * userspace) */ - if (ntohl(*xdr++) != 0) - goto not_xdr; - datalen -= 4; - - /* check the cell name */ - len = ntohl(*xdr++); - if (len < 1 || len > AFSTOKEN_CELL_MAX) - goto not_xdr; - datalen -= 4; - tmp = (len + 3) & ~3; - if (tmp > datalen) - goto not_xdr; - - cp = (const char *) xdr; - for (loop = 0; loop < len; loop++) - if (!isprint(cp[loop])) - goto not_xdr; - if (len < tmp) - for (; loop < tmp; loop++) - if (cp[loop]) - goto not_xdr; - _debug("cellname: [%u/%u] '%*.*s'", - len, tmp, len, len, (const char *) xdr); - datalen -= tmp; - xdr += tmp >> 2; - - /* get the token count */ - if (datalen < 12) - goto not_xdr; - ntoken = ntohl(*xdr++); - datalen -= 4; - _debug("ntoken: %x", ntoken); - if (ntoken < 1 || ntoken > AFSTOKEN_MAX) - goto not_xdr; - - /* check each token wrapper */ - token = xdr; - loop = ntoken; - do { - if (datalen < 8) - goto not_xdr; - toklen = ntohl(*xdr++); - sec_ix = ntohl(*xdr); - datalen -= 4; - _debug("token: [%x/%zx] %x", toklen, datalen, sec_ix); - if (toklen < 20 || toklen > datalen) - goto not_xdr; - datalen -= (toklen + 3) & ~3; - xdr += (toklen + 3) >> 2; - - } while (--loop > 0); - - _debug("remainder: %zu", datalen); - if (datalen != 0) - goto not_xdr; - - /* okay: we're going to assume it's valid XDR format - * - we ignore the cellname, relying on the key to be correctly named - */ - do { - xdr = token; - toklen = ntohl(*xdr++); - token = xdr + ((toklen + 3) >> 2); - sec_ix = ntohl(*xdr++); - toklen -= 4; - - _debug("TOKEN type=%u [%p-%p]", sec_ix, xdr, token); - - switch (sec_ix) { - case RXRPC_SECURITY_RXKAD: - ret = rxrpc_preparse_xdr_rxkad(prep, datalen, xdr, toklen); - if (ret != 0) - goto error; - break; - - case RXRPC_SECURITY_RXK5: - ret = rxrpc_preparse_xdr_rxk5(prep, datalen, xdr, toklen); - if (ret != 0) - goto error; - break; - - default: - ret = -EPROTONOSUPPORT; - goto error; - } - - } while (--ntoken > 0); - - _leave(" = 0"); - return 0; - -not_xdr: - _leave(" = -EPROTO"); - return -EPROTO; -error: - _leave(" = %d", ret); - return ret; -} - -/* - * Preparse an rxrpc defined key. - * - * Data should be of the form: - * OFFSET LEN CONTENT - * 0 4 key interface version number - * 4 2 security index (type) - * 6 2 ticket length - * 8 4 key expiry time (time_t) - * 12 4 kvno - * 16 8 session key - * 24 [len] ticket - * - * if no data is provided, then a no-security key is made - */ -static int rxrpc_preparse(struct key_preparsed_payload *prep) -{ - const struct rxrpc_key_data_v1 *v1; - struct rxrpc_key_token *token, **pp; - size_t plen; - u32 kver; - int ret; - - _enter("%zu", prep->datalen); - - /* handle a no-security key */ - if (!prep->data && prep->datalen == 0) - return 0; - - /* determine if the XDR payload format is being used */ - if (prep->datalen > 7 * 4) { - ret = rxrpc_preparse_xdr(prep); - if (ret != -EPROTO) - return ret; - } - - /* get the key interface version number */ - ret = -EINVAL; - if (prep->datalen <= 4 || !prep->data) - goto error; - memcpy(&kver, prep->data, sizeof(kver)); - prep->data += sizeof(kver); - prep->datalen -= sizeof(kver); - - _debug("KEY I/F VERSION: %u", kver); - - ret = -EKEYREJECTED; - if (kver != 1) - goto error; - - /* deal with a version 1 key */ - ret = -EINVAL; - if (prep->datalen < sizeof(*v1)) - goto error; - - v1 = prep->data; - if (prep->datalen != sizeof(*v1) + v1->ticket_length) - goto error; - - _debug("SCIX: %u", v1->security_index); - _debug("TLEN: %u", v1->ticket_length); - _debug("EXPY: %x", v1->expiry); - _debug("KVNO: %u", v1->kvno); - _debug("SKEY: %02x%02x%02x%02x%02x%02x%02x%02x", - v1->session_key[0], v1->session_key[1], - v1->session_key[2], v1->session_key[3], - v1->session_key[4], v1->session_key[5], - v1->session_key[6], v1->session_key[7]); - if (v1->ticket_length >= 8) - _debug("TCKT: %02x%02x%02x%02x%02x%02x%02x%02x", - v1->ticket[0], v1->ticket[1], - v1->ticket[2], v1->ticket[3], - v1->ticket[4], v1->ticket[5], - v1->ticket[6], v1->ticket[7]); - - ret = -EPROTONOSUPPORT; - if (v1->security_index != RXRPC_SECURITY_RXKAD) - goto error; - - plen = sizeof(*token->kad) + v1->ticket_length; - prep->quotalen = plen + sizeof(*token); - - ret = -ENOMEM; - token = kzalloc(sizeof(*token), GFP_KERNEL); - if (!token) - goto error; - token->kad = kzalloc(plen, GFP_KERNEL); - if (!token->kad) - goto error_free; - - token->security_index = RXRPC_SECURITY_RXKAD; - token->kad->ticket_len = v1->ticket_length; - token->kad->expiry = v1->expiry; - token->kad->kvno = v1->kvno; - memcpy(&token->kad->session_key, &v1->session_key, 8); - memcpy(&token->kad->ticket, v1->ticket, v1->ticket_length); - - /* count the number of tokens attached */ - prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1); - - /* attach the data */ - pp = (struct rxrpc_key_token **)&prep->payload.data[0]; - while (*pp) - pp = &(*pp)->next; - *pp = token; - if (token->kad->expiry < prep->expiry) - prep->expiry = token->kad->expiry; - token = NULL; - ret = 0; - -error_free: - kfree(token); -error: - return ret; -} - -/* - * Free token list. - */ -static void rxrpc_free_token_list(struct rxrpc_key_token *token) -{ - struct rxrpc_key_token *next; - - for (; token; token = next) { - next = token->next; - switch (token->security_index) { - case RXRPC_SECURITY_RXKAD: - kfree(token->kad); - break; - case RXRPC_SECURITY_RXK5: - if (token->k5) - rxrpc_rxk5_free(token->k5); - break; - default: - pr_err("Unknown token type %x on rxrpc key\n", - token->security_index); - BUG(); - } - - kfree(token); - } -} - -/* - * Clean up preparse data. - */ -static void rxrpc_free_preparse(struct key_preparsed_payload *prep) -{ - rxrpc_free_token_list(prep->payload.data[0]); -} - -/* - * Preparse a server secret key. - * - * The data should be the 8-byte secret key. - */ -static int rxrpc_preparse_s(struct key_preparsed_payload *prep) -{ - struct crypto_skcipher *ci; - - _enter("%zu", prep->datalen); - - if (prep->datalen != 8) - return -EINVAL; - - memcpy(&prep->payload.data[2], prep->data, 8); - - ci = crypto_alloc_skcipher("pcbc(des)", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(ci)) { - _leave(" = %ld", PTR_ERR(ci)); - return PTR_ERR(ci); - } - - if (crypto_skcipher_setkey(ci, prep->data, 8) < 0) - BUG(); - - prep->payload.data[0] = ci; - _leave(" = 0"); - return 0; -} - -/* - * Clean up preparse data. - */ -static void rxrpc_free_preparse_s(struct key_preparsed_payload *prep) -{ - if (prep->payload.data[0]) - crypto_free_skcipher(prep->payload.data[0]); -} - -/* - * dispose of the data dangling from the corpse of a rxrpc key - */ -static void rxrpc_destroy(struct key *key) -{ - rxrpc_free_token_list(key->payload.data[0]); -} - -/* - * dispose of the data dangling from the corpse of a rxrpc key - */ -static void rxrpc_destroy_s(struct key *key) -{ - if (key->payload.data[0]) { - crypto_free_skcipher(key->payload.data[0]); - key->payload.data[0] = NULL; - } -} - -/* - * describe the rxrpc key - */ -static void rxrpc_describe(const struct key *key, struct seq_file *m) -{ - seq_puts(m, key->description); -} - -/* - * grab the security key for a socket - */ -int rxrpc_request_key(struct rxrpc_sock *rx, char __user *optval, int optlen) -{ - struct key *key; - char *description; - - _enter(""); - - if (optlen <= 0 || optlen > PAGE_SIZE - 1) - return -EINVAL; - - description = memdup_user_nul(optval, optlen); - if (IS_ERR(description)) - return PTR_ERR(description); - - key = request_key(&key_type_rxrpc, description, NULL); - if (IS_ERR(key)) { - kfree(description); - _leave(" = %ld", PTR_ERR(key)); - return PTR_ERR(key); - } - - rx->key = key; - kfree(description); - _leave(" = 0 [key %x]", key->serial); - return 0; -} - -/* - * grab the security keyring for a server socket - */ -int rxrpc_server_keyring(struct rxrpc_sock *rx, char __user *optval, - int optlen) -{ - struct key *key; - char *description; - - _enter(""); - - if (optlen <= 0 || optlen > PAGE_SIZE - 1) - return -EINVAL; - - description = memdup_user_nul(optval, optlen); - if (IS_ERR(description)) - return PTR_ERR(description); - - key = request_key(&key_type_keyring, description, NULL); - if (IS_ERR(key)) { - kfree(description); - _leave(" = %ld", PTR_ERR(key)); - return PTR_ERR(key); - } - - rx->securities = key; - kfree(description); - _leave(" = 0 [key %x]", key->serial); - return 0; -} - -/* - * generate a server data key - */ -int rxrpc_get_server_data_key(struct rxrpc_connection *conn, - const void *session_key, - time_t expiry, - u32 kvno) -{ - const struct cred *cred = current_cred(); - struct key *key; - int ret; - - struct { - u32 kver; - struct rxrpc_key_data_v1 v1; - } data; - - _enter(""); - - key = key_alloc(&key_type_rxrpc, "x", - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, 0, - KEY_ALLOC_NOT_IN_QUOTA, NULL); - if (IS_ERR(key)) { - _leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key)); - return -ENOMEM; - } - - _debug("key %d", key_serial(key)); - - data.kver = 1; - data.v1.security_index = RXRPC_SECURITY_RXKAD; - data.v1.ticket_length = 0; - data.v1.expiry = expiry; - data.v1.kvno = 0; - - memcpy(&data.v1.session_key, session_key, sizeof(data.v1.session_key)); - - ret = key_instantiate_and_link(key, &data, sizeof(data), NULL, NULL); - if (ret < 0) - goto error; - - conn->key = key; - _leave(" = 0 [%d]", key_serial(key)); - return 0; - -error: - key_revoke(key); - key_put(key); - _leave(" = -ENOMEM [ins %d]", ret); - return -ENOMEM; -} -EXPORT_SYMBOL(rxrpc_get_server_data_key); - -/** - * rxrpc_get_null_key - Generate a null RxRPC key - * @keyname: The name to give the key. - * - * Generate a null RxRPC key that can be used to indicate anonymous security is - * required for a particular domain. - */ -struct key *rxrpc_get_null_key(const char *keyname) -{ - const struct cred *cred = current_cred(); - struct key *key; - int ret; - - key = key_alloc(&key_type_rxrpc, keyname, - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, - KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA, NULL); - if (IS_ERR(key)) - return key; - - ret = key_instantiate_and_link(key, NULL, 0, NULL, NULL); - if (ret < 0) { - key_revoke(key); - key_put(key); - return ERR_PTR(ret); - } - - return key; -} -EXPORT_SYMBOL(rxrpc_get_null_key); - -/* - * read the contents of an rxrpc key - * - this returns the result in XDR form - */ -static long rxrpc_read(const struct key *key, - char __user *buffer, size_t buflen) -{ - const struct rxrpc_key_token *token; - const struct krb5_principal *princ; - size_t size; - __be32 __user *xdr, *oldxdr; - u32 cnlen, toksize, ntoks, tok, zero; - u16 toksizes[AFSTOKEN_MAX]; - int loop; - - _enter(""); - - /* we don't know what form we should return non-AFS keys in */ - if (memcmp(key->description, "afs@", 4) != 0) - return -EOPNOTSUPP; - cnlen = strlen(key->description + 4); - -#define RND(X) (((X) + 3) & ~3) - - /* AFS keys we return in XDR form, so we need to work out the size of - * the XDR */ - size = 2 * 4; /* flags, cellname len */ - size += RND(cnlen); /* cellname */ - size += 1 * 4; /* token count */ - - ntoks = 0; - for (token = key->payload.data[0]; token; token = token->next) { - toksize = 4; /* sec index */ - - switch (token->security_index) { - case RXRPC_SECURITY_RXKAD: - toksize += 8 * 4; /* viceid, kvno, key*2, begin, - * end, primary, tktlen */ - toksize += RND(token->kad->ticket_len); - break; - - case RXRPC_SECURITY_RXK5: - princ = &token->k5->client; - toksize += 4 + princ->n_name_parts * 4; - for (loop = 0; loop < princ->n_name_parts; loop++) - toksize += RND(strlen(princ->name_parts[loop])); - toksize += 4 + RND(strlen(princ->realm)); - - princ = &token->k5->server; - toksize += 4 + princ->n_name_parts * 4; - for (loop = 0; loop < princ->n_name_parts; loop++) - toksize += RND(strlen(princ->name_parts[loop])); - toksize += 4 + RND(strlen(princ->realm)); - - toksize += 8 + RND(token->k5->session.data_len); - - toksize += 4 * 8 + 2 * 4; - - toksize += 4 + token->k5->n_addresses * 8; - for (loop = 0; loop < token->k5->n_addresses; loop++) - toksize += RND(token->k5->addresses[loop].data_len); - - toksize += 4 + RND(token->k5->ticket_len); - toksize += 4 + RND(token->k5->ticket2_len); - - toksize += 4 + token->k5->n_authdata * 8; - for (loop = 0; loop < token->k5->n_authdata; loop++) - toksize += RND(token->k5->authdata[loop].data_len); - break; - - default: /* we have a ticket we can't encode */ - BUG(); - continue; - } - - _debug("token[%u]: toksize=%u", ntoks, toksize); - ASSERTCMP(toksize, <=, AFSTOKEN_LENGTH_MAX); - - toksizes[ntoks++] = toksize; - size += toksize + 4; /* each token has a length word */ - } - -#undef RND - - if (!buffer || buflen < size) - return size; - - xdr = (__be32 __user *) buffer; - zero = 0; -#define ENCODE(x) \ - do { \ - __be32 y = htonl(x); \ - if (put_user(y, xdr++) < 0) \ - goto fault; \ - } while(0) -#define ENCODE_DATA(l, s) \ - do { \ - u32 _l = (l); \ - ENCODE(l); \ - if (copy_to_user(xdr, (s), _l) != 0) \ - goto fault; \ - if (_l & 3 && \ - copy_to_user((u8 __user *)xdr + _l, &zero, 4 - (_l & 3)) != 0) \ - goto fault; \ - xdr += (_l + 3) >> 2; \ - } while(0) -#define ENCODE64(x) \ - do { \ - __be64 y = cpu_to_be64(x); \ - if (copy_to_user(xdr, &y, 8) != 0) \ - goto fault; \ - xdr += 8 >> 2; \ - } while(0) -#define ENCODE_STR(s) \ - do { \ - const char *_s = (s); \ - ENCODE_DATA(strlen(_s), _s); \ - } while(0) - - ENCODE(0); /* flags */ - ENCODE_DATA(cnlen, key->description + 4); /* cellname */ - ENCODE(ntoks); - - tok = 0; - for (token = key->payload.data[0]; token; token = token->next) { - toksize = toksizes[tok++]; - ENCODE(toksize); - oldxdr = xdr; - ENCODE(token->security_index); - - switch (token->security_index) { - case RXRPC_SECURITY_RXKAD: - ENCODE(token->kad->vice_id); - ENCODE(token->kad->kvno); - ENCODE_DATA(8, token->kad->session_key); - ENCODE(token->kad->start); - ENCODE(token->kad->expiry); - ENCODE(token->kad->primary_flag); - ENCODE_DATA(token->kad->ticket_len, token->kad->ticket); - break; - - case RXRPC_SECURITY_RXK5: - princ = &token->k5->client; - ENCODE(princ->n_name_parts); - for (loop = 0; loop < princ->n_name_parts; loop++) - ENCODE_STR(princ->name_parts[loop]); - ENCODE_STR(princ->realm); - - princ = &token->k5->server; - ENCODE(princ->n_name_parts); - for (loop = 0; loop < princ->n_name_parts; loop++) - ENCODE_STR(princ->name_parts[loop]); - ENCODE_STR(princ->realm); - - ENCODE(token->k5->session.tag); - ENCODE_DATA(token->k5->session.data_len, - token->k5->session.data); - - ENCODE64(token->k5->authtime); - ENCODE64(token->k5->starttime); - ENCODE64(token->k5->endtime); - ENCODE64(token->k5->renew_till); - ENCODE(token->k5->is_skey); - ENCODE(token->k5->flags); - - ENCODE(token->k5->n_addresses); - for (loop = 0; loop < token->k5->n_addresses; loop++) { - ENCODE(token->k5->addresses[loop].tag); - ENCODE_DATA(token->k5->addresses[loop].data_len, - token->k5->addresses[loop].data); - } - - ENCODE_DATA(token->k5->ticket_len, token->k5->ticket); - ENCODE_DATA(token->k5->ticket2_len, token->k5->ticket2); - - ENCODE(token->k5->n_authdata); - for (loop = 0; loop < token->k5->n_authdata; loop++) { - ENCODE(token->k5->authdata[loop].tag); - ENCODE_DATA(token->k5->authdata[loop].data_len, - token->k5->authdata[loop].data); - } - break; - - default: - BUG(); - break; - } - - ASSERTCMP((unsigned long)xdr - (unsigned long)oldxdr, ==, - toksize); - } - -#undef ENCODE_STR -#undef ENCODE_DATA -#undef ENCODE64 -#undef ENCODE - - ASSERTCMP(tok, ==, ntoks); - ASSERTCMP((char __user *) xdr - buffer, ==, size); - _leave(" = %zu", size); - return size; - -fault: - _leave(" = -EFAULT"); - return -EFAULT; -} diff --git a/net/rxrpc/ar-local.c b/net/rxrpc/ar-local.c deleted file mode 100644 index 111f250b045f..000000000000 --- a/net/rxrpc/ar-local.c +++ /dev/null @@ -1,417 +0,0 @@ -/* AF_RXRPC local endpoint management - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "ar-internal.h" - -static const char rxrpc_version_string[65] = "linux-" UTS_RELEASE " AF_RXRPC"; - -static LIST_HEAD(rxrpc_locals); -DEFINE_RWLOCK(rxrpc_local_lock); -static DECLARE_RWSEM(rxrpc_local_sem); -static DECLARE_WAIT_QUEUE_HEAD(rxrpc_local_wq); - -static void rxrpc_destroy_local(struct work_struct *work); -static void rxrpc_process_local_events(struct work_struct *work); - -/* - * allocate a new local - */ -static -struct rxrpc_local *rxrpc_alloc_local(struct sockaddr_rxrpc *srx) -{ - struct rxrpc_local *local; - - local = kzalloc(sizeof(struct rxrpc_local), GFP_KERNEL); - if (local) { - INIT_WORK(&local->destroyer, &rxrpc_destroy_local); - INIT_WORK(&local->acceptor, &rxrpc_accept_incoming_calls); - INIT_WORK(&local->rejecter, &rxrpc_reject_packets); - INIT_WORK(&local->event_processor, &rxrpc_process_local_events); - INIT_LIST_HEAD(&local->services); - INIT_LIST_HEAD(&local->link); - init_rwsem(&local->defrag_sem); - skb_queue_head_init(&local->accept_queue); - skb_queue_head_init(&local->reject_queue); - skb_queue_head_init(&local->event_queue); - spin_lock_init(&local->lock); - rwlock_init(&local->services_lock); - atomic_set(&local->usage, 1); - local->debug_id = atomic_inc_return(&rxrpc_debug_id); - memcpy(&local->srx, srx, sizeof(*srx)); - } - - _leave(" = %p", local); - return local; -} - -/* - * create the local socket - * - must be called with rxrpc_local_sem writelocked - */ -static int rxrpc_create_local(struct rxrpc_local *local) -{ - struct sock *sock; - int ret, opt; - - _enter("%p{%d}", local, local->srx.transport_type); - - /* create a socket to represent the local endpoint */ - ret = sock_create_kern(&init_net, PF_INET, local->srx.transport_type, - IPPROTO_UDP, &local->socket); - if (ret < 0) { - _leave(" = %d [socket]", ret); - return ret; - } - - /* if a local address was supplied then bind it */ - if (local->srx.transport_len > sizeof(sa_family_t)) { - _debug("bind"); - ret = kernel_bind(local->socket, - (struct sockaddr *) &local->srx.transport, - local->srx.transport_len); - if (ret < 0) { - _debug("bind failed"); - goto error; - } - } - - /* we want to receive ICMP errors */ - opt = 1; - ret = kernel_setsockopt(local->socket, SOL_IP, IP_RECVERR, - (char *) &opt, sizeof(opt)); - if (ret < 0) { - _debug("setsockopt failed"); - goto error; - } - - /* we want to set the don't fragment bit */ - opt = IP_PMTUDISC_DO; - ret = kernel_setsockopt(local->socket, SOL_IP, IP_MTU_DISCOVER, - (char *) &opt, sizeof(opt)); - if (ret < 0) { - _debug("setsockopt failed"); - goto error; - } - - write_lock_bh(&rxrpc_local_lock); - list_add(&local->link, &rxrpc_locals); - write_unlock_bh(&rxrpc_local_lock); - - /* set the socket up */ - sock = local->socket->sk; - sock->sk_user_data = local; - sock->sk_data_ready = rxrpc_data_ready; - sock->sk_error_report = rxrpc_UDP_error_report; - _leave(" = 0"); - return 0; - -error: - kernel_sock_shutdown(local->socket, SHUT_RDWR); - local->socket->sk->sk_user_data = NULL; - sock_release(local->socket); - local->socket = NULL; - - _leave(" = %d", ret); - return ret; -} - -/* - * create a new local endpoint using the specified UDP address - */ -struct rxrpc_local *rxrpc_lookup_local(struct sockaddr_rxrpc *srx) -{ - struct rxrpc_local *local; - int ret; - - _enter("{%d,%u,%pI4+%hu}", - srx->transport_type, - srx->transport.family, - &srx->transport.sin.sin_addr, - ntohs(srx->transport.sin.sin_port)); - - down_write(&rxrpc_local_sem); - - /* see if we have a suitable local local endpoint already */ - read_lock_bh(&rxrpc_local_lock); - - list_for_each_entry(local, &rxrpc_locals, link) { - _debug("CMP {%d,%u,%pI4+%hu}", - local->srx.transport_type, - local->srx.transport.family, - &local->srx.transport.sin.sin_addr, - ntohs(local->srx.transport.sin.sin_port)); - - if (local->srx.transport_type != srx->transport_type || - local->srx.transport.family != srx->transport.family) - continue; - - switch (srx->transport.family) { - case AF_INET: - if (local->srx.transport.sin.sin_port != - srx->transport.sin.sin_port) - continue; - if (memcmp(&local->srx.transport.sin.sin_addr, - &srx->transport.sin.sin_addr, - sizeof(struct in_addr)) != 0) - continue; - goto found_local; - - default: - BUG(); - } - } - - read_unlock_bh(&rxrpc_local_lock); - - /* we didn't find one, so we need to create one */ - local = rxrpc_alloc_local(srx); - if (!local) { - up_write(&rxrpc_local_sem); - return ERR_PTR(-ENOMEM); - } - - ret = rxrpc_create_local(local); - if (ret < 0) { - up_write(&rxrpc_local_sem); - kfree(local); - _leave(" = %d", ret); - return ERR_PTR(ret); - } - - up_write(&rxrpc_local_sem); - - _net("LOCAL new %d {%d,%u,%pI4+%hu}", - local->debug_id, - local->srx.transport_type, - local->srx.transport.family, - &local->srx.transport.sin.sin_addr, - ntohs(local->srx.transport.sin.sin_port)); - - _leave(" = %p [new]", local); - return local; - -found_local: - rxrpc_get_local(local); - read_unlock_bh(&rxrpc_local_lock); - up_write(&rxrpc_local_sem); - - _net("LOCAL old %d {%d,%u,%pI4+%hu}", - local->debug_id, - local->srx.transport_type, - local->srx.transport.family, - &local->srx.transport.sin.sin_addr, - ntohs(local->srx.transport.sin.sin_port)); - - _leave(" = %p [reuse]", local); - return local; -} - -/* - * release a local endpoint - */ -void rxrpc_put_local(struct rxrpc_local *local) -{ - _enter("%p{u=%d}", local, atomic_read(&local->usage)); - - ASSERTCMP(atomic_read(&local->usage), >, 0); - - /* to prevent a race, the decrement and the dequeue must be effectively - * atomic */ - write_lock_bh(&rxrpc_local_lock); - if (unlikely(atomic_dec_and_test(&local->usage))) { - _debug("destroy local"); - rxrpc_queue_work(&local->destroyer); - } - write_unlock_bh(&rxrpc_local_lock); - _leave(""); -} - -/* - * destroy a local endpoint - */ -static void rxrpc_destroy_local(struct work_struct *work) -{ - struct rxrpc_local *local = - container_of(work, struct rxrpc_local, destroyer); - - _enter("%p{%d}", local, atomic_read(&local->usage)); - - down_write(&rxrpc_local_sem); - - write_lock_bh(&rxrpc_local_lock); - if (atomic_read(&local->usage) > 0) { - write_unlock_bh(&rxrpc_local_lock); - up_read(&rxrpc_local_sem); - _leave(" [resurrected]"); - return; - } - - list_del(&local->link); - local->socket->sk->sk_user_data = NULL; - write_unlock_bh(&rxrpc_local_lock); - - downgrade_write(&rxrpc_local_sem); - - ASSERT(list_empty(&local->services)); - ASSERT(!work_pending(&local->acceptor)); - ASSERT(!work_pending(&local->rejecter)); - ASSERT(!work_pending(&local->event_processor)); - - /* finish cleaning up the local descriptor */ - rxrpc_purge_queue(&local->accept_queue); - rxrpc_purge_queue(&local->reject_queue); - rxrpc_purge_queue(&local->event_queue); - kernel_sock_shutdown(local->socket, SHUT_RDWR); - sock_release(local->socket); - - up_read(&rxrpc_local_sem); - - _net("DESTROY LOCAL %d", local->debug_id); - kfree(local); - - if (list_empty(&rxrpc_locals)) - wake_up_all(&rxrpc_local_wq); - - _leave(""); -} - -/* - * preemptively destroy all local local endpoint rather than waiting for - * them to be destroyed - */ -void __exit rxrpc_destroy_all_locals(void) -{ - DECLARE_WAITQUEUE(myself,current); - - _enter(""); - - /* we simply have to wait for them to go away */ - if (!list_empty(&rxrpc_locals)) { - set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&rxrpc_local_wq, &myself); - - while (!list_empty(&rxrpc_locals)) { - schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); - } - - remove_wait_queue(&rxrpc_local_wq, &myself); - set_current_state(TASK_RUNNING); - } - - _leave(""); -} - -/* - * Reply to a version request - */ -static void rxrpc_send_version_request(struct rxrpc_local *local, - struct rxrpc_host_header *hdr, - struct sk_buff *skb) -{ - struct rxrpc_wire_header whdr; - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct sockaddr_in sin; - struct msghdr msg; - struct kvec iov[2]; - size_t len; - int ret; - - _enter(""); - - sin.sin_family = AF_INET; - sin.sin_port = udp_hdr(skb)->source; - sin.sin_addr.s_addr = ip_hdr(skb)->saddr; - - msg.msg_name = &sin; - msg.msg_namelen = sizeof(sin); - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; - - whdr.epoch = htonl(sp->hdr.epoch); - whdr.cid = htonl(sp->hdr.cid); - whdr.callNumber = htonl(sp->hdr.callNumber); - whdr.seq = 0; - whdr.serial = 0; - whdr.type = RXRPC_PACKET_TYPE_VERSION; - whdr.flags = RXRPC_LAST_PACKET | (~hdr->flags & RXRPC_CLIENT_INITIATED); - whdr.userStatus = 0; - whdr.securityIndex = 0; - whdr._rsvd = 0; - whdr.serviceId = htons(sp->hdr.serviceId); - - iov[0].iov_base = &whdr; - iov[0].iov_len = sizeof(whdr); - iov[1].iov_base = (char *)rxrpc_version_string; - iov[1].iov_len = sizeof(rxrpc_version_string); - - len = iov[0].iov_len + iov[1].iov_len; - - _proto("Tx VERSION (reply)"); - - ret = kernel_sendmsg(local->socket, &msg, iov, 2, len); - if (ret < 0) - _debug("sendmsg failed: %d", ret); - - _leave(""); -} - -/* - * Process event packets targetted at a local endpoint. - */ -static void rxrpc_process_local_events(struct work_struct *work) -{ - struct rxrpc_local *local = container_of(work, struct rxrpc_local, event_processor); - struct sk_buff *skb; - char v; - - _enter(""); - - atomic_inc(&local->usage); - - while ((skb = skb_dequeue(&local->event_queue))) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - _debug("{%d},{%u}", local->debug_id, sp->hdr.type); - - switch (sp->hdr.type) { - case RXRPC_PACKET_TYPE_VERSION: - if (skb_copy_bits(skb, 0, &v, 1) < 0) - return; - _proto("Rx VERSION { %02x }", v); - if (v == 0) - rxrpc_send_version_request(local, &sp->hdr, skb); - break; - - default: - /* Just ignore anything we don't understand */ - break; - } - - rxrpc_put_local(local); - rxrpc_free_skb(skb); - } - - rxrpc_put_local(local); - _leave(""); -} diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c deleted file mode 100644 index 2e3c4064e29c..000000000000 --- a/net/rxrpc/ar-output.c +++ /dev/null @@ -1,724 +0,0 @@ -/* RxRPC packet transmission - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include "ar-internal.h" - -/* - * Time till packet resend (in jiffies). - */ -unsigned int rxrpc_resend_timeout = 4 * HZ; - -static int rxrpc_send_data(struct rxrpc_sock *rx, - struct rxrpc_call *call, - struct msghdr *msg, size_t len); - -/* - * extract control messages from the sendmsg() control buffer - */ -static int rxrpc_sendmsg_cmsg(struct msghdr *msg, - unsigned long *user_call_ID, - enum rxrpc_command *command, - u32 *abort_code) -{ - struct cmsghdr *cmsg; - bool got_user_ID = false; - int len; - - *command = RXRPC_CMD_SEND_DATA; - - if (msg->msg_controllen == 0) - return -EINVAL; - - for_each_cmsghdr(cmsg, msg) { - if (!CMSG_OK(msg, cmsg)) - return -EINVAL; - - len = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); - _debug("CMSG %d, %d, %d", - cmsg->cmsg_level, cmsg->cmsg_type, len); - - if (cmsg->cmsg_level != SOL_RXRPC) - continue; - - switch (cmsg->cmsg_type) { - case RXRPC_USER_CALL_ID: - if (msg->msg_flags & MSG_CMSG_COMPAT) { - if (len != sizeof(u32)) - return -EINVAL; - *user_call_ID = *(u32 *) CMSG_DATA(cmsg); - } else { - if (len != sizeof(unsigned long)) - return -EINVAL; - *user_call_ID = *(unsigned long *) - CMSG_DATA(cmsg); - } - _debug("User Call ID %lx", *user_call_ID); - got_user_ID = true; - break; - - case RXRPC_ABORT: - if (*command != RXRPC_CMD_SEND_DATA) - return -EINVAL; - *command = RXRPC_CMD_SEND_ABORT; - if (len != sizeof(*abort_code)) - return -EINVAL; - *abort_code = *(unsigned int *) CMSG_DATA(cmsg); - _debug("Abort %x", *abort_code); - if (*abort_code == 0) - return -EINVAL; - break; - - case RXRPC_ACCEPT: - if (*command != RXRPC_CMD_SEND_DATA) - return -EINVAL; - *command = RXRPC_CMD_ACCEPT; - if (len != 0) - return -EINVAL; - break; - - default: - return -EINVAL; - } - } - - if (!got_user_ID) - return -EINVAL; - _leave(" = 0"); - return 0; -} - -/* - * abort a call, sending an ABORT packet to the peer - */ -static void rxrpc_send_abort(struct rxrpc_call *call, u32 abort_code) -{ - write_lock_bh(&call->state_lock); - - if (call->state <= RXRPC_CALL_COMPLETE) { - call->state = RXRPC_CALL_LOCALLY_ABORTED; - call->local_abort = abort_code; - set_bit(RXRPC_CALL_EV_ABORT, &call->events); - del_timer_sync(&call->resend_timer); - del_timer_sync(&call->ack_timer); - clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); - clear_bit(RXRPC_CALL_EV_ACK, &call->events); - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - rxrpc_queue_call(call); - } - - write_unlock_bh(&call->state_lock); -} - -/* - * Create a new client call for sendmsg(). - */ -static struct rxrpc_call * -rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, - unsigned long user_call_ID) -{ - struct rxrpc_conn_bundle *bundle; - struct rxrpc_transport *trans; - struct rxrpc_call *call; - struct key *key; - long ret; - - DECLARE_SOCKADDR(struct sockaddr_rxrpc *, srx, msg->msg_name); - - _enter(""); - - if (!msg->msg_name) - return ERR_PTR(-EDESTADDRREQ); - - trans = rxrpc_name_to_transport(rx, msg->msg_name, msg->msg_namelen, 0, - GFP_KERNEL); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } - - key = rx->key; - if (key && !rx->key->payload.data[0]) - key = NULL; - bundle = rxrpc_get_bundle(rx, trans, key, srx->srx_service, GFP_KERNEL); - if (IS_ERR(bundle)) { - ret = PTR_ERR(bundle); - goto out_trans; - } - - call = rxrpc_new_client_call(rx, trans, bundle, user_call_ID, - GFP_KERNEL); - rxrpc_put_bundle(trans, bundle); - rxrpc_put_transport(trans); - if (IS_ERR(call)) { - ret = PTR_ERR(call); - goto out_trans; - } - - _leave(" = %p\n", call); - return call; - -out_trans: - rxrpc_put_transport(trans); -out: - _leave(" = %ld", ret); - return ERR_PTR(ret); -} - -/* - * send a message forming part of a client call through an RxRPC socket - * - caller holds the socket locked - * - the socket may be either a client socket or a server socket - */ -int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) -{ - enum rxrpc_command cmd; - struct rxrpc_call *call; - unsigned long user_call_ID = 0; - u32 abort_code = 0; - int ret; - - _enter(""); - - ret = rxrpc_sendmsg_cmsg(msg, &user_call_ID, &cmd, &abort_code); - if (ret < 0) - return ret; - - if (cmd == RXRPC_CMD_ACCEPT) { - if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) - return -EINVAL; - call = rxrpc_accept_call(rx, user_call_ID); - if (IS_ERR(call)) - return PTR_ERR(call); - rxrpc_put_call(call); - return 0; - } - - call = rxrpc_find_call_by_user_ID(rx, user_call_ID); - if (!call) { - if (cmd != RXRPC_CMD_SEND_DATA) - return -EBADSLT; - call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID); - if (IS_ERR(call)) - return PTR_ERR(call); - } - - _debug("CALL %d USR %lx ST %d on CONN %p", - call->debug_id, call->user_call_ID, call->state, call->conn); - - if (call->state >= RXRPC_CALL_COMPLETE) { - /* it's too late for this call */ - ret = -ECONNRESET; - } else if (cmd == RXRPC_CMD_SEND_ABORT) { - rxrpc_send_abort(call, abort_code); - ret = 0; - } else if (cmd != RXRPC_CMD_SEND_DATA) { - ret = -EINVAL; - } else if (!call->in_clientflag && - call->state != RXRPC_CALL_CLIENT_SEND_REQUEST) { - /* request phase complete for this client call */ - ret = -EPROTO; - } else if (call->in_clientflag && - call->state != RXRPC_CALL_SERVER_ACK_REQUEST && - call->state != RXRPC_CALL_SERVER_SEND_REPLY) { - /* Reply phase not begun or not complete for service call. */ - ret = -EPROTO; - } else { - ret = rxrpc_send_data(rx, call, msg, len); - } - - rxrpc_put_call(call); - _leave(" = %d", ret); - return ret; -} - -/** - * rxrpc_kernel_send_data - Allow a kernel service to send data on a call - * @call: The call to send data through - * @msg: The data to send - * @len: The amount of data to send - * - * Allow a kernel service to send data on a call. The call must be in an state - * appropriate to sending data. No control data should be supplied in @msg, - * nor should an address be supplied. MSG_MORE should be flagged if there's - * more data to come, otherwise this data will end the transmission phase. - */ -int rxrpc_kernel_send_data(struct rxrpc_call *call, struct msghdr *msg, - size_t len) -{ - int ret; - - _enter("{%d,%s},", call->debug_id, rxrpc_call_states[call->state]); - - ASSERTCMP(msg->msg_name, ==, NULL); - ASSERTCMP(msg->msg_control, ==, NULL); - - lock_sock(&call->socket->sk); - - _debug("CALL %d USR %lx ST %d on CONN %p", - call->debug_id, call->user_call_ID, call->state, call->conn); - - if (call->state >= RXRPC_CALL_COMPLETE) { - ret = -ESHUTDOWN; /* it's too late for this call */ - } else if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST && - call->state != RXRPC_CALL_SERVER_ACK_REQUEST && - call->state != RXRPC_CALL_SERVER_SEND_REPLY) { - ret = -EPROTO; /* request phase complete for this client call */ - } else { - ret = rxrpc_send_data(call->socket, call, msg, len); - } - - release_sock(&call->socket->sk); - _leave(" = %d", ret); - return ret; -} - -EXPORT_SYMBOL(rxrpc_kernel_send_data); - -/** - * rxrpc_kernel_abort_call - Allow a kernel service to abort a call - * @call: The call to be aborted - * @abort_code: The abort code to stick into the ABORT packet - * - * Allow a kernel service to abort a call, if it's still in an abortable state. - */ -void rxrpc_kernel_abort_call(struct rxrpc_call *call, u32 abort_code) -{ - _enter("{%d},%d", call->debug_id, abort_code); - - lock_sock(&call->socket->sk); - - _debug("CALL %d USR %lx ST %d on CONN %p", - call->debug_id, call->user_call_ID, call->state, call->conn); - - if (call->state < RXRPC_CALL_COMPLETE) - rxrpc_send_abort(call, abort_code); - - release_sock(&call->socket->sk); - _leave(""); -} - -EXPORT_SYMBOL(rxrpc_kernel_abort_call); - -/* - * send a packet through the transport endpoint - */ -int rxrpc_send_packet(struct rxrpc_transport *trans, struct sk_buff *skb) -{ - struct kvec iov[1]; - struct msghdr msg; - int ret, opt; - - _enter(",{%d}", skb->len); - - iov[0].iov_base = skb->head; - iov[0].iov_len = skb->len; - - msg.msg_name = &trans->peer->srx.transport.sin; - msg.msg_namelen = sizeof(trans->peer->srx.transport.sin); - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; - - /* send the packet with the don't fragment bit set if we currently - * think it's small enough */ - if (skb->len - sizeof(struct rxrpc_wire_header) < trans->peer->maxdata) { - down_read(&trans->local->defrag_sem); - /* send the packet by UDP - * - returns -EMSGSIZE if UDP would have to fragment the packet - * to go out of the interface - * - in which case, we'll have processed the ICMP error - * message and update the peer record - */ - ret = kernel_sendmsg(trans->local->socket, &msg, iov, 1, - iov[0].iov_len); - - up_read(&trans->local->defrag_sem); - if (ret == -EMSGSIZE) - goto send_fragmentable; - - _leave(" = %d [%u]", ret, trans->peer->maxdata); - return ret; - } - -send_fragmentable: - /* attempt to send this message with fragmentation enabled */ - _debug("send fragment"); - - down_write(&trans->local->defrag_sem); - opt = IP_PMTUDISC_DONT; - ret = kernel_setsockopt(trans->local->socket, SOL_IP, IP_MTU_DISCOVER, - (char *) &opt, sizeof(opt)); - if (ret == 0) { - ret = kernel_sendmsg(trans->local->socket, &msg, iov, 1, - iov[0].iov_len); - - opt = IP_PMTUDISC_DO; - kernel_setsockopt(trans->local->socket, SOL_IP, - IP_MTU_DISCOVER, (char *) &opt, sizeof(opt)); - } - - up_write(&trans->local->defrag_sem); - _leave(" = %d [frag %u]", ret, trans->peer->maxdata); - return ret; -} - -/* - * wait for space to appear in the transmit/ACK window - * - caller holds the socket locked - */ -static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, - struct rxrpc_call *call, - long *timeo) -{ - DECLARE_WAITQUEUE(myself, current); - int ret; - - _enter(",{%d},%ld", - CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail), - call->acks_winsz), - *timeo); - - add_wait_queue(&call->tx_waitq, &myself); - - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - ret = 0; - if (CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail), - call->acks_winsz) > 0) - break; - if (signal_pending(current)) { - ret = sock_intr_errno(*timeo); - break; - } - - release_sock(&rx->sk); - *timeo = schedule_timeout(*timeo); - lock_sock(&rx->sk); - } - - remove_wait_queue(&call->tx_waitq, &myself); - set_current_state(TASK_RUNNING); - _leave(" = %d", ret); - return ret; -} - -/* - * attempt to schedule an instant Tx resend - */ -static inline void rxrpc_instant_resend(struct rxrpc_call *call) -{ - read_lock_bh(&call->state_lock); - if (try_to_del_timer_sync(&call->resend_timer) >= 0) { - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) - rxrpc_queue_call(call); - } - read_unlock_bh(&call->state_lock); -} - -/* - * queue a packet for transmission, set the resend timer and attempt - * to send the packet immediately - */ -static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, - bool last) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - int ret; - - _net("queue skb %p [%d]", skb, call->acks_head); - - ASSERT(call->acks_window != NULL); - call->acks_window[call->acks_head] = (unsigned long) skb; - smp_wmb(); - call->acks_head = (call->acks_head + 1) & (call->acks_winsz - 1); - - if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) { - _debug("________awaiting reply/ACK__________"); - write_lock_bh(&call->state_lock); - switch (call->state) { - case RXRPC_CALL_CLIENT_SEND_REQUEST: - call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY; - break; - case RXRPC_CALL_SERVER_ACK_REQUEST: - call->state = RXRPC_CALL_SERVER_SEND_REPLY; - if (!last) - break; - case RXRPC_CALL_SERVER_SEND_REPLY: - call->state = RXRPC_CALL_SERVER_AWAIT_ACK; - break; - default: - break; - } - write_unlock_bh(&call->state_lock); - } - - _proto("Tx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); - - sp->need_resend = false; - sp->resend_at = jiffies + rxrpc_resend_timeout; - if (!test_and_set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags)) { - _debug("run timer"); - call->resend_timer.expires = sp->resend_at; - add_timer(&call->resend_timer); - } - - /* attempt to cancel the rx-ACK timer, deferring reply transmission if - * we're ACK'ing the request phase of an incoming call */ - ret = -EAGAIN; - if (try_to_del_timer_sync(&call->ack_timer) >= 0) { - /* the packet may be freed by rxrpc_process_call() before this - * returns */ - ret = rxrpc_send_packet(call->conn->trans, skb); - _net("sent skb %p", skb); - } else { - _debug("failed to delete ACK timer"); - } - - if (ret < 0) { - _debug("need instant resend %d", ret); - sp->need_resend = true; - rxrpc_instant_resend(call); - } - - _leave(""); -} - -/* - * Convert a host-endian header into a network-endian header. - */ -static void rxrpc_insert_header(struct sk_buff *skb) -{ - struct rxrpc_wire_header whdr; - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - whdr.epoch = htonl(sp->hdr.epoch); - whdr.cid = htonl(sp->hdr.cid); - whdr.callNumber = htonl(sp->hdr.callNumber); - whdr.seq = htonl(sp->hdr.seq); - whdr.serial = htonl(sp->hdr.serial); - whdr.type = sp->hdr.type; - whdr.flags = sp->hdr.flags; - whdr.userStatus = sp->hdr.userStatus; - whdr.securityIndex = sp->hdr.securityIndex; - whdr._rsvd = htons(sp->hdr._rsvd); - whdr.serviceId = htons(sp->hdr.serviceId); - - memcpy(skb->head, &whdr, sizeof(whdr)); -} - -/* - * send data through a socket - * - must be called in process context - * - caller holds the socket locked - */ -static int rxrpc_send_data(struct rxrpc_sock *rx, - struct rxrpc_call *call, - struct msghdr *msg, size_t len) -{ - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; - struct sock *sk = &rx->sk; - long timeo; - bool more; - int ret, copied; - - timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); - - /* this should be in poll */ - sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); - - if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) - return -EPIPE; - - more = msg->msg_flags & MSG_MORE; - - skb = call->tx_pending; - call->tx_pending = NULL; - - copied = 0; - do { - if (!skb) { - size_t size, chunk, max, space; - - _debug("alloc"); - - if (CIRC_SPACE(call->acks_head, - ACCESS_ONCE(call->acks_tail), - call->acks_winsz) <= 0) { - ret = -EAGAIN; - if (msg->msg_flags & MSG_DONTWAIT) - goto maybe_error; - ret = rxrpc_wait_for_tx_window(rx, call, - &timeo); - if (ret < 0) - goto maybe_error; - } - - max = call->conn->trans->peer->maxdata; - max -= call->conn->security_size; - max &= ~(call->conn->size_align - 1UL); - - chunk = max; - if (chunk > msg_data_left(msg) && !more) - chunk = msg_data_left(msg); - - space = chunk + call->conn->size_align; - space &= ~(call->conn->size_align - 1UL); - - size = space + call->conn->header_size; - - _debug("SIZE: %zu/%zu/%zu", chunk, space, size); - - /* create a buffer that we can retain until it's ACK'd */ - skb = sock_alloc_send_skb( - sk, size, msg->msg_flags & MSG_DONTWAIT, &ret); - if (!skb) - goto maybe_error; - - rxrpc_new_skb(skb); - - _debug("ALLOC SEND %p", skb); - - ASSERTCMP(skb->mark, ==, 0); - - _debug("HS: %u", call->conn->header_size); - skb_reserve(skb, call->conn->header_size); - skb->len += call->conn->header_size; - - sp = rxrpc_skb(skb); - sp->remain = chunk; - if (sp->remain > skb_tailroom(skb)) - sp->remain = skb_tailroom(skb); - - _net("skb: hr %d, tr %d, hl %d, rm %d", - skb_headroom(skb), - skb_tailroom(skb), - skb_headlen(skb), - sp->remain); - - skb->ip_summed = CHECKSUM_UNNECESSARY; - } - - _debug("append"); - sp = rxrpc_skb(skb); - - /* append next segment of data to the current buffer */ - if (msg_data_left(msg) > 0) { - int copy = skb_tailroom(skb); - ASSERTCMP(copy, >, 0); - if (copy > msg_data_left(msg)) - copy = msg_data_left(msg); - if (copy > sp->remain) - copy = sp->remain; - - _debug("add"); - ret = skb_add_data(skb, &msg->msg_iter, copy); - _debug("added"); - if (ret < 0) - goto efault; - sp->remain -= copy; - skb->mark += copy; - copied += copy; - } - - /* check for the far side aborting the call or a network error - * occurring */ - if (call->state > RXRPC_CALL_COMPLETE) - goto call_aborted; - - /* add the packet to the send queue if it's now full */ - if (sp->remain <= 0 || - (msg_data_left(msg) == 0 && !more)) { - struct rxrpc_connection *conn = call->conn; - uint32_t seq; - size_t pad; - - /* pad out if we're using security */ - if (conn->security_ix) { - pad = conn->security_size + skb->mark; - pad = conn->size_align - pad; - pad &= conn->size_align - 1; - _debug("pad %zu", pad); - if (pad) - memset(skb_put(skb, pad), 0, pad); - } - - seq = atomic_inc_return(&call->sequence); - - sp->hdr.epoch = conn->epoch; - sp->hdr.cid = call->cid; - sp->hdr.callNumber = call->call_id; - sp->hdr.seq = seq; - sp->hdr.serial = atomic_inc_return(&conn->serial); - sp->hdr.type = RXRPC_PACKET_TYPE_DATA; - sp->hdr.userStatus = 0; - sp->hdr.securityIndex = conn->security_ix; - sp->hdr._rsvd = 0; - sp->hdr.serviceId = call->service_id; - - sp->hdr.flags = conn->out_clientflag; - if (msg_data_left(msg) == 0 && !more) - sp->hdr.flags |= RXRPC_LAST_PACKET; - else if (CIRC_SPACE(call->acks_head, - ACCESS_ONCE(call->acks_tail), - call->acks_winsz) > 1) - sp->hdr.flags |= RXRPC_MORE_PACKETS; - if (more && seq & 1) - sp->hdr.flags |= RXRPC_REQUEST_ACK; - - ret = conn->security->secure_packet( - call, skb, skb->mark, - skb->head + sizeof(struct rxrpc_wire_header)); - if (ret < 0) - goto out; - - rxrpc_insert_header(skb); - rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more); - skb = NULL; - } - } while (msg_data_left(msg) > 0); - -success: - ret = copied; -out: - call->tx_pending = skb; - _leave(" = %d", ret); - return ret; - -call_aborted: - rxrpc_free_skb(skb); - if (call->state == RXRPC_CALL_NETWORK_ERROR) - ret = call->conn->trans->peer->net_error; - else - ret = -ECONNABORTED; - _leave(" = %d", ret); - return ret; - -maybe_error: - if (copied) - goto success; - goto out; - -efault: - ret = -EFAULT; - goto out; -} diff --git a/net/rxrpc/ar-peer.c b/net/rxrpc/ar-peer.c deleted file mode 100644 index 0b54cda3d8e5..000000000000 --- a/net/rxrpc/ar-peer.c +++ /dev/null @@ -1,305 +0,0 @@ -/* RxRPC remote transport endpoint management - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "ar-internal.h" - -static LIST_HEAD(rxrpc_peers); -static DEFINE_RWLOCK(rxrpc_peer_lock); -static DECLARE_WAIT_QUEUE_HEAD(rxrpc_peer_wq); - -static void rxrpc_destroy_peer(struct work_struct *work); - -/* - * assess the MTU size for the network interface through which this peer is - * reached - */ -static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer) -{ - struct rtable *rt; - struct flowi4 fl4; - - peer->if_mtu = 1500; - - rt = ip_route_output_ports(&init_net, &fl4, NULL, - peer->srx.transport.sin.sin_addr.s_addr, 0, - htons(7000), htons(7001), - IPPROTO_UDP, 0, 0); - if (IS_ERR(rt)) { - _leave(" [route err %ld]", PTR_ERR(rt)); - return; - } - - peer->if_mtu = dst_mtu(&rt->dst); - dst_release(&rt->dst); - - _leave(" [if_mtu %u]", peer->if_mtu); -} - -/* - * allocate a new peer - */ -static struct rxrpc_peer *rxrpc_alloc_peer(struct sockaddr_rxrpc *srx, - gfp_t gfp) -{ - struct rxrpc_peer *peer; - - _enter(""); - - peer = kzalloc(sizeof(struct rxrpc_peer), gfp); - if (peer) { - INIT_WORK(&peer->destroyer, &rxrpc_destroy_peer); - INIT_LIST_HEAD(&peer->link); - INIT_LIST_HEAD(&peer->error_targets); - spin_lock_init(&peer->lock); - atomic_set(&peer->usage, 1); - peer->debug_id = atomic_inc_return(&rxrpc_debug_id); - memcpy(&peer->srx, srx, sizeof(*srx)); - - rxrpc_assess_MTU_size(peer); - peer->mtu = peer->if_mtu; - - if (srx->transport.family == AF_INET) { - peer->hdrsize = sizeof(struct iphdr); - switch (srx->transport_type) { - case SOCK_DGRAM: - peer->hdrsize += sizeof(struct udphdr); - break; - default: - BUG(); - break; - } - } else { - BUG(); - } - - peer->hdrsize += sizeof(struct rxrpc_wire_header); - peer->maxdata = peer->mtu - peer->hdrsize; - } - - _leave(" = %p", peer); - return peer; -} - -/* - * obtain a remote transport endpoint for the specified address - */ -struct rxrpc_peer *rxrpc_get_peer(struct sockaddr_rxrpc *srx, gfp_t gfp) -{ - struct rxrpc_peer *peer, *candidate; - const char *new = "old"; - int usage; - - _enter("{%d,%d,%pI4+%hu}", - srx->transport_type, - srx->transport_len, - &srx->transport.sin.sin_addr, - ntohs(srx->transport.sin.sin_port)); - - /* search the peer list first */ - read_lock_bh(&rxrpc_peer_lock); - list_for_each_entry(peer, &rxrpc_peers, link) { - _debug("check PEER %d { u=%d t=%d l=%d }", - peer->debug_id, - atomic_read(&peer->usage), - peer->srx.transport_type, - peer->srx.transport_len); - - if (atomic_read(&peer->usage) > 0 && - peer->srx.transport_type == srx->transport_type && - peer->srx.transport_len == srx->transport_len && - memcmp(&peer->srx.transport, - &srx->transport, - srx->transport_len) == 0) - goto found_extant_peer; - } - read_unlock_bh(&rxrpc_peer_lock); - - /* not yet present - create a candidate for a new record and then - * redo the search */ - candidate = rxrpc_alloc_peer(srx, gfp); - if (!candidate) { - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); - } - - write_lock_bh(&rxrpc_peer_lock); - - list_for_each_entry(peer, &rxrpc_peers, link) { - if (atomic_read(&peer->usage) > 0 && - peer->srx.transport_type == srx->transport_type && - peer->srx.transport_len == srx->transport_len && - memcmp(&peer->srx.transport, - &srx->transport, - srx->transport_len) == 0) - goto found_extant_second; - } - - /* we can now add the new candidate to the list */ - peer = candidate; - candidate = NULL; - usage = atomic_read(&peer->usage); - - list_add_tail(&peer->link, &rxrpc_peers); - write_unlock_bh(&rxrpc_peer_lock); - new = "new"; - -success: - _net("PEER %s %d {%d,%u,%pI4+%hu}", - new, - peer->debug_id, - peer->srx.transport_type, - peer->srx.transport.family, - &peer->srx.transport.sin.sin_addr, - ntohs(peer->srx.transport.sin.sin_port)); - - _leave(" = %p {u=%d}", peer, usage); - return peer; - - /* we found the peer in the list immediately */ -found_extant_peer: - usage = atomic_inc_return(&peer->usage); - read_unlock_bh(&rxrpc_peer_lock); - goto success; - - /* we found the peer on the second time through the list */ -found_extant_second: - usage = atomic_inc_return(&peer->usage); - write_unlock_bh(&rxrpc_peer_lock); - kfree(candidate); - goto success; -} - -/* - * find the peer associated with a packet - */ -struct rxrpc_peer *rxrpc_find_peer(struct rxrpc_local *local, - __be32 addr, __be16 port) -{ - struct rxrpc_peer *peer; - - _enter(""); - - /* search the peer list */ - read_lock_bh(&rxrpc_peer_lock); - - if (local->srx.transport.family == AF_INET && - local->srx.transport_type == SOCK_DGRAM - ) { - list_for_each_entry(peer, &rxrpc_peers, link) { - if (atomic_read(&peer->usage) > 0 && - peer->srx.transport_type == SOCK_DGRAM && - peer->srx.transport.family == AF_INET && - peer->srx.transport.sin.sin_port == port && - peer->srx.transport.sin.sin_addr.s_addr == addr) - goto found_UDP_peer; - } - - goto new_UDP_peer; - } - - read_unlock_bh(&rxrpc_peer_lock); - _leave(" = -EAFNOSUPPORT"); - return ERR_PTR(-EAFNOSUPPORT); - -found_UDP_peer: - _net("Rx UDP DGRAM from peer %d", peer->debug_id); - atomic_inc(&peer->usage); - read_unlock_bh(&rxrpc_peer_lock); - _leave(" = %p", peer); - return peer; - -new_UDP_peer: - _net("Rx UDP DGRAM from NEW peer"); - read_unlock_bh(&rxrpc_peer_lock); - _leave(" = -EBUSY [new]"); - return ERR_PTR(-EBUSY); -} - -/* - * release a remote transport endpoint - */ -void rxrpc_put_peer(struct rxrpc_peer *peer) -{ - _enter("%p{u=%d}", peer, atomic_read(&peer->usage)); - - ASSERTCMP(atomic_read(&peer->usage), >, 0); - - if (likely(!atomic_dec_and_test(&peer->usage))) { - _leave(" [in use]"); - return; - } - - rxrpc_queue_work(&peer->destroyer); - _leave(""); -} - -/* - * destroy a remote transport endpoint - */ -static void rxrpc_destroy_peer(struct work_struct *work) -{ - struct rxrpc_peer *peer = - container_of(work, struct rxrpc_peer, destroyer); - - _enter("%p{%d}", peer, atomic_read(&peer->usage)); - - write_lock_bh(&rxrpc_peer_lock); - list_del(&peer->link); - write_unlock_bh(&rxrpc_peer_lock); - - _net("DESTROY PEER %d", peer->debug_id); - kfree(peer); - - if (list_empty(&rxrpc_peers)) - wake_up_all(&rxrpc_peer_wq); - _leave(""); -} - -/* - * preemptively destroy all the peer records from a transport endpoint rather - * than waiting for them to time out - */ -void __exit rxrpc_destroy_all_peers(void) -{ - DECLARE_WAITQUEUE(myself,current); - - _enter(""); - - /* we simply have to wait for them to go away */ - if (!list_empty(&rxrpc_peers)) { - set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&rxrpc_peer_wq, &myself); - - while (!list_empty(&rxrpc_peers)) { - schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); - } - - remove_wait_queue(&rxrpc_peer_wq, &myself); - set_current_state(TASK_RUNNING); - } - - _leave(""); -} diff --git a/net/rxrpc/ar-proc.c b/net/rxrpc/ar-proc.c deleted file mode 100644 index 225163bc658d..000000000000 --- a/net/rxrpc/ar-proc.c +++ /dev/null @@ -1,192 +0,0 @@ -/* /proc/net/ support for AF_RXRPC - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include "ar-internal.h" - -static const char *const rxrpc_conn_states[] = { - [RXRPC_CONN_UNUSED] = "Unused ", - [RXRPC_CONN_CLIENT] = "Client ", - [RXRPC_CONN_SERVER_UNSECURED] = "SvUnsec ", - [RXRPC_CONN_SERVER_CHALLENGING] = "SvChall ", - [RXRPC_CONN_SERVER] = "SvSecure", - [RXRPC_CONN_REMOTELY_ABORTED] = "RmtAbort", - [RXRPC_CONN_LOCALLY_ABORTED] = "LocAbort", - [RXRPC_CONN_NETWORK_ERROR] = "NetError", -}; - -/* - * generate a list of extant and dead calls in /proc/net/rxrpc_calls - */ -static void *rxrpc_call_seq_start(struct seq_file *seq, loff_t *_pos) -{ - read_lock(&rxrpc_call_lock); - return seq_list_start_head(&rxrpc_calls, *_pos); -} - -static void *rxrpc_call_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - return seq_list_next(v, &rxrpc_calls, pos); -} - -static void rxrpc_call_seq_stop(struct seq_file *seq, void *v) -{ - read_unlock(&rxrpc_call_lock); -} - -static int rxrpc_call_seq_show(struct seq_file *seq, void *v) -{ - struct rxrpc_transport *trans; - struct rxrpc_call *call; - char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1]; - - if (v == &rxrpc_calls) { - seq_puts(seq, - "Proto Local Remote " - " SvID ConnID CallID End Use State Abort " - " UserID\n"); - return 0; - } - - call = list_entry(v, struct rxrpc_call, link); - trans = call->conn->trans; - - sprintf(lbuff, "%pI4:%u", - &trans->local->srx.transport.sin.sin_addr, - ntohs(trans->local->srx.transport.sin.sin_port)); - - sprintf(rbuff, "%pI4:%u", - &trans->peer->srx.transport.sin.sin_addr, - ntohs(trans->peer->srx.transport.sin.sin_port)); - - seq_printf(seq, - "UDP %-22.22s %-22.22s %4x %08x %08x %s %3u" - " %-8.8s %08x %lx\n", - lbuff, - rbuff, - call->conn->service_id, - call->cid, - call->call_id, - call->conn->in_clientflag ? "Svc" : "Clt", - atomic_read(&call->usage), - rxrpc_call_states[call->state], - call->remote_abort ?: call->local_abort, - call->user_call_ID); - - return 0; -} - -static const struct seq_operations rxrpc_call_seq_ops = { - .start = rxrpc_call_seq_start, - .next = rxrpc_call_seq_next, - .stop = rxrpc_call_seq_stop, - .show = rxrpc_call_seq_show, -}; - -static int rxrpc_call_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &rxrpc_call_seq_ops); -} - -const struct file_operations rxrpc_call_seq_fops = { - .owner = THIS_MODULE, - .open = rxrpc_call_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -/* - * generate a list of extant virtual connections in /proc/net/rxrpc_conns - */ -static void *rxrpc_connection_seq_start(struct seq_file *seq, loff_t *_pos) -{ - read_lock(&rxrpc_connection_lock); - return seq_list_start_head(&rxrpc_connections, *_pos); -} - -static void *rxrpc_connection_seq_next(struct seq_file *seq, void *v, - loff_t *pos) -{ - return seq_list_next(v, &rxrpc_connections, pos); -} - -static void rxrpc_connection_seq_stop(struct seq_file *seq, void *v) -{ - read_unlock(&rxrpc_connection_lock); -} - -static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) -{ - struct rxrpc_connection *conn; - struct rxrpc_transport *trans; - char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1]; - - if (v == &rxrpc_connections) { - seq_puts(seq, - "Proto Local Remote " - " SvID ConnID Calls End Use State Key " - " Serial ISerial\n" - ); - return 0; - } - - conn = list_entry(v, struct rxrpc_connection, link); - trans = conn->trans; - - sprintf(lbuff, "%pI4:%u", - &trans->local->srx.transport.sin.sin_addr, - ntohs(trans->local->srx.transport.sin.sin_port)); - - sprintf(rbuff, "%pI4:%u", - &trans->peer->srx.transport.sin.sin_addr, - ntohs(trans->peer->srx.transport.sin.sin_port)); - - seq_printf(seq, - "UDP %-22.22s %-22.22s %4x %08x %08x %s %3u" - " %s %08x %08x %08x\n", - lbuff, - rbuff, - conn->service_id, - conn->cid, - conn->call_counter, - conn->in_clientflag ? "Svc" : "Clt", - atomic_read(&conn->usage), - rxrpc_conn_states[conn->state], - key_serial(conn->key), - atomic_read(&conn->serial), - atomic_read(&conn->hi_serial)); - - return 0; -} - -static const struct seq_operations rxrpc_connection_seq_ops = { - .start = rxrpc_connection_seq_start, - .next = rxrpc_connection_seq_next, - .stop = rxrpc_connection_seq_stop, - .show = rxrpc_connection_seq_show, -}; - - -static int rxrpc_connection_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &rxrpc_connection_seq_ops); -} - -const struct file_operations rxrpc_connection_seq_fops = { - .owner = THIS_MODULE, - .open = rxrpc_connection_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c deleted file mode 100644 index 59706b9f2f7a..000000000000 --- a/net/rxrpc/ar-recvmsg.c +++ /dev/null @@ -1,436 +0,0 @@ -/* RxRPC recvmsg() implementation - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include "ar-internal.h" - -/* - * removal a call's user ID from the socket tree to make the user ID available - * again and so that it won't be seen again in association with that call - */ -void rxrpc_remove_user_ID(struct rxrpc_sock *rx, struct rxrpc_call *call) -{ - _debug("RELEASE CALL %d", call->debug_id); - - if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { - write_lock_bh(&rx->call_lock); - rb_erase(&call->sock_node, &call->socket->calls); - clear_bit(RXRPC_CALL_HAS_USERID, &call->flags); - write_unlock_bh(&rx->call_lock); - } - - read_lock_bh(&call->state_lock); - if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) - rxrpc_queue_call(call); - read_unlock_bh(&call->state_lock); -} - -/* - * receive a message from an RxRPC socket - * - we need to be careful about two or more threads calling recvmsg - * simultaneously - */ -int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, - int flags) -{ - struct rxrpc_skb_priv *sp; - struct rxrpc_call *call = NULL, *continue_call = NULL; - struct rxrpc_sock *rx = rxrpc_sk(sock->sk); - struct sk_buff *skb; - long timeo; - int copy, ret, ullen, offset, copied = 0; - u32 abort_code; - - DEFINE_WAIT(wait); - - _enter(",,,%zu,%d", len, flags); - - if (flags & (MSG_OOB | MSG_TRUNC)) - return -EOPNOTSUPP; - - ullen = msg->msg_flags & MSG_CMSG_COMPAT ? 4 : sizeof(unsigned long); - - timeo = sock_rcvtimeo(&rx->sk, flags & MSG_DONTWAIT); - msg->msg_flags |= MSG_MORE; - - lock_sock(&rx->sk); - - for (;;) { - /* return immediately if a client socket has no outstanding - * calls */ - if (RB_EMPTY_ROOT(&rx->calls)) { - if (copied) - goto out; - if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) { - release_sock(&rx->sk); - if (continue_call) - rxrpc_put_call(continue_call); - return -ENODATA; - } - } - - /* get the next message on the Rx queue */ - skb = skb_peek(&rx->sk.sk_receive_queue); - if (!skb) { - /* nothing remains on the queue */ - if (copied && - (flags & MSG_PEEK || timeo == 0)) - goto out; - - /* wait for a message to turn up */ - release_sock(&rx->sk); - prepare_to_wait_exclusive(sk_sleep(&rx->sk), &wait, - TASK_INTERRUPTIBLE); - ret = sock_error(&rx->sk); - if (ret) - goto wait_error; - - if (skb_queue_empty(&rx->sk.sk_receive_queue)) { - if (signal_pending(current)) - goto wait_interrupted; - timeo = schedule_timeout(timeo); - } - finish_wait(sk_sleep(&rx->sk), &wait); - lock_sock(&rx->sk); - continue; - } - - peek_next_packet: - sp = rxrpc_skb(skb); - call = sp->call; - ASSERT(call != NULL); - - _debug("next pkt %s", rxrpc_pkts[sp->hdr.type]); - - /* make sure we wait for the state to be updated in this call */ - spin_lock_bh(&call->lock); - spin_unlock_bh(&call->lock); - - if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) { - _debug("packet from released call"); - if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) - BUG(); - rxrpc_free_skb(skb); - continue; - } - - /* determine whether to continue last data receive */ - if (continue_call) { - _debug("maybe cont"); - if (call != continue_call || - skb->mark != RXRPC_SKB_MARK_DATA) { - release_sock(&rx->sk); - rxrpc_put_call(continue_call); - _leave(" = %d [noncont]", copied); - return copied; - } - } - - rxrpc_get_call(call); - - /* copy the peer address and timestamp */ - if (!continue_call) { - if (msg->msg_name) { - size_t len = - sizeof(call->conn->trans->peer->srx); - memcpy(msg->msg_name, - &call->conn->trans->peer->srx, len); - msg->msg_namelen = len; - } - sock_recv_timestamp(msg, &rx->sk, skb); - } - - /* receive the message */ - if (skb->mark != RXRPC_SKB_MARK_DATA) - goto receive_non_data_message; - - _debug("recvmsg DATA #%u { %d, %d }", - sp->hdr.seq, skb->len, sp->offset); - - if (!continue_call) { - /* only set the control data once per recvmsg() */ - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, - ullen, &call->user_call_ID); - if (ret < 0) - goto copy_error; - ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); - } - - ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv); - ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1); - call->rx_data_recv = sp->hdr.seq; - - ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten); - - offset = sp->offset; - copy = skb->len - offset; - if (copy > len - copied) - copy = len - copied; - - ret = skb_copy_datagram_msg(skb, offset, msg, copy); - - if (ret < 0) - goto copy_error; - - /* handle piecemeal consumption of data packets */ - _debug("copied %d+%d", copy, copied); - - offset += copy; - copied += copy; - - if (!(flags & MSG_PEEK)) - sp->offset = offset; - - if (sp->offset < skb->len) { - _debug("buffer full"); - ASSERTCMP(copied, ==, len); - break; - } - - /* we transferred the whole data packet */ - if (sp->hdr.flags & RXRPC_LAST_PACKET) { - _debug("last"); - if (call->conn->out_clientflag) { - /* last byte of reply received */ - ret = copied; - goto terminal_message; - } - - /* last bit of request received */ - if (!(flags & MSG_PEEK)) { - _debug("eat packet"); - if (skb_dequeue(&rx->sk.sk_receive_queue) != - skb) - BUG(); - rxrpc_free_skb(skb); - } - msg->msg_flags &= ~MSG_MORE; - break; - } - - /* move on to the next data message */ - _debug("next"); - if (!continue_call) - continue_call = sp->call; - else - rxrpc_put_call(call); - call = NULL; - - if (flags & MSG_PEEK) { - _debug("peek next"); - skb = skb->next; - if (skb == (struct sk_buff *) &rx->sk.sk_receive_queue) - break; - goto peek_next_packet; - } - - _debug("eat packet"); - if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) - BUG(); - rxrpc_free_skb(skb); - } - - /* end of non-terminal data packet reception for the moment */ - _debug("end rcv data"); -out: - release_sock(&rx->sk); - if (call) - rxrpc_put_call(call); - if (continue_call) - rxrpc_put_call(continue_call); - _leave(" = %d [data]", copied); - return copied; - - /* handle non-DATA messages such as aborts, incoming connections and - * final ACKs */ -receive_non_data_message: - _debug("non-data"); - - if (skb->mark == RXRPC_SKB_MARK_NEW_CALL) { - _debug("RECV NEW CALL"); - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NEW_CALL, 0, &abort_code); - if (ret < 0) - goto copy_error; - if (!(flags & MSG_PEEK)) { - if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) - BUG(); - rxrpc_free_skb(skb); - } - goto out; - } - - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, - ullen, &call->user_call_ID); - if (ret < 0) - goto copy_error; - ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); - - switch (skb->mark) { - case RXRPC_SKB_MARK_DATA: - BUG(); - case RXRPC_SKB_MARK_FINAL_ACK: - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ACK, 0, &abort_code); - break; - case RXRPC_SKB_MARK_BUSY: - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_BUSY, 0, &abort_code); - break; - case RXRPC_SKB_MARK_REMOTE_ABORT: - abort_code = call->remote_abort; - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code); - break; - case RXRPC_SKB_MARK_LOCAL_ABORT: - abort_code = call->local_abort; - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code); - break; - case RXRPC_SKB_MARK_NET_ERROR: - _debug("RECV NET ERROR %d", sp->error); - abort_code = sp->error; - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &abort_code); - break; - case RXRPC_SKB_MARK_LOCAL_ERROR: - _debug("RECV LOCAL ERROR %d", sp->error); - abort_code = sp->error; - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4, - &abort_code); - break; - default: - pr_err("Unknown packet mark %u\n", skb->mark); - BUG(); - break; - } - - if (ret < 0) - goto copy_error; - -terminal_message: - _debug("terminal"); - msg->msg_flags &= ~MSG_MORE; - msg->msg_flags |= MSG_EOR; - - if (!(flags & MSG_PEEK)) { - _net("free terminal skb %p", skb); - if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) - BUG(); - rxrpc_free_skb(skb); - rxrpc_remove_user_ID(rx, call); - } - - release_sock(&rx->sk); - rxrpc_put_call(call); - if (continue_call) - rxrpc_put_call(continue_call); - _leave(" = %d", ret); - return ret; - -copy_error: - _debug("copy error"); - release_sock(&rx->sk); - rxrpc_put_call(call); - if (continue_call) - rxrpc_put_call(continue_call); - _leave(" = %d", ret); - return ret; - -wait_interrupted: - ret = sock_intr_errno(timeo); -wait_error: - finish_wait(sk_sleep(&rx->sk), &wait); - if (continue_call) - rxrpc_put_call(continue_call); - if (copied) - copied = ret; - _leave(" = %d [waitfail %d]", copied, ret); - return copied; - -} - -/** - * rxrpc_kernel_data_delivered - Record delivery of data message - * @skb: Message holding data - * - * Record the delivery of a data message. This permits RxRPC to keep its - * tracking correct. The socket buffer will be deleted. - */ -void rxrpc_kernel_data_delivered(struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_call *call = sp->call; - - ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv); - ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1); - call->rx_data_recv = sp->hdr.seq; - - ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten); - rxrpc_free_skb(skb); -} - -EXPORT_SYMBOL(rxrpc_kernel_data_delivered); - -/** - * rxrpc_kernel_is_data_last - Determine if data message is last one - * @skb: Message holding data - * - * Determine if data message is last one for the parent call. - */ -bool rxrpc_kernel_is_data_last(struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - ASSERTCMP(skb->mark, ==, RXRPC_SKB_MARK_DATA); - - return sp->hdr.flags & RXRPC_LAST_PACKET; -} - -EXPORT_SYMBOL(rxrpc_kernel_is_data_last); - -/** - * rxrpc_kernel_get_abort_code - Get the abort code from an RxRPC abort message - * @skb: Message indicating an abort - * - * Get the abort code from an RxRPC abort message. - */ -u32 rxrpc_kernel_get_abort_code(struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - switch (skb->mark) { - case RXRPC_SKB_MARK_REMOTE_ABORT: - return sp->call->remote_abort; - case RXRPC_SKB_MARK_LOCAL_ABORT: - return sp->call->local_abort; - default: - BUG(); - } -} - -EXPORT_SYMBOL(rxrpc_kernel_get_abort_code); - -/** - * rxrpc_kernel_get_error - Get the error number from an RxRPC error message - * @skb: Message indicating an error - * - * Get the error number from an RxRPC error message. - */ -int rxrpc_kernel_get_error_number(struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - return sp->error; -} - -EXPORT_SYMBOL(rxrpc_kernel_get_error_number); diff --git a/net/rxrpc/ar-security.c b/net/rxrpc/ar-security.c deleted file mode 100644 index d223253b22fa..000000000000 --- a/net/rxrpc/ar-security.c +++ /dev/null @@ -1,168 +0,0 @@ -/* RxRPC security handling - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "ar-internal.h" - -static LIST_HEAD(rxrpc_security_methods); -static DECLARE_RWSEM(rxrpc_security_sem); - -static const struct rxrpc_security *rxrpc_security_types[] = { - [RXRPC_SECURITY_NONE] = &rxrpc_no_security, -#ifdef CONFIG_RXKAD - [RXRPC_SECURITY_RXKAD] = &rxkad, -#endif -}; - -int __init rxrpc_init_security(void) -{ - int i, ret; - - for (i = 0; i < ARRAY_SIZE(rxrpc_security_types); i++) { - if (rxrpc_security_types[i]) { - ret = rxrpc_security_types[i]->init(); - if (ret < 0) - goto failed; - } - } - - return 0; - -failed: - for (i--; i >= 0; i--) - if (rxrpc_security_types[i]) - rxrpc_security_types[i]->exit(); - return ret; -} - -void rxrpc_exit_security(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(rxrpc_security_types); i++) - if (rxrpc_security_types[i]) - rxrpc_security_types[i]->exit(); -} - -/* - * look up an rxrpc security module - */ -static const struct rxrpc_security *rxrpc_security_lookup(u8 security_index) -{ - if (security_index >= ARRAY_SIZE(rxrpc_security_types)) - return NULL; - return rxrpc_security_types[security_index]; -} - -/* - * initialise the security on a client connection - */ -int rxrpc_init_client_conn_security(struct rxrpc_connection *conn) -{ - const struct rxrpc_security *sec; - struct rxrpc_key_token *token; - struct key *key = conn->key; - int ret; - - _enter("{%d},{%x}", conn->debug_id, key_serial(key)); - - if (!key) - return 0; - - ret = key_validate(key); - if (ret < 0) - return ret; - - token = key->payload.data[0]; - if (!token) - return -EKEYREJECTED; - - sec = rxrpc_security_lookup(token->security_index); - if (!sec) - return -EKEYREJECTED; - conn->security = sec; - - ret = conn->security->init_connection_security(conn); - if (ret < 0) { - conn->security = &rxrpc_no_security; - return ret; - } - - _leave(" = 0"); - return 0; -} - -/* - * initialise the security on a server connection - */ -int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) -{ - const struct rxrpc_security *sec; - struct rxrpc_local *local = conn->trans->local; - struct rxrpc_sock *rx; - struct key *key; - key_ref_t kref; - char kdesc[5 + 1 + 3 + 1]; - - _enter(""); - - sprintf(kdesc, "%u:%u", conn->service_id, conn->security_ix); - - sec = rxrpc_security_lookup(conn->security_ix); - if (!sec) { - _leave(" = -ENOKEY [lookup]"); - return -ENOKEY; - } - - /* find the service */ - read_lock_bh(&local->services_lock); - list_for_each_entry(rx, &local->services, listen_link) { - if (rx->srx.srx_service == conn->service_id) - goto found_service; - } - - /* the service appears to have died */ - read_unlock_bh(&local->services_lock); - _leave(" = -ENOENT"); - return -ENOENT; - -found_service: - if (!rx->securities) { - read_unlock_bh(&local->services_lock); - _leave(" = -ENOKEY"); - return -ENOKEY; - } - - /* look through the service's keyring */ - kref = keyring_search(make_key_ref(rx->securities, 1UL), - &key_type_rxrpc_s, kdesc); - if (IS_ERR(kref)) { - read_unlock_bh(&local->services_lock); - _leave(" = %ld [search]", PTR_ERR(kref)); - return PTR_ERR(kref); - } - - key = key_ref_to_ptr(kref); - read_unlock_bh(&local->services_lock); - - conn->server_key = key; - conn->security = sec; - - _leave(" = 0"); - return 0; -} diff --git a/net/rxrpc/ar-skbuff.c b/net/rxrpc/ar-skbuff.c deleted file mode 100644 index eee0cfd9ac8c..000000000000 --- a/net/rxrpc/ar-skbuff.c +++ /dev/null @@ -1,138 +0,0 @@ -/* ar-skbuff.c: socket buffer destruction handling - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include "ar-internal.h" - -/* - * set up for the ACK at the end of the receive phase when we discard the final - * receive phase data packet - * - called with softirqs disabled - */ -static void rxrpc_request_final_ACK(struct rxrpc_call *call) -{ - /* the call may be aborted before we have a chance to ACK it */ - write_lock(&call->state_lock); - - switch (call->state) { - case RXRPC_CALL_CLIENT_RECV_REPLY: - call->state = RXRPC_CALL_CLIENT_FINAL_ACK; - _debug("request final ACK"); - - /* get an extra ref on the call for the final-ACK generator to - * release */ - rxrpc_get_call(call); - set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); - if (try_to_del_timer_sync(&call->ack_timer) >= 0) - rxrpc_queue_call(call); - break; - - case RXRPC_CALL_SERVER_RECV_REQUEST: - call->state = RXRPC_CALL_SERVER_ACK_REQUEST; - default: - break; - } - - write_unlock(&call->state_lock); -} - -/* - * drop the bottom ACK off of the call ACK window and advance the window - */ -static void rxrpc_hard_ACK_data(struct rxrpc_call *call, - struct rxrpc_skb_priv *sp) -{ - int loop; - u32 seq; - - spin_lock_bh(&call->lock); - - _debug("hard ACK #%u", sp->hdr.seq); - - for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) { - call->ackr_window[loop] >>= 1; - call->ackr_window[loop] |= - call->ackr_window[loop + 1] << (BITS_PER_LONG - 1); - } - - seq = sp->hdr.seq; - ASSERTCMP(seq, ==, call->rx_data_eaten + 1); - call->rx_data_eaten = seq; - - if (call->ackr_win_top < UINT_MAX) - call->ackr_win_top++; - - ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE, - call->rx_data_post, >=, call->rx_data_recv); - ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE, - call->rx_data_recv, >=, call->rx_data_eaten); - - if (sp->hdr.flags & RXRPC_LAST_PACKET) { - rxrpc_request_final_ACK(call); - } else if (atomic_dec_and_test(&call->ackr_not_idle) && - test_and_clear_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags)) { - /* We previously soft-ACK'd some received packets that have now - * been consumed, so send a hard-ACK if no more packets are - * immediately forthcoming to allow the transmitter to free up - * its Tx bufferage. - */ - _debug("send Rx idle ACK"); - __rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, sp->hdr.serial, - false); - } - - spin_unlock_bh(&call->lock); -} - -/* - * destroy a packet that has an RxRPC control buffer - * - advance the hard-ACK state of the parent call (done here in case something - * in the kernel bypasses recvmsg() and steals the packet directly off of the - * socket receive queue) - */ -void rxrpc_packet_destructor(struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_call *call = sp->call; - - _enter("%p{%p}", skb, call); - - if (call) { - /* send the final ACK on a client call */ - if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) - rxrpc_hard_ACK_data(call, sp); - rxrpc_put_call(call); - sp->call = NULL; - } - - if (skb->sk) - sock_rfree(skb); - _leave(""); -} - -/** - * rxrpc_kernel_free_skb - Free an RxRPC socket buffer - * @skb: The socket buffer to be freed - * - * Let RxRPC free its own socket buffer, permitting it to maintain debug - * accounting. - */ -void rxrpc_kernel_free_skb(struct sk_buff *skb) -{ - rxrpc_free_skb(skb); -} -EXPORT_SYMBOL(rxrpc_kernel_free_skb); diff --git a/net/rxrpc/ar-transport.c b/net/rxrpc/ar-transport.c deleted file mode 100644 index a1b65183b07d..000000000000 --- a/net/rxrpc/ar-transport.c +++ /dev/null @@ -1,286 +0,0 @@ -/* RxRPC point-to-point transport session management - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include "ar-internal.h" - -/* - * Time after last use at which transport record is cleaned up. - */ -unsigned int rxrpc_transport_expiry = 3600 * 24; - -static void rxrpc_transport_reaper(struct work_struct *work); - -static LIST_HEAD(rxrpc_transports); -static DEFINE_RWLOCK(rxrpc_transport_lock); -static DECLARE_DELAYED_WORK(rxrpc_transport_reap, rxrpc_transport_reaper); - -/* - * allocate a new transport session manager - */ -static struct rxrpc_transport *rxrpc_alloc_transport(struct rxrpc_local *local, - struct rxrpc_peer *peer, - gfp_t gfp) -{ - struct rxrpc_transport *trans; - - _enter(""); - - trans = kzalloc(sizeof(struct rxrpc_transport), gfp); - if (trans) { - trans->local = local; - trans->peer = peer; - INIT_LIST_HEAD(&trans->link); - trans->bundles = RB_ROOT; - trans->client_conns = RB_ROOT; - trans->server_conns = RB_ROOT; - skb_queue_head_init(&trans->error_queue); - spin_lock_init(&trans->client_lock); - rwlock_init(&trans->conn_lock); - atomic_set(&trans->usage, 1); - trans->conn_idcounter = peer->srx.srx_service << 16; - trans->debug_id = atomic_inc_return(&rxrpc_debug_id); - - if (peer->srx.transport.family == AF_INET) { - switch (peer->srx.transport_type) { - case SOCK_DGRAM: - INIT_WORK(&trans->error_handler, - rxrpc_UDP_error_handler); - break; - default: - BUG(); - break; - } - } else { - BUG(); - } - } - - _leave(" = %p", trans); - return trans; -} - -/* - * obtain a transport session for the nominated endpoints - */ -struct rxrpc_transport *rxrpc_get_transport(struct rxrpc_local *local, - struct rxrpc_peer *peer, - gfp_t gfp) -{ - struct rxrpc_transport *trans, *candidate; - const char *new = "old"; - int usage; - - _enter("{%pI4+%hu},{%pI4+%hu},", - &local->srx.transport.sin.sin_addr, - ntohs(local->srx.transport.sin.sin_port), - &peer->srx.transport.sin.sin_addr, - ntohs(peer->srx.transport.sin.sin_port)); - - /* search the transport list first */ - read_lock_bh(&rxrpc_transport_lock); - list_for_each_entry(trans, &rxrpc_transports, link) { - if (trans->local == local && trans->peer == peer) - goto found_extant_transport; - } - read_unlock_bh(&rxrpc_transport_lock); - - /* not yet present - create a candidate for a new record and then - * redo the search */ - candidate = rxrpc_alloc_transport(local, peer, gfp); - if (!candidate) { - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); - } - - write_lock_bh(&rxrpc_transport_lock); - - list_for_each_entry(trans, &rxrpc_transports, link) { - if (trans->local == local && trans->peer == peer) - goto found_extant_second; - } - - /* we can now add the new candidate to the list */ - trans = candidate; - candidate = NULL; - usage = atomic_read(&trans->usage); - - rxrpc_get_local(trans->local); - atomic_inc(&trans->peer->usage); - list_add_tail(&trans->link, &rxrpc_transports); - write_unlock_bh(&rxrpc_transport_lock); - new = "new"; - -success: - _net("TRANSPORT %s %d local %d -> peer %d", - new, - trans->debug_id, - trans->local->debug_id, - trans->peer->debug_id); - - _leave(" = %p {u=%d}", trans, usage); - return trans; - - /* we found the transport in the list immediately */ -found_extant_transport: - usage = atomic_inc_return(&trans->usage); - read_unlock_bh(&rxrpc_transport_lock); - goto success; - - /* we found the transport on the second time through the list */ -found_extant_second: - usage = atomic_inc_return(&trans->usage); - write_unlock_bh(&rxrpc_transport_lock); - kfree(candidate); - goto success; -} - -/* - * find the transport connecting two endpoints - */ -struct rxrpc_transport *rxrpc_find_transport(struct rxrpc_local *local, - struct rxrpc_peer *peer) -{ - struct rxrpc_transport *trans; - - _enter("{%pI4+%hu},{%pI4+%hu},", - &local->srx.transport.sin.sin_addr, - ntohs(local->srx.transport.sin.sin_port), - &peer->srx.transport.sin.sin_addr, - ntohs(peer->srx.transport.sin.sin_port)); - - /* search the transport list */ - read_lock_bh(&rxrpc_transport_lock); - - list_for_each_entry(trans, &rxrpc_transports, link) { - if (trans->local == local && trans->peer == peer) - goto found_extant_transport; - } - - read_unlock_bh(&rxrpc_transport_lock); - _leave(" = NULL"); - return NULL; - -found_extant_transport: - atomic_inc(&trans->usage); - read_unlock_bh(&rxrpc_transport_lock); - _leave(" = %p", trans); - return trans; -} - -/* - * release a transport session - */ -void rxrpc_put_transport(struct rxrpc_transport *trans) -{ - _enter("%p{u=%d}", trans, atomic_read(&trans->usage)); - - ASSERTCMP(atomic_read(&trans->usage), >, 0); - - trans->put_time = ktime_get_seconds(); - if (unlikely(atomic_dec_and_test(&trans->usage))) { - _debug("zombie"); - /* let the reaper determine the timeout to avoid a race with - * overextending the timeout if the reaper is running at the - * same time */ - rxrpc_queue_delayed_work(&rxrpc_transport_reap, 0); - } - _leave(""); -} - -/* - * clean up a transport session - */ -static void rxrpc_cleanup_transport(struct rxrpc_transport *trans) -{ - _net("DESTROY TRANS %d", trans->debug_id); - - rxrpc_purge_queue(&trans->error_queue); - - rxrpc_put_local(trans->local); - rxrpc_put_peer(trans->peer); - kfree(trans); -} - -/* - * reap dead transports that have passed their expiry date - */ -static void rxrpc_transport_reaper(struct work_struct *work) -{ - struct rxrpc_transport *trans, *_p; - unsigned long now, earliest, reap_time; - - LIST_HEAD(graveyard); - - _enter(""); - - now = ktime_get_seconds(); - earliest = ULONG_MAX; - - /* extract all the transports that have been dead too long */ - write_lock_bh(&rxrpc_transport_lock); - list_for_each_entry_safe(trans, _p, &rxrpc_transports, link) { - _debug("reap TRANS %d { u=%d t=%ld }", - trans->debug_id, atomic_read(&trans->usage), - (long) now - (long) trans->put_time); - - if (likely(atomic_read(&trans->usage) > 0)) - continue; - - reap_time = trans->put_time + rxrpc_transport_expiry; - if (reap_time <= now) - list_move_tail(&trans->link, &graveyard); - else if (reap_time < earliest) - earliest = reap_time; - } - write_unlock_bh(&rxrpc_transport_lock); - - if (earliest != ULONG_MAX) { - _debug("reschedule reaper %ld", (long) earliest - now); - ASSERTCMP(earliest, >, now); - rxrpc_queue_delayed_work(&rxrpc_transport_reap, - (earliest - now) * HZ); - } - - /* then destroy all those pulled out */ - while (!list_empty(&graveyard)) { - trans = list_entry(graveyard.next, struct rxrpc_transport, - link); - list_del_init(&trans->link); - - ASSERTCMP(atomic_read(&trans->usage), ==, 0); - rxrpc_cleanup_transport(trans); - } - - _leave(""); -} - -/* - * preemptively destroy all the transport session records rather than waiting - * for them to time out - */ -void __exit rxrpc_destroy_all_transports(void) -{ - _enter(""); - - rxrpc_transport_expiry = 0; - cancel_delayed_work(&rxrpc_transport_reap); - rxrpc_queue_delayed_work(&rxrpc_transport_reap, 0); - - _leave(""); -} diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c new file mode 100644 index 000000000000..eea5f4a5d8b1 --- /dev/null +++ b/net/rxrpc/call_accept.c @@ -0,0 +1,518 @@ +/* incoming call handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +/* + * generate a connection-level abort + */ +static int rxrpc_busy(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, + struct rxrpc_wire_header *whdr) +{ + struct msghdr msg; + struct kvec iov[1]; + size_t len; + int ret; + + _enter("%d,,", local->debug_id); + + whdr->type = RXRPC_PACKET_TYPE_BUSY; + whdr->serial = htonl(1); + + msg.msg_name = &srx->transport.sin; + msg.msg_namelen = sizeof(srx->transport.sin); + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + iov[0].iov_base = whdr; + iov[0].iov_len = sizeof(*whdr); + + len = iov[0].iov_len; + + _proto("Tx BUSY %%1"); + + ret = kernel_sendmsg(local->socket, &msg, iov, 1, len); + if (ret < 0) { + _leave(" = -EAGAIN [sendmsg failed: %d]", ret); + return -EAGAIN; + } + + _leave(" = 0"); + return 0; +} + +/* + * accept an incoming call that needs peer, transport and/or connection setting + * up + */ +static int rxrpc_accept_incoming_call(struct rxrpc_local *local, + struct rxrpc_sock *rx, + struct sk_buff *skb, + struct sockaddr_rxrpc *srx) +{ + struct rxrpc_connection *conn; + struct rxrpc_transport *trans; + struct rxrpc_skb_priv *sp, *nsp; + struct rxrpc_peer *peer; + struct rxrpc_call *call; + struct sk_buff *notification; + int ret; + + _enter(""); + + sp = rxrpc_skb(skb); + + /* get a notification message to send to the server app */ + notification = alloc_skb(0, GFP_NOFS); + if (!notification) { + _debug("no memory"); + ret = -ENOMEM; + goto error_nofree; + } + rxrpc_new_skb(notification); + notification->mark = RXRPC_SKB_MARK_NEW_CALL; + + peer = rxrpc_get_peer(srx, GFP_NOIO); + if (IS_ERR(peer)) { + _debug("no peer"); + ret = -EBUSY; + goto error; + } + + trans = rxrpc_get_transport(local, peer, GFP_NOIO); + rxrpc_put_peer(peer); + if (IS_ERR(trans)) { + _debug("no trans"); + ret = -EBUSY; + goto error; + } + + conn = rxrpc_incoming_connection(trans, &sp->hdr); + rxrpc_put_transport(trans); + if (IS_ERR(conn)) { + _debug("no conn"); + ret = PTR_ERR(conn); + goto error; + } + + call = rxrpc_incoming_call(rx, conn, &sp->hdr); + rxrpc_put_connection(conn); + if (IS_ERR(call)) { + _debug("no call"); + ret = PTR_ERR(call); + goto error; + } + + /* attach the call to the socket */ + read_lock_bh(&local->services_lock); + if (rx->sk.sk_state == RXRPC_CLOSE) + goto invalid_service; + + write_lock(&rx->call_lock); + if (!test_and_set_bit(RXRPC_CALL_INIT_ACCEPT, &call->flags)) { + rxrpc_get_call(call); + + spin_lock(&call->conn->state_lock); + if (sp->hdr.securityIndex > 0 && + call->conn->state == RXRPC_CONN_SERVER_UNSECURED) { + _debug("await conn sec"); + list_add_tail(&call->accept_link, &rx->secureq); + call->conn->state = RXRPC_CONN_SERVER_CHALLENGING; + atomic_inc(&call->conn->usage); + set_bit(RXRPC_CONN_CHALLENGE, &call->conn->events); + rxrpc_queue_conn(call->conn); + } else { + _debug("conn ready"); + call->state = RXRPC_CALL_SERVER_ACCEPTING; + list_add_tail(&call->accept_link, &rx->acceptq); + rxrpc_get_call(call); + nsp = rxrpc_skb(notification); + nsp->call = call; + + ASSERTCMP(atomic_read(&call->usage), >=, 3); + + _debug("notify"); + spin_lock(&call->lock); + ret = rxrpc_queue_rcv_skb(call, notification, true, + false); + spin_unlock(&call->lock); + notification = NULL; + BUG_ON(ret < 0); + } + spin_unlock(&call->conn->state_lock); + + _debug("queued"); + } + write_unlock(&rx->call_lock); + + _debug("process"); + rxrpc_fast_process_packet(call, skb); + + _debug("done"); + read_unlock_bh(&local->services_lock); + rxrpc_free_skb(notification); + rxrpc_put_call(call); + _leave(" = 0"); + return 0; + +invalid_service: + _debug("invalid"); + read_unlock_bh(&local->services_lock); + + read_lock_bh(&call->state_lock); + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && + !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { + rxrpc_get_call(call); + rxrpc_queue_call(call); + } + read_unlock_bh(&call->state_lock); + rxrpc_put_call(call); + ret = -ECONNREFUSED; +error: + rxrpc_free_skb(notification); +error_nofree: + _leave(" = %d", ret); + return ret; +} + +/* + * accept incoming calls that need peer, transport and/or connection setting up + * - the packets we get are all incoming client DATA packets that have seq == 1 + */ +void rxrpc_accept_incoming_calls(struct work_struct *work) +{ + struct rxrpc_local *local = + container_of(work, struct rxrpc_local, acceptor); + struct rxrpc_skb_priv *sp; + struct sockaddr_rxrpc srx; + struct rxrpc_sock *rx; + struct rxrpc_wire_header whdr; + struct sk_buff *skb; + int ret; + + _enter("%d", local->debug_id); + + read_lock_bh(&rxrpc_local_lock); + if (atomic_read(&local->usage) > 0) + rxrpc_get_local(local); + else + local = NULL; + read_unlock_bh(&rxrpc_local_lock); + if (!local) { + _leave(" [local dead]"); + return; + } + +process_next_packet: + skb = skb_dequeue(&local->accept_queue); + if (!skb) { + rxrpc_put_local(local); + _leave("\n"); + return; + } + + _net("incoming call skb %p", skb); + + sp = rxrpc_skb(skb); + + /* Set up a response packet header in case we need it */ + whdr.epoch = htonl(sp->hdr.epoch); + whdr.cid = htonl(sp->hdr.cid); + whdr.callNumber = htonl(sp->hdr.callNumber); + whdr.seq = htonl(sp->hdr.seq); + whdr.serial = 0; + whdr.flags = 0; + whdr.type = 0; + whdr.userStatus = 0; + whdr.securityIndex = sp->hdr.securityIndex; + whdr._rsvd = 0; + whdr.serviceId = htons(sp->hdr.serviceId); + + /* determine the remote address */ + memset(&srx, 0, sizeof(srx)); + srx.srx_family = AF_RXRPC; + srx.transport.family = local->srx.transport.family; + srx.transport_type = local->srx.transport_type; + switch (srx.transport.family) { + case AF_INET: + srx.transport_len = sizeof(struct sockaddr_in); + srx.transport.sin.sin_port = udp_hdr(skb)->source; + srx.transport.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; + break; + default: + goto busy; + } + + /* get the socket providing the service */ + read_lock_bh(&local->services_lock); + list_for_each_entry(rx, &local->services, listen_link) { + if (rx->srx.srx_service == sp->hdr.serviceId && + rx->sk.sk_state != RXRPC_CLOSE) + goto found_service; + } + read_unlock_bh(&local->services_lock); + goto invalid_service; + +found_service: + _debug("found service %hd", rx->srx.srx_service); + if (sk_acceptq_is_full(&rx->sk)) + goto backlog_full; + sk_acceptq_added(&rx->sk); + sock_hold(&rx->sk); + read_unlock_bh(&local->services_lock); + + ret = rxrpc_accept_incoming_call(local, rx, skb, &srx); + if (ret < 0) + sk_acceptq_removed(&rx->sk); + sock_put(&rx->sk); + switch (ret) { + case -ECONNRESET: /* old calls are ignored */ + case -ECONNABORTED: /* aborted calls are reaborted or ignored */ + case 0: + goto process_next_packet; + case -ECONNREFUSED: + goto invalid_service; + case -EBUSY: + goto busy; + case -EKEYREJECTED: + goto security_mismatch; + default: + BUG(); + } + +backlog_full: + read_unlock_bh(&local->services_lock); +busy: + rxrpc_busy(local, &srx, &whdr); + rxrpc_free_skb(skb); + goto process_next_packet; + +invalid_service: + skb->priority = RX_INVALID_OPERATION; + rxrpc_reject_packet(local, skb); + goto process_next_packet; + + /* can't change connection security type mid-flow */ +security_mismatch: + skb->priority = RX_PROTOCOL_ERROR; + rxrpc_reject_packet(local, skb); + goto process_next_packet; +} + +/* + * handle acceptance of a call by userspace + * - assign the user call ID to the call at the front of the queue + */ +struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, + unsigned long user_call_ID) +{ + struct rxrpc_call *call; + struct rb_node *parent, **pp; + int ret; + + _enter(",%lx", user_call_ID); + + ASSERT(!irqs_disabled()); + + write_lock(&rx->call_lock); + + ret = -ENODATA; + if (list_empty(&rx->acceptq)) + goto out; + + /* check the user ID isn't already in use */ + ret = -EBADSLT; + pp = &rx->calls.rb_node; + parent = NULL; + while (*pp) { + parent = *pp; + call = rb_entry(parent, struct rxrpc_call, sock_node); + + if (user_call_ID < call->user_call_ID) + pp = &(*pp)->rb_left; + else if (user_call_ID > call->user_call_ID) + pp = &(*pp)->rb_right; + else + goto out; + } + + /* dequeue the first call and check it's still valid */ + call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link); + list_del_init(&call->accept_link); + sk_acceptq_removed(&rx->sk); + + write_lock_bh(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_SERVER_ACCEPTING: + call->state = RXRPC_CALL_SERVER_RECV_REQUEST; + break; + case RXRPC_CALL_REMOTELY_ABORTED: + case RXRPC_CALL_LOCALLY_ABORTED: + ret = -ECONNABORTED; + goto out_release; + case RXRPC_CALL_NETWORK_ERROR: + ret = call->conn->error; + goto out_release; + case RXRPC_CALL_DEAD: + ret = -ETIME; + goto out_discard; + default: + BUG(); + } + + /* formalise the acceptance */ + call->user_call_ID = user_call_ID; + rb_link_node(&call->sock_node, parent, pp); + rb_insert_color(&call->sock_node, &rx->calls); + if (test_and_set_bit(RXRPC_CALL_HAS_USERID, &call->flags)) + BUG(); + if (test_and_set_bit(RXRPC_CALL_EV_ACCEPTED, &call->events)) + BUG(); + rxrpc_queue_call(call); + + rxrpc_get_call(call); + write_unlock_bh(&call->state_lock); + write_unlock(&rx->call_lock); + _leave(" = %p{%d}", call, call->debug_id); + return call; + + /* if the call is already dying or dead, then we leave the socket's ref + * on it to be released by rxrpc_dead_call_expired() as induced by + * rxrpc_release_call() */ +out_release: + _debug("release %p", call); + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && + !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) + rxrpc_queue_call(call); +out_discard: + write_unlock_bh(&call->state_lock); + _debug("discard %p", call); +out: + write_unlock(&rx->call_lock); + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * Handle rejection of a call by userspace + * - reject the call at the front of the queue + */ +int rxrpc_reject_call(struct rxrpc_sock *rx) +{ + struct rxrpc_call *call; + int ret; + + _enter(""); + + ASSERT(!irqs_disabled()); + + write_lock(&rx->call_lock); + + ret = -ENODATA; + if (list_empty(&rx->acceptq)) + goto out; + + /* dequeue the first call and check it's still valid */ + call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link); + list_del_init(&call->accept_link); + sk_acceptq_removed(&rx->sk); + + write_lock_bh(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_SERVER_ACCEPTING: + call->state = RXRPC_CALL_SERVER_BUSY; + if (test_and_set_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) + rxrpc_queue_call(call); + ret = 0; + goto out_release; + case RXRPC_CALL_REMOTELY_ABORTED: + case RXRPC_CALL_LOCALLY_ABORTED: + ret = -ECONNABORTED; + goto out_release; + case RXRPC_CALL_NETWORK_ERROR: + ret = call->conn->error; + goto out_release; + case RXRPC_CALL_DEAD: + ret = -ETIME; + goto out_discard; + default: + BUG(); + } + + /* if the call is already dying or dead, then we leave the socket's ref + * on it to be released by rxrpc_dead_call_expired() as induced by + * rxrpc_release_call() */ +out_release: + _debug("release %p", call); + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && + !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) + rxrpc_queue_call(call); +out_discard: + write_unlock_bh(&call->state_lock); + _debug("discard %p", call); +out: + write_unlock(&rx->call_lock); + _leave(" = %d", ret); + return ret; +} + +/** + * rxrpc_kernel_accept_call - Allow a kernel service to accept an incoming call + * @sock: The socket on which the impending call is waiting + * @user_call_ID: The tag to attach to the call + * + * Allow a kernel service to accept an incoming call, assuming the incoming + * call is still valid. + */ +struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *sock, + unsigned long user_call_ID) +{ + struct rxrpc_call *call; + + _enter(",%lx", user_call_ID); + call = rxrpc_accept_call(rxrpc_sk(sock->sk), user_call_ID); + _leave(" = %p", call); + return call; +} +EXPORT_SYMBOL(rxrpc_kernel_accept_call); + +/** + * rxrpc_kernel_reject_call - Allow a kernel service to reject an incoming call + * @sock: The socket on which the impending call is waiting + * + * Allow a kernel service to reject an incoming call with a BUSY message, + * assuming the incoming call is still valid. + */ +int rxrpc_kernel_reject_call(struct socket *sock) +{ + int ret; + + _enter(""); + ret = rxrpc_reject_call(rxrpc_sk(sock->sk)); + _leave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(rxrpc_kernel_reject_call); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c new file mode 100644 index 000000000000..18381783c2b1 --- /dev/null +++ b/net/rxrpc/call_event.c @@ -0,0 +1,1288 @@ +/* Management of Tx window, Tx resend, ACKs and out-of-sequence reception + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +/* + * propose an ACK be sent + */ +void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, + u32 serial, bool immediate) +{ + unsigned long expiry; + s8 prior = rxrpc_ack_priority[ack_reason]; + + ASSERTCMP(prior, >, 0); + + _enter("{%d},%s,%%%x,%u", + call->debug_id, rxrpc_acks(ack_reason), serial, immediate); + + if (prior < rxrpc_ack_priority[call->ackr_reason]) { + if (immediate) + goto cancel_timer; + return; + } + + /* update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial + * numbers */ + if (prior == rxrpc_ack_priority[call->ackr_reason]) { + if (prior <= 4) + call->ackr_serial = serial; + if (immediate) + goto cancel_timer; + return; + } + + call->ackr_reason = ack_reason; + call->ackr_serial = serial; + + switch (ack_reason) { + case RXRPC_ACK_DELAY: + _debug("run delay timer"); + expiry = rxrpc_soft_ack_delay; + goto run_timer; + + case RXRPC_ACK_IDLE: + if (!immediate) { + _debug("run defer timer"); + expiry = rxrpc_idle_ack_delay; + goto run_timer; + } + goto cancel_timer; + + case RXRPC_ACK_REQUESTED: + expiry = rxrpc_requested_ack_delay; + if (!expiry) + goto cancel_timer; + if (!immediate || serial == 1) { + _debug("run defer timer"); + goto run_timer; + } + + default: + _debug("immediate ACK"); + goto cancel_timer; + } + +run_timer: + expiry += jiffies; + if (!timer_pending(&call->ack_timer) || + time_after(call->ack_timer.expires, expiry)) + mod_timer(&call->ack_timer, expiry); + return; + +cancel_timer: + _debug("cancel timer %%%u", serial); + try_to_del_timer_sync(&call->ack_timer); + read_lock_bh(&call->state_lock); + if (call->state <= RXRPC_CALL_COMPLETE && + !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events)) + rxrpc_queue_call(call); + read_unlock_bh(&call->state_lock); +} + +/* + * propose an ACK be sent, locking the call structure + */ +void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, + u32 serial, bool immediate) +{ + s8 prior = rxrpc_ack_priority[ack_reason]; + + if (prior > rxrpc_ack_priority[call->ackr_reason]) { + spin_lock_bh(&call->lock); + __rxrpc_propose_ACK(call, ack_reason, serial, immediate); + spin_unlock_bh(&call->lock); + } +} + +/* + * set the resend timer + */ +static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend, + unsigned long resend_at) +{ + read_lock_bh(&call->state_lock); + if (call->state >= RXRPC_CALL_COMPLETE) + resend = 0; + + if (resend & 1) { + _debug("SET RESEND"); + set_bit(RXRPC_CALL_EV_RESEND, &call->events); + } + + if (resend & 2) { + _debug("MODIFY RESEND TIMER"); + set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + mod_timer(&call->resend_timer, resend_at); + } else { + _debug("KILL RESEND TIMER"); + del_timer_sync(&call->resend_timer); + clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + } + read_unlock_bh(&call->state_lock); +} + +/* + * resend packets + */ +static void rxrpc_resend(struct rxrpc_call *call) +{ + struct rxrpc_wire_header *whdr; + struct rxrpc_skb_priv *sp; + struct sk_buff *txb; + unsigned long *p_txb, resend_at; + bool stop; + int loop; + u8 resend; + + _enter("{%d,%d,%d,%d},", + call->acks_hard, call->acks_unacked, + atomic_read(&call->sequence), + CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz)); + + stop = false; + resend = 0; + resend_at = 0; + + for (loop = call->acks_tail; + loop != call->acks_head || stop; + loop = (loop + 1) & (call->acks_winsz - 1) + ) { + p_txb = call->acks_window + loop; + smp_read_barrier_depends(); + if (*p_txb & 1) + continue; + + txb = (struct sk_buff *) *p_txb; + sp = rxrpc_skb(txb); + + if (sp->need_resend) { + sp->need_resend = false; + + /* each Tx packet has a new serial number */ + sp->hdr.serial = atomic_inc_return(&call->conn->serial); + + whdr = (struct rxrpc_wire_header *)txb->head; + whdr->serial = htonl(sp->hdr.serial); + + _proto("Tx DATA %%%u { #%d }", + sp->hdr.serial, sp->hdr.seq); + if (rxrpc_send_packet(call->conn->trans, txb) < 0) { + stop = true; + sp->resend_at = jiffies + 3; + } else { + sp->resend_at = + jiffies + rxrpc_resend_timeout; + } + } + + if (time_after_eq(jiffies + 1, sp->resend_at)) { + sp->need_resend = true; + resend |= 1; + } else if (resend & 2) { + if (time_before(sp->resend_at, resend_at)) + resend_at = sp->resend_at; + } else { + resend_at = sp->resend_at; + resend |= 2; + } + } + + rxrpc_set_resend(call, resend, resend_at); + _leave(""); +} + +/* + * handle resend timer expiry + */ +static void rxrpc_resend_timer(struct rxrpc_call *call) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *txb; + unsigned long *p_txb, resend_at; + int loop; + u8 resend; + + _enter("%d,%d,%d", + call->acks_tail, call->acks_unacked, call->acks_head); + + if (call->state >= RXRPC_CALL_COMPLETE) + return; + + resend = 0; + resend_at = 0; + + for (loop = call->acks_unacked; + loop != call->acks_head; + loop = (loop + 1) & (call->acks_winsz - 1) + ) { + p_txb = call->acks_window + loop; + smp_read_barrier_depends(); + txb = (struct sk_buff *) (*p_txb & ~1); + sp = rxrpc_skb(txb); + + ASSERT(!(*p_txb & 1)); + + if (sp->need_resend) { + ; + } else if (time_after_eq(jiffies + 1, sp->resend_at)) { + sp->need_resend = true; + resend |= 1; + } else if (resend & 2) { + if (time_before(sp->resend_at, resend_at)) + resend_at = sp->resend_at; + } else { + resend_at = sp->resend_at; + resend |= 2; + } + } + + rxrpc_set_resend(call, resend, resend_at); + _leave(""); +} + +/* + * process soft ACKs of our transmitted packets + * - these indicate packets the peer has or has not received, but hasn't yet + * given to the consumer, and so can still be discarded and re-requested + */ +static int rxrpc_process_soft_ACKs(struct rxrpc_call *call, + struct rxrpc_ackpacket *ack, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *txb; + unsigned long *p_txb, resend_at; + int loop; + u8 sacks[RXRPC_MAXACKS], resend; + + _enter("{%d,%d},{%d},", + call->acks_hard, + CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz), + ack->nAcks); + + if (skb_copy_bits(skb, 0, sacks, ack->nAcks) < 0) + goto protocol_error; + + resend = 0; + resend_at = 0; + for (loop = 0; loop < ack->nAcks; loop++) { + p_txb = call->acks_window; + p_txb += (call->acks_tail + loop) & (call->acks_winsz - 1); + smp_read_barrier_depends(); + txb = (struct sk_buff *) (*p_txb & ~1); + sp = rxrpc_skb(txb); + + switch (sacks[loop]) { + case RXRPC_ACK_TYPE_ACK: + sp->need_resend = false; + *p_txb |= 1; + break; + case RXRPC_ACK_TYPE_NACK: + sp->need_resend = true; + *p_txb &= ~1; + resend = 1; + break; + default: + _debug("Unsupported ACK type %d", sacks[loop]); + goto protocol_error; + } + } + + smp_mb(); + call->acks_unacked = (call->acks_tail + loop) & (call->acks_winsz - 1); + + /* anything not explicitly ACK'd is implicitly NACK'd, but may just not + * have been received or processed yet by the far end */ + for (loop = call->acks_unacked; + loop != call->acks_head; + loop = (loop + 1) & (call->acks_winsz - 1) + ) { + p_txb = call->acks_window + loop; + smp_read_barrier_depends(); + txb = (struct sk_buff *) (*p_txb & ~1); + sp = rxrpc_skb(txb); + + if (*p_txb & 1) { + /* packet must have been discarded */ + sp->need_resend = true; + *p_txb &= ~1; + resend |= 1; + } else if (sp->need_resend) { + ; + } else if (time_after_eq(jiffies + 1, sp->resend_at)) { + sp->need_resend = true; + resend |= 1; + } else if (resend & 2) { + if (time_before(sp->resend_at, resend_at)) + resend_at = sp->resend_at; + } else { + resend_at = sp->resend_at; + resend |= 2; + } + } + + rxrpc_set_resend(call, resend, resend_at); + _leave(" = 0"); + return 0; + +protocol_error: + _leave(" = -EPROTO"); + return -EPROTO; +} + +/* + * discard hard-ACK'd packets from the Tx window + */ +static void rxrpc_rotate_tx_window(struct rxrpc_call *call, u32 hard) +{ + unsigned long _skb; + int tail = call->acks_tail, old_tail; + int win = CIRC_CNT(call->acks_head, tail, call->acks_winsz); + + _enter("{%u,%u},%u", call->acks_hard, win, hard); + + ASSERTCMP(hard - call->acks_hard, <=, win); + + while (call->acks_hard < hard) { + smp_read_barrier_depends(); + _skb = call->acks_window[tail] & ~1; + rxrpc_free_skb((struct sk_buff *) _skb); + old_tail = tail; + tail = (tail + 1) & (call->acks_winsz - 1); + call->acks_tail = tail; + if (call->acks_unacked == old_tail) + call->acks_unacked = tail; + call->acks_hard++; + } + + wake_up(&call->tx_waitq); +} + +/* + * clear the Tx window in the event of a failure + */ +static void rxrpc_clear_tx_window(struct rxrpc_call *call) +{ + rxrpc_rotate_tx_window(call, atomic_read(&call->sequence)); +} + +/* + * drain the out of sequence received packet queue into the packet Rx queue + */ +static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + bool terminal; + int ret; + + _enter("{%d,%d}", call->rx_data_post, call->rx_first_oos); + + spin_lock_bh(&call->lock); + + ret = -ECONNRESET; + if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) + goto socket_unavailable; + + skb = skb_dequeue(&call->rx_oos_queue); + if (skb) { + sp = rxrpc_skb(skb); + + _debug("drain OOS packet %d [%d]", + sp->hdr.seq, call->rx_first_oos); + + if (sp->hdr.seq != call->rx_first_oos) { + skb_queue_head(&call->rx_oos_queue, skb); + call->rx_first_oos = rxrpc_skb(skb)->hdr.seq; + _debug("requeue %p {%u}", skb, call->rx_first_oos); + } else { + skb->mark = RXRPC_SKB_MARK_DATA; + terminal = ((sp->hdr.flags & RXRPC_LAST_PACKET) && + !(sp->hdr.flags & RXRPC_CLIENT_INITIATED)); + ret = rxrpc_queue_rcv_skb(call, skb, true, terminal); + BUG_ON(ret < 0); + _debug("drain #%u", call->rx_data_post); + call->rx_data_post++; + + /* find out what the next packet is */ + skb = skb_peek(&call->rx_oos_queue); + if (skb) + call->rx_first_oos = rxrpc_skb(skb)->hdr.seq; + else + call->rx_first_oos = 0; + _debug("peek %p {%u}", skb, call->rx_first_oos); + } + } + + ret = 0; +socket_unavailable: + spin_unlock_bh(&call->lock); + _leave(" = %d", ret); + return ret; +} + +/* + * insert an out of sequence packet into the buffer + */ +static void rxrpc_insert_oos_packet(struct rxrpc_call *call, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp, *psp; + struct sk_buff *p; + u32 seq; + + sp = rxrpc_skb(skb); + seq = sp->hdr.seq; + _enter(",,{%u}", seq); + + skb->destructor = rxrpc_packet_destructor; + ASSERTCMP(sp->call, ==, NULL); + sp->call = call; + rxrpc_get_call(call); + + /* insert into the buffer in sequence order */ + spin_lock_bh(&call->lock); + + skb_queue_walk(&call->rx_oos_queue, p) { + psp = rxrpc_skb(p); + if (psp->hdr.seq > seq) { + _debug("insert oos #%u before #%u", seq, psp->hdr.seq); + skb_insert(p, skb, &call->rx_oos_queue); + goto inserted; + } + } + + _debug("append oos #%u", seq); + skb_queue_tail(&call->rx_oos_queue, skb); +inserted: + + /* we might now have a new front to the queue */ + if (call->rx_first_oos == 0 || seq < call->rx_first_oos) + call->rx_first_oos = seq; + + read_lock(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE && + call->rx_data_post == call->rx_first_oos) { + _debug("drain rx oos now"); + set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events); + } + read_unlock(&call->state_lock); + + spin_unlock_bh(&call->lock); + _leave(" [stored #%u]", call->rx_first_oos); +} + +/* + * clear the Tx window on final ACK reception + */ +static void rxrpc_zap_tx_window(struct rxrpc_call *call) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + unsigned long _skb, *acks_window; + u8 winsz = call->acks_winsz; + int tail; + + acks_window = call->acks_window; + call->acks_window = NULL; + + while (CIRC_CNT(call->acks_head, call->acks_tail, winsz) > 0) { + tail = call->acks_tail; + smp_read_barrier_depends(); + _skb = acks_window[tail] & ~1; + smp_mb(); + call->acks_tail = (call->acks_tail + 1) & (winsz - 1); + + skb = (struct sk_buff *) _skb; + sp = rxrpc_skb(skb); + _debug("+++ clear Tx %u", sp->hdr.seq); + rxrpc_free_skb(skb); + } + + kfree(acks_window); +} + +/* + * process the extra information that may be appended to an ACK packet + */ +static void rxrpc_extract_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int latest, int nAcks) +{ + struct rxrpc_ackinfo ackinfo; + struct rxrpc_peer *peer; + unsigned int mtu; + + if (skb_copy_bits(skb, nAcks + 3, &ackinfo, sizeof(ackinfo)) < 0) { + _leave(" [no ackinfo]"); + return; + } + + _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }", + latest, + ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU), + ntohl(ackinfo.rwind), ntohl(ackinfo.jumbo_max)); + + mtu = min(ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU)); + + peer = call->conn->trans->peer; + if (mtu < peer->maxdata) { + spin_lock_bh(&peer->lock); + peer->maxdata = mtu; + peer->mtu = mtu + peer->hdrsize; + spin_unlock_bh(&peer->lock); + _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata); + } +} + +/* + * process packets in the reception queue + */ +static int rxrpc_process_rx_queue(struct rxrpc_call *call, + u32 *_abort_code) +{ + struct rxrpc_ackpacket ack; + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + bool post_ACK; + int latest; + u32 hard, tx; + + _enter(""); + +process_further: + skb = skb_dequeue(&call->rx_queue); + if (!skb) + return -EAGAIN; + + _net("deferred skb %p", skb); + + sp = rxrpc_skb(skb); + + _debug("process %s [st %d]", rxrpc_pkts[sp->hdr.type], call->state); + + post_ACK = false; + + switch (sp->hdr.type) { + /* data packets that wind up here have been received out of + * order, need security processing or are jumbo packets */ + case RXRPC_PACKET_TYPE_DATA: + _proto("OOSQ DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); + + /* secured packets must be verified and possibly decrypted */ + if (call->conn->security->verify_packet(call, skb, + _abort_code) < 0) + goto protocol_error; + + rxrpc_insert_oos_packet(call, skb); + goto process_further; + + /* partial ACK to process */ + case RXRPC_PACKET_TYPE_ACK: + if (skb_copy_bits(skb, 0, &ack, sizeof(ack)) < 0) { + _debug("extraction failure"); + goto protocol_error; + } + if (!skb_pull(skb, sizeof(ack))) + BUG(); + + latest = sp->hdr.serial; + hard = ntohl(ack.firstPacket); + tx = atomic_read(&call->sequence); + + _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", + latest, + ntohs(ack.maxSkew), + hard, + ntohl(ack.previousPacket), + ntohl(ack.serial), + rxrpc_acks(ack.reason), + ack.nAcks); + + rxrpc_extract_ackinfo(call, skb, latest, ack.nAcks); + + if (ack.reason == RXRPC_ACK_PING) { + _proto("Rx ACK %%%u PING Request", latest); + rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE, + sp->hdr.serial, true); + } + + /* discard any out-of-order or duplicate ACKs */ + if (latest - call->acks_latest <= 0) { + _debug("discard ACK %d <= %d", + latest, call->acks_latest); + goto discard; + } + call->acks_latest = latest; + + if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST && + call->state != RXRPC_CALL_CLIENT_AWAIT_REPLY && + call->state != RXRPC_CALL_SERVER_SEND_REPLY && + call->state != RXRPC_CALL_SERVER_AWAIT_ACK) + goto discard; + + _debug("Tx=%d H=%u S=%d", tx, call->acks_hard, call->state); + + if (hard > 0) { + if (hard - 1 > tx) { + _debug("hard-ACK'd packet %d not transmitted" + " (%d top)", + hard - 1, tx); + goto protocol_error; + } + + if ((call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY || + call->state == RXRPC_CALL_SERVER_AWAIT_ACK) && + hard > tx) { + call->acks_hard = tx; + goto all_acked; + } + + smp_rmb(); + rxrpc_rotate_tx_window(call, hard - 1); + } + + if (ack.nAcks > 0) { + if (hard - 1 + ack.nAcks > tx) { + _debug("soft-ACK'd packet %d+%d not" + " transmitted (%d top)", + hard - 1, ack.nAcks, tx); + goto protocol_error; + } + + if (rxrpc_process_soft_ACKs(call, &ack, skb) < 0) + goto protocol_error; + } + goto discard; + + /* complete ACK to process */ + case RXRPC_PACKET_TYPE_ACKALL: + goto all_acked; + + /* abort and busy are handled elsewhere */ + case RXRPC_PACKET_TYPE_BUSY: + case RXRPC_PACKET_TYPE_ABORT: + BUG(); + + /* connection level events - also handled elsewhere */ + case RXRPC_PACKET_TYPE_CHALLENGE: + case RXRPC_PACKET_TYPE_RESPONSE: + case RXRPC_PACKET_TYPE_DEBUG: + BUG(); + } + + /* if we've had a hard ACK that covers all the packets we've sent, then + * that ends that phase of the operation */ +all_acked: + write_lock_bh(&call->state_lock); + _debug("ack all %d", call->state); + + switch (call->state) { + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + call->state = RXRPC_CALL_CLIENT_RECV_REPLY; + break; + case RXRPC_CALL_SERVER_AWAIT_ACK: + _debug("srv complete"); + call->state = RXRPC_CALL_COMPLETE; + post_ACK = true; + break; + case RXRPC_CALL_CLIENT_SEND_REQUEST: + case RXRPC_CALL_SERVER_RECV_REQUEST: + goto protocol_error_unlock; /* can't occur yet */ + default: + write_unlock_bh(&call->state_lock); + goto discard; /* assume packet left over from earlier phase */ + } + + write_unlock_bh(&call->state_lock); + + /* if all the packets we sent are hard-ACK'd, then we can discard + * whatever we've got left */ + _debug("clear Tx %d", + CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz)); + + del_timer_sync(&call->resend_timer); + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); + + if (call->acks_window) + rxrpc_zap_tx_window(call); + + if (post_ACK) { + /* post the final ACK message for userspace to pick up */ + _debug("post ACK"); + skb->mark = RXRPC_SKB_MARK_FINAL_ACK; + sp->call = call; + rxrpc_get_call(call); + spin_lock_bh(&call->lock); + if (rxrpc_queue_rcv_skb(call, skb, true, true) < 0) + BUG(); + spin_unlock_bh(&call->lock); + goto process_further; + } + +discard: + rxrpc_free_skb(skb); + goto process_further; + +protocol_error_unlock: + write_unlock_bh(&call->state_lock); +protocol_error: + rxrpc_free_skb(skb); + _leave(" = -EPROTO"); + return -EPROTO; +} + +/* + * post a message to the socket Rx queue for recvmsg() to pick up + */ +static int rxrpc_post_message(struct rxrpc_call *call, u32 mark, u32 error, + bool fatal) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + int ret; + + _enter("{%d,%lx},%u,%u,%d", + call->debug_id, call->flags, mark, error, fatal); + + /* remove timers and things for fatal messages */ + if (fatal) { + del_timer_sync(&call->resend_timer); + del_timer_sync(&call->ack_timer); + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + } + + if (mark != RXRPC_SKB_MARK_NEW_CALL && + !test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { + _leave("[no userid]"); + return 0; + } + + if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) { + skb = alloc_skb(0, GFP_NOFS); + if (!skb) + return -ENOMEM; + + rxrpc_new_skb(skb); + + skb->mark = mark; + + sp = rxrpc_skb(skb); + memset(sp, 0, sizeof(*sp)); + sp->error = error; + sp->call = call; + rxrpc_get_call(call); + + spin_lock_bh(&call->lock); + ret = rxrpc_queue_rcv_skb(call, skb, true, fatal); + spin_unlock_bh(&call->lock); + BUG_ON(ret < 0); + } + + return 0; +} + +/* + * handle background processing of incoming call packets and ACK / abort + * generation + */ +void rxrpc_process_call(struct work_struct *work) +{ + struct rxrpc_call *call = + container_of(work, struct rxrpc_call, processor); + struct rxrpc_wire_header whdr; + struct rxrpc_ackpacket ack; + struct rxrpc_ackinfo ackinfo; + struct msghdr msg; + struct kvec iov[5]; + enum rxrpc_call_event genbit; + unsigned long bits; + __be32 data, pad; + size_t len; + int loop, nbit, ioc, ret, mtu; + u32 serial, abort_code = RX_PROTOCOL_ERROR; + u8 *acks = NULL; + + //printk("\n--------------------\n"); + _enter("{%d,%s,%lx} [%lu]", + call->debug_id, rxrpc_call_states[call->state], call->events, + (jiffies - call->creation_jif) / (HZ / 10)); + + if (test_and_set_bit(RXRPC_CALL_PROC_BUSY, &call->flags)) { + _debug("XXXXXXXXXXXXX RUNNING ON MULTIPLE CPUS XXXXXXXXXXXXX"); + return; + } + + /* there's a good chance we're going to have to send a message, so set + * one up in advance */ + msg.msg_name = &call->conn->trans->peer->srx.transport; + msg.msg_namelen = call->conn->trans->peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + whdr.epoch = htonl(call->conn->epoch); + whdr.cid = htonl(call->cid); + whdr.callNumber = htonl(call->call_id); + whdr.seq = 0; + whdr.type = RXRPC_PACKET_TYPE_ACK; + whdr.flags = call->conn->out_clientflag; + whdr.userStatus = 0; + whdr.securityIndex = call->conn->security_ix; + whdr._rsvd = 0; + whdr.serviceId = htons(call->service_id); + + memset(iov, 0, sizeof(iov)); + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); + + /* deal with events of a final nature */ + if (test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { + rxrpc_release_call(call); + clear_bit(RXRPC_CALL_EV_RELEASE, &call->events); + } + + if (test_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events)) { + int error; + + clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); + clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events); + clear_bit(RXRPC_CALL_EV_ABORT, &call->events); + + error = call->conn->trans->peer->net_error; + _debug("post net error %d", error); + + if (rxrpc_post_message(call, RXRPC_SKB_MARK_NET_ERROR, + error, true) < 0) + goto no_mem; + clear_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events); + goto kill_ACKs; + } + + if (test_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events)) { + ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE); + + clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events); + clear_bit(RXRPC_CALL_EV_ABORT, &call->events); + + _debug("post conn abort"); + + if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, + call->conn->error, true) < 0) + goto no_mem; + clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); + goto kill_ACKs; + } + + if (test_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) { + whdr.type = RXRPC_PACKET_TYPE_BUSY; + genbit = RXRPC_CALL_EV_REJECT_BUSY; + goto send_message; + } + + if (test_bit(RXRPC_CALL_EV_ABORT, &call->events)) { + ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE); + + if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, + ECONNABORTED, true) < 0) + goto no_mem; + whdr.type = RXRPC_PACKET_TYPE_ABORT; + data = htonl(call->local_abort); + iov[1].iov_base = &data; + iov[1].iov_len = sizeof(data); + genbit = RXRPC_CALL_EV_ABORT; + goto send_message; + } + + if (test_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events)) { + genbit = RXRPC_CALL_EV_ACK_FINAL; + + ack.bufferSpace = htons(8); + ack.maxSkew = 0; + ack.serial = 0; + ack.reason = RXRPC_ACK_IDLE; + ack.nAcks = 0; + call->ackr_reason = 0; + + spin_lock_bh(&call->lock); + ack.serial = htonl(call->ackr_serial); + ack.previousPacket = htonl(call->ackr_prev_seq); + ack.firstPacket = htonl(call->rx_data_eaten + 1); + spin_unlock_bh(&call->lock); + + pad = 0; + + iov[1].iov_base = &ack; + iov[1].iov_len = sizeof(ack); + iov[2].iov_base = &pad; + iov[2].iov_len = 3; + iov[3].iov_base = &ackinfo; + iov[3].iov_len = sizeof(ackinfo); + goto send_ACK; + } + + if (call->events & ((1 << RXRPC_CALL_EV_RCVD_BUSY) | + (1 << RXRPC_CALL_EV_RCVD_ABORT)) + ) { + u32 mark; + + if (test_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events)) + mark = RXRPC_SKB_MARK_REMOTE_ABORT; + else + mark = RXRPC_SKB_MARK_BUSY; + + _debug("post abort/busy"); + rxrpc_clear_tx_window(call); + if (rxrpc_post_message(call, mark, ECONNABORTED, true) < 0) + goto no_mem; + + clear_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events); + clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); + goto kill_ACKs; + } + + if (test_and_clear_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events)) { + _debug("do implicit ackall"); + rxrpc_clear_tx_window(call); + } + + if (test_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events)) { + write_lock_bh(&call->state_lock); + if (call->state <= RXRPC_CALL_COMPLETE) { + call->state = RXRPC_CALL_LOCALLY_ABORTED; + call->local_abort = RX_CALL_TIMEOUT; + set_bit(RXRPC_CALL_EV_ABORT, &call->events); + } + write_unlock_bh(&call->state_lock); + + _debug("post timeout"); + if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, + ETIME, true) < 0) + goto no_mem; + + clear_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events); + goto kill_ACKs; + } + + /* deal with assorted inbound messages */ + if (!skb_queue_empty(&call->rx_queue)) { + switch (rxrpc_process_rx_queue(call, &abort_code)) { + case 0: + case -EAGAIN: + break; + case -ENOMEM: + goto no_mem; + case -EKEYEXPIRED: + case -EKEYREJECTED: + case -EPROTO: + rxrpc_abort_call(call, abort_code); + goto kill_ACKs; + } + } + + /* handle resending */ + if (test_and_clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) + rxrpc_resend_timer(call); + if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events)) + rxrpc_resend(call); + + /* consider sending an ordinary ACK */ + if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) { + _debug("send ACK: window: %d - %d { %lx }", + call->rx_data_eaten, call->ackr_win_top, + call->ackr_window[0]); + + if (call->state > RXRPC_CALL_SERVER_ACK_REQUEST && + call->ackr_reason != RXRPC_ACK_PING_RESPONSE) { + /* ACK by sending reply DATA packet in this state */ + clear_bit(RXRPC_CALL_EV_ACK, &call->events); + goto maybe_reschedule; + } + + genbit = RXRPC_CALL_EV_ACK; + + acks = kzalloc(call->ackr_win_top - call->rx_data_eaten, + GFP_NOFS); + if (!acks) + goto no_mem; + + //hdr.flags = RXRPC_SLOW_START_OK; + ack.bufferSpace = htons(8); + ack.maxSkew = 0; + + spin_lock_bh(&call->lock); + ack.reason = call->ackr_reason; + ack.serial = htonl(call->ackr_serial); + ack.previousPacket = htonl(call->ackr_prev_seq); + ack.firstPacket = htonl(call->rx_data_eaten + 1); + + ack.nAcks = 0; + for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) { + nbit = loop * BITS_PER_LONG; + for (bits = call->ackr_window[loop]; bits; bits >>= 1 + ) { + _debug("- l=%d n=%d b=%lx", loop, nbit, bits); + if (bits & 1) { + acks[nbit] = RXRPC_ACK_TYPE_ACK; + ack.nAcks = nbit + 1; + } + nbit++; + } + } + call->ackr_reason = 0; + spin_unlock_bh(&call->lock); + + pad = 0; + + iov[1].iov_base = &ack; + iov[1].iov_len = sizeof(ack); + iov[2].iov_base = acks; + iov[2].iov_len = ack.nAcks; + iov[3].iov_base = &pad; + iov[3].iov_len = 3; + iov[4].iov_base = &ackinfo; + iov[4].iov_len = sizeof(ackinfo); + + switch (ack.reason) { + case RXRPC_ACK_REQUESTED: + case RXRPC_ACK_DUPLICATE: + case RXRPC_ACK_OUT_OF_SEQUENCE: + case RXRPC_ACK_EXCEEDS_WINDOW: + case RXRPC_ACK_NOSPACE: + case RXRPC_ACK_PING: + case RXRPC_ACK_PING_RESPONSE: + goto send_ACK_with_skew; + case RXRPC_ACK_DELAY: + case RXRPC_ACK_IDLE: + goto send_ACK; + } + } + + /* handle completion of security negotiations on an incoming + * connection */ + if (test_and_clear_bit(RXRPC_CALL_EV_SECURED, &call->events)) { + _debug("secured"); + spin_lock_bh(&call->lock); + + if (call->state == RXRPC_CALL_SERVER_SECURING) { + _debug("securing"); + write_lock(&call->conn->lock); + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && + !test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { + _debug("not released"); + call->state = RXRPC_CALL_SERVER_ACCEPTING; + list_move_tail(&call->accept_link, + &call->socket->acceptq); + } + write_unlock(&call->conn->lock); + read_lock(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE) + set_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events); + read_unlock(&call->state_lock); + } + + spin_unlock_bh(&call->lock); + if (!test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events)) + goto maybe_reschedule; + } + + /* post a notification of an acceptable connection to the app */ + if (test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events)) { + _debug("post accept"); + if (rxrpc_post_message(call, RXRPC_SKB_MARK_NEW_CALL, + 0, false) < 0) + goto no_mem; + clear_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events); + goto maybe_reschedule; + } + + /* handle incoming call acceptance */ + if (test_and_clear_bit(RXRPC_CALL_EV_ACCEPTED, &call->events)) { + _debug("accepted"); + ASSERTCMP(call->rx_data_post, ==, 0); + call->rx_data_post = 1; + read_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE) + set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events); + read_unlock_bh(&call->state_lock); + } + + /* drain the out of sequence received packet queue into the packet Rx + * queue */ + if (test_and_clear_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events)) { + while (call->rx_data_post == call->rx_first_oos) + if (rxrpc_drain_rx_oos_queue(call) < 0) + break; + goto maybe_reschedule; + } + + /* other events may have been raised since we started checking */ + goto maybe_reschedule; + +send_ACK_with_skew: + ack.maxSkew = htons(atomic_read(&call->conn->hi_serial) - + ntohl(ack.serial)); +send_ACK: + mtu = call->conn->trans->peer->if_mtu; + mtu -= call->conn->trans->peer->hdrsize; + ackinfo.maxMTU = htonl(mtu); + ackinfo.rwind = htonl(rxrpc_rx_window_size); + + /* permit the peer to send us jumbo packets if it wants to */ + ackinfo.rxMTU = htonl(rxrpc_rx_mtu); + ackinfo.jumbo_max = htonl(rxrpc_rx_jumbo_max); + + serial = atomic_inc_return(&call->conn->serial); + whdr.serial = htonl(serial); + _proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", + serial, + ntohs(ack.maxSkew), + ntohl(ack.firstPacket), + ntohl(ack.previousPacket), + ntohl(ack.serial), + rxrpc_acks(ack.reason), + ack.nAcks); + + del_timer_sync(&call->ack_timer); + if (ack.nAcks > 0) + set_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags); + goto send_message_2; + +send_message: + _debug("send message"); + + serial = atomic_inc_return(&call->conn->serial); + whdr.serial = htonl(serial); + _proto("Tx %s %%%u", rxrpc_pkts[whdr.type], serial); +send_message_2: + + len = iov[0].iov_len; + ioc = 1; + if (iov[4].iov_len) { + ioc = 5; + len += iov[4].iov_len; + len += iov[3].iov_len; + len += iov[2].iov_len; + len += iov[1].iov_len; + } else if (iov[3].iov_len) { + ioc = 4; + len += iov[3].iov_len; + len += iov[2].iov_len; + len += iov[1].iov_len; + } else if (iov[2].iov_len) { + ioc = 3; + len += iov[2].iov_len; + len += iov[1].iov_len; + } else if (iov[1].iov_len) { + ioc = 2; + len += iov[1].iov_len; + } + + ret = kernel_sendmsg(call->conn->trans->local->socket, + &msg, iov, ioc, len); + if (ret < 0) { + _debug("sendmsg failed: %d", ret); + read_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_DEAD) + rxrpc_queue_call(call); + read_unlock_bh(&call->state_lock); + goto error; + } + + switch (genbit) { + case RXRPC_CALL_EV_ABORT: + clear_bit(genbit, &call->events); + clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); + goto kill_ACKs; + + case RXRPC_CALL_EV_ACK_FINAL: + write_lock_bh(&call->state_lock); + if (call->state == RXRPC_CALL_CLIENT_FINAL_ACK) + call->state = RXRPC_CALL_COMPLETE; + write_unlock_bh(&call->state_lock); + goto kill_ACKs; + + default: + clear_bit(genbit, &call->events); + switch (call->state) { + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + case RXRPC_CALL_CLIENT_RECV_REPLY: + case RXRPC_CALL_SERVER_RECV_REQUEST: + case RXRPC_CALL_SERVER_ACK_REQUEST: + _debug("start ACK timer"); + rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, + call->ackr_serial, false); + default: + break; + } + goto maybe_reschedule; + } + +kill_ACKs: + del_timer_sync(&call->ack_timer); + if (test_and_clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events)) + rxrpc_put_call(call); + clear_bit(RXRPC_CALL_EV_ACK, &call->events); + +maybe_reschedule: + if (call->events || !skb_queue_empty(&call->rx_queue)) { + read_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_DEAD) + rxrpc_queue_call(call); + read_unlock_bh(&call->state_lock); + } + + /* don't leave aborted connections on the accept queue */ + if (call->state >= RXRPC_CALL_COMPLETE && + !list_empty(&call->accept_link)) { + _debug("X unlinking once-pending call %p { e=%lx f=%lx c=%x }", + call, call->events, call->flags, call->conn->cid); + + read_lock_bh(&call->state_lock); + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && + !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) + rxrpc_queue_call(call); + read_unlock_bh(&call->state_lock); + } + +error: + clear_bit(RXRPC_CALL_PROC_BUSY, &call->flags); + kfree(acks); + + /* because we don't want two CPUs both processing the work item for one + * call at the same time, we use a flag to note when it's busy; however + * this means there's a race between clearing the flag and setting the + * work pending bit and the work item being processed again */ + if (call->events && !work_pending(&call->processor)) { + _debug("jumpstart %x", call->conn->cid); + rxrpc_queue_call(call); + } + + _leave(""); + return; + +no_mem: + _debug("out of memory"); + goto maybe_reschedule; +} diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c new file mode 100644 index 000000000000..68125dc4cb7c --- /dev/null +++ b/net/rxrpc/call_object.c @@ -0,0 +1,980 @@ +/* RxRPC individual remote procedure call handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +/* + * Maximum lifetime of a call (in jiffies). + */ +unsigned int rxrpc_max_call_lifetime = 60 * HZ; + +/* + * Time till dead call expires after last use (in jiffies). + */ +unsigned int rxrpc_dead_call_expiry = 2 * HZ; + +const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { + [RXRPC_CALL_CLIENT_SEND_REQUEST] = "ClSndReq", + [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", + [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", + [RXRPC_CALL_CLIENT_FINAL_ACK] = "ClFnlACK", + [RXRPC_CALL_SERVER_SECURING] = "SvSecure", + [RXRPC_CALL_SERVER_ACCEPTING] = "SvAccept", + [RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq", + [RXRPC_CALL_SERVER_ACK_REQUEST] = "SvAckReq", + [RXRPC_CALL_SERVER_SEND_REPLY] = "SvSndRpl", + [RXRPC_CALL_SERVER_AWAIT_ACK] = "SvAwtACK", + [RXRPC_CALL_COMPLETE] = "Complete", + [RXRPC_CALL_SERVER_BUSY] = "SvBusy ", + [RXRPC_CALL_REMOTELY_ABORTED] = "RmtAbort", + [RXRPC_CALL_LOCALLY_ABORTED] = "LocAbort", + [RXRPC_CALL_NETWORK_ERROR] = "NetError", + [RXRPC_CALL_DEAD] = "Dead ", +}; + +struct kmem_cache *rxrpc_call_jar; +LIST_HEAD(rxrpc_calls); +DEFINE_RWLOCK(rxrpc_call_lock); + +static void rxrpc_destroy_call(struct work_struct *work); +static void rxrpc_call_life_expired(unsigned long _call); +static void rxrpc_dead_call_expired(unsigned long _call); +static void rxrpc_ack_time_expired(unsigned long _call); +static void rxrpc_resend_time_expired(unsigned long _call); + +static DEFINE_SPINLOCK(rxrpc_call_hash_lock); +static DEFINE_HASHTABLE(rxrpc_call_hash, 10); + +/* + * Hash function for rxrpc_call_hash + */ +static unsigned long rxrpc_call_hashfunc( + u8 in_clientflag, + u32 cid, + u32 call_id, + u32 epoch, + u16 service_id, + sa_family_t proto, + void *localptr, + unsigned int addr_size, + const u8 *peer_addr) +{ + const u16 *p; + unsigned int i; + unsigned long key; + + _enter(""); + + key = (unsigned long)localptr; + /* We just want to add up the __be32 values, so forcing the + * cast should be okay. + */ + key += epoch; + key += service_id; + key += call_id; + key += (cid & RXRPC_CIDMASK) >> RXRPC_CIDSHIFT; + key += cid & RXRPC_CHANNELMASK; + key += in_clientflag; + key += proto; + /* Step through the peer address in 16-bit portions for speed */ + for (i = 0, p = (const u16 *)peer_addr; i < addr_size >> 1; i++, p++) + key += *p; + _leave(" key = 0x%lx", key); + return key; +} + +/* + * Add a call to the hashtable + */ +static void rxrpc_call_hash_add(struct rxrpc_call *call) +{ + unsigned long key; + unsigned int addr_size = 0; + + _enter(""); + switch (call->proto) { + case AF_INET: + addr_size = sizeof(call->peer_ip.ipv4_addr); + break; + case AF_INET6: + addr_size = sizeof(call->peer_ip.ipv6_addr); + break; + default: + break; + } + key = rxrpc_call_hashfunc(call->in_clientflag, call->cid, + call->call_id, call->epoch, + call->service_id, call->proto, + call->conn->trans->local, addr_size, + call->peer_ip.ipv6_addr); + /* Store the full key in the call */ + call->hash_key = key; + spin_lock(&rxrpc_call_hash_lock); + hash_add_rcu(rxrpc_call_hash, &call->hash_node, key); + spin_unlock(&rxrpc_call_hash_lock); + _leave(""); +} + +/* + * Remove a call from the hashtable + */ +static void rxrpc_call_hash_del(struct rxrpc_call *call) +{ + _enter(""); + spin_lock(&rxrpc_call_hash_lock); + hash_del_rcu(&call->hash_node); + spin_unlock(&rxrpc_call_hash_lock); + _leave(""); +} + +/* + * Find a call in the hashtable and return it, or NULL if it + * isn't there. + */ +struct rxrpc_call *rxrpc_find_call_hash( + struct rxrpc_host_header *hdr, + void *localptr, + sa_family_t proto, + const void *peer_addr) +{ + unsigned long key; + unsigned int addr_size = 0; + struct rxrpc_call *call = NULL; + struct rxrpc_call *ret = NULL; + u8 in_clientflag = hdr->flags & RXRPC_CLIENT_INITIATED; + + _enter(""); + switch (proto) { + case AF_INET: + addr_size = sizeof(call->peer_ip.ipv4_addr); + break; + case AF_INET6: + addr_size = sizeof(call->peer_ip.ipv6_addr); + break; + default: + break; + } + + key = rxrpc_call_hashfunc(in_clientflag, hdr->cid, hdr->callNumber, + hdr->epoch, hdr->serviceId, + proto, localptr, addr_size, + peer_addr); + hash_for_each_possible_rcu(rxrpc_call_hash, call, hash_node, key) { + if (call->hash_key == key && + call->call_id == hdr->callNumber && + call->cid == hdr->cid && + call->in_clientflag == in_clientflag && + call->service_id == hdr->serviceId && + call->proto == proto && + call->local == localptr && + memcmp(call->peer_ip.ipv6_addr, peer_addr, + addr_size) == 0 && + call->epoch == hdr->epoch) { + ret = call; + break; + } + } + _leave(" = %p", ret); + return ret; +} + +/* + * find an extant server call + * - called in process context with IRQs enabled + */ +struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *rx, + unsigned long user_call_ID) +{ + struct rxrpc_call *call; + struct rb_node *p; + + _enter("%p,%lx", rx, user_call_ID); + + read_lock(&rx->call_lock); + + p = rx->calls.rb_node; + while (p) { + call = rb_entry(p, struct rxrpc_call, sock_node); + + if (user_call_ID < call->user_call_ID) + p = p->rb_left; + else if (user_call_ID > call->user_call_ID) + p = p->rb_right; + else + goto found_extant_call; + } + + read_unlock(&rx->call_lock); + _leave(" = NULL"); + return NULL; + +found_extant_call: + rxrpc_get_call(call); + read_unlock(&rx->call_lock); + _leave(" = %p [%d]", call, atomic_read(&call->usage)); + return call; +} + +/* + * allocate a new call + */ +static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) +{ + struct rxrpc_call *call; + + call = kmem_cache_zalloc(rxrpc_call_jar, gfp); + if (!call) + return NULL; + + call->acks_winsz = 16; + call->acks_window = kmalloc(call->acks_winsz * sizeof(unsigned long), + gfp); + if (!call->acks_window) { + kmem_cache_free(rxrpc_call_jar, call); + return NULL; + } + + setup_timer(&call->lifetimer, &rxrpc_call_life_expired, + (unsigned long) call); + setup_timer(&call->deadspan, &rxrpc_dead_call_expired, + (unsigned long) call); + setup_timer(&call->ack_timer, &rxrpc_ack_time_expired, + (unsigned long) call); + setup_timer(&call->resend_timer, &rxrpc_resend_time_expired, + (unsigned long) call); + INIT_WORK(&call->destroyer, &rxrpc_destroy_call); + INIT_WORK(&call->processor, &rxrpc_process_call); + INIT_LIST_HEAD(&call->accept_link); + skb_queue_head_init(&call->rx_queue); + skb_queue_head_init(&call->rx_oos_queue); + init_waitqueue_head(&call->tx_waitq); + spin_lock_init(&call->lock); + rwlock_init(&call->state_lock); + atomic_set(&call->usage, 1); + call->debug_id = atomic_inc_return(&rxrpc_debug_id); + call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; + + memset(&call->sock_node, 0xed, sizeof(call->sock_node)); + + call->rx_data_expect = 1; + call->rx_data_eaten = 0; + call->rx_first_oos = 0; + call->ackr_win_top = call->rx_data_eaten + 1 + rxrpc_rx_window_size; + call->creation_jif = jiffies; + return call; +} + +/* + * allocate a new client call and attempt to get a connection slot for it + */ +static struct rxrpc_call *rxrpc_alloc_client_call( + struct rxrpc_sock *rx, + struct rxrpc_transport *trans, + struct rxrpc_conn_bundle *bundle, + gfp_t gfp) +{ + struct rxrpc_call *call; + int ret; + + _enter(""); + + ASSERT(rx != NULL); + ASSERT(trans != NULL); + ASSERT(bundle != NULL); + + call = rxrpc_alloc_call(gfp); + if (!call) + return ERR_PTR(-ENOMEM); + + sock_hold(&rx->sk); + call->socket = rx; + call->rx_data_post = 1; + + ret = rxrpc_connect_call(rx, trans, bundle, call, gfp); + if (ret < 0) { + kmem_cache_free(rxrpc_call_jar, call); + return ERR_PTR(ret); + } + + /* Record copies of information for hashtable lookup */ + call->proto = rx->proto; + call->local = trans->local; + switch (call->proto) { + case AF_INET: + call->peer_ip.ipv4_addr = + trans->peer->srx.transport.sin.sin_addr.s_addr; + break; + case AF_INET6: + memcpy(call->peer_ip.ipv6_addr, + trans->peer->srx.transport.sin6.sin6_addr.in6_u.u6_addr8, + sizeof(call->peer_ip.ipv6_addr)); + break; + } + call->epoch = call->conn->epoch; + call->service_id = call->conn->service_id; + call->in_clientflag = call->conn->in_clientflag; + /* Add the new call to the hashtable */ + rxrpc_call_hash_add(call); + + spin_lock(&call->conn->trans->peer->lock); + list_add(&call->error_link, &call->conn->trans->peer->error_targets); + spin_unlock(&call->conn->trans->peer->lock); + + call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime; + add_timer(&call->lifetimer); + + _leave(" = %p", call); + return call; +} + +/* + * set up a call for the given data + * - called in process context with IRQs enabled + */ +struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, + struct rxrpc_transport *trans, + struct rxrpc_conn_bundle *bundle, + unsigned long user_call_ID, + gfp_t gfp) +{ + struct rxrpc_call *call, *xcall; + struct rb_node *parent, **pp; + + _enter("%p,%d,%d,%lx", + rx, trans->debug_id, bundle ? bundle->debug_id : -1, + user_call_ID); + + call = rxrpc_alloc_client_call(rx, trans, bundle, gfp); + if (IS_ERR(call)) { + _leave(" = %ld", PTR_ERR(call)); + return call; + } + + call->user_call_ID = user_call_ID; + __set_bit(RXRPC_CALL_HAS_USERID, &call->flags); + + write_lock(&rx->call_lock); + + pp = &rx->calls.rb_node; + parent = NULL; + while (*pp) { + parent = *pp; + xcall = rb_entry(parent, struct rxrpc_call, sock_node); + + if (user_call_ID < xcall->user_call_ID) + pp = &(*pp)->rb_left; + else if (user_call_ID > xcall->user_call_ID) + pp = &(*pp)->rb_right; + else + goto found_user_ID_now_present; + } + + rxrpc_get_call(call); + + rb_link_node(&call->sock_node, parent, pp); + rb_insert_color(&call->sock_node, &rx->calls); + write_unlock(&rx->call_lock); + + write_lock_bh(&rxrpc_call_lock); + list_add_tail(&call->link, &rxrpc_calls); + write_unlock_bh(&rxrpc_call_lock); + + _net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id); + + _leave(" = %p [new]", call); + return call; + + /* We unexpectedly found the user ID in the list after taking + * the call_lock. This shouldn't happen unless the user races + * with itself and tries to add the same user ID twice at the + * same time in different threads. + */ +found_user_ID_now_present: + write_unlock(&rx->call_lock); + rxrpc_put_call(call); + _leave(" = -EEXIST [%p]", call); + return ERR_PTR(-EEXIST); +} + +/* + * set up an incoming call + * - called in process context with IRQs enabled + */ +struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, + struct rxrpc_connection *conn, + struct rxrpc_host_header *hdr) +{ + struct rxrpc_call *call, *candidate; + struct rb_node **p, *parent; + u32 call_id; + + _enter(",%d", conn->debug_id); + + ASSERT(rx != NULL); + + candidate = rxrpc_alloc_call(GFP_NOIO); + if (!candidate) + return ERR_PTR(-EBUSY); + + candidate->socket = rx; + candidate->conn = conn; + candidate->cid = hdr->cid; + candidate->call_id = hdr->callNumber; + candidate->channel = hdr->cid & RXRPC_CHANNELMASK; + candidate->rx_data_post = 0; + candidate->state = RXRPC_CALL_SERVER_ACCEPTING; + if (conn->security_ix > 0) + candidate->state = RXRPC_CALL_SERVER_SECURING; + + write_lock_bh(&conn->lock); + + /* set the channel for this call */ + call = conn->channels[candidate->channel]; + _debug("channel[%u] is %p", candidate->channel, call); + if (call && call->call_id == hdr->callNumber) { + /* already set; must've been a duplicate packet */ + _debug("extant call [%d]", call->state); + ASSERTCMP(call->conn, ==, conn); + + read_lock(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_LOCALLY_ABORTED: + if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) + rxrpc_queue_call(call); + case RXRPC_CALL_REMOTELY_ABORTED: + read_unlock(&call->state_lock); + goto aborted_call; + default: + rxrpc_get_call(call); + read_unlock(&call->state_lock); + goto extant_call; + } + } + + if (call) { + /* it seems the channel is still in use from the previous call + * - ditch the old binding if its call is now complete */ + _debug("CALL: %u { %s }", + call->debug_id, rxrpc_call_states[call->state]); + + if (call->state >= RXRPC_CALL_COMPLETE) { + conn->channels[call->channel] = NULL; + } else { + write_unlock_bh(&conn->lock); + kmem_cache_free(rxrpc_call_jar, candidate); + _leave(" = -EBUSY"); + return ERR_PTR(-EBUSY); + } + } + + /* check the call number isn't duplicate */ + _debug("check dup"); + call_id = hdr->callNumber; + p = &conn->calls.rb_node; + parent = NULL; + while (*p) { + parent = *p; + call = rb_entry(parent, struct rxrpc_call, conn_node); + + /* The tree is sorted in order of the __be32 value without + * turning it into host order. + */ + if (call_id < call->call_id) + p = &(*p)->rb_left; + else if (call_id > call->call_id) + p = &(*p)->rb_right; + else + goto old_call; + } + + /* make the call available */ + _debug("new call"); + call = candidate; + candidate = NULL; + rb_link_node(&call->conn_node, parent, p); + rb_insert_color(&call->conn_node, &conn->calls); + conn->channels[call->channel] = call; + sock_hold(&rx->sk); + atomic_inc(&conn->usage); + write_unlock_bh(&conn->lock); + + spin_lock(&conn->trans->peer->lock); + list_add(&call->error_link, &conn->trans->peer->error_targets); + spin_unlock(&conn->trans->peer->lock); + + write_lock_bh(&rxrpc_call_lock); + list_add_tail(&call->link, &rxrpc_calls); + write_unlock_bh(&rxrpc_call_lock); + + /* Record copies of information for hashtable lookup */ + call->proto = rx->proto; + call->local = conn->trans->local; + switch (call->proto) { + case AF_INET: + call->peer_ip.ipv4_addr = + conn->trans->peer->srx.transport.sin.sin_addr.s_addr; + break; + case AF_INET6: + memcpy(call->peer_ip.ipv6_addr, + conn->trans->peer->srx.transport.sin6.sin6_addr.in6_u.u6_addr8, + sizeof(call->peer_ip.ipv6_addr)); + break; + default: + break; + } + call->epoch = conn->epoch; + call->service_id = conn->service_id; + call->in_clientflag = conn->in_clientflag; + /* Add the new call to the hashtable */ + rxrpc_call_hash_add(call); + + _net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id); + + call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime; + add_timer(&call->lifetimer); + _leave(" = %p {%d} [new]", call, call->debug_id); + return call; + +extant_call: + write_unlock_bh(&conn->lock); + kmem_cache_free(rxrpc_call_jar, candidate); + _leave(" = %p {%d} [extant]", call, call ? call->debug_id : -1); + return call; + +aborted_call: + write_unlock_bh(&conn->lock); + kmem_cache_free(rxrpc_call_jar, candidate); + _leave(" = -ECONNABORTED"); + return ERR_PTR(-ECONNABORTED); + +old_call: + write_unlock_bh(&conn->lock); + kmem_cache_free(rxrpc_call_jar, candidate); + _leave(" = -ECONNRESET [old]"); + return ERR_PTR(-ECONNRESET); +} + +/* + * detach a call from a socket and set up for release + */ +void rxrpc_release_call(struct rxrpc_call *call) +{ + struct rxrpc_connection *conn = call->conn; + struct rxrpc_sock *rx = call->socket; + + _enter("{%d,%d,%d,%d}", + call->debug_id, atomic_read(&call->usage), + atomic_read(&call->ackr_not_idle), + call->rx_first_oos); + + spin_lock_bh(&call->lock); + if (test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags)) + BUG(); + spin_unlock_bh(&call->lock); + + /* dissociate from the socket + * - the socket's ref on the call is passed to the death timer + */ + _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn); + + write_lock_bh(&rx->call_lock); + if (!list_empty(&call->accept_link)) { + _debug("unlinking once-pending call %p { e=%lx f=%lx }", + call, call->events, call->flags); + ASSERT(!test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); + list_del_init(&call->accept_link); + sk_acceptq_removed(&rx->sk); + } else if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { + rb_erase(&call->sock_node, &rx->calls); + memset(&call->sock_node, 0xdd, sizeof(call->sock_node)); + clear_bit(RXRPC_CALL_HAS_USERID, &call->flags); + } + write_unlock_bh(&rx->call_lock); + + /* free up the channel for reuse */ + spin_lock(&conn->trans->client_lock); + write_lock_bh(&conn->lock); + write_lock(&call->state_lock); + + if (conn->channels[call->channel] == call) + conn->channels[call->channel] = NULL; + + if (conn->out_clientflag && conn->bundle) { + conn->avail_calls++; + switch (conn->avail_calls) { + case 1: + list_move_tail(&conn->bundle_link, + &conn->bundle->avail_conns); + case 2 ... RXRPC_MAXCALLS - 1: + ASSERT(conn->channels[0] == NULL || + conn->channels[1] == NULL || + conn->channels[2] == NULL || + conn->channels[3] == NULL); + break; + case RXRPC_MAXCALLS: + list_move_tail(&conn->bundle_link, + &conn->bundle->unused_conns); + ASSERT(conn->channels[0] == NULL && + conn->channels[1] == NULL && + conn->channels[2] == NULL && + conn->channels[3] == NULL); + break; + default: + pr_err("conn->avail_calls=%d\n", conn->avail_calls); + BUG(); + } + } + + spin_unlock(&conn->trans->client_lock); + + if (call->state < RXRPC_CALL_COMPLETE && + call->state != RXRPC_CALL_CLIENT_FINAL_ACK) { + _debug("+++ ABORTING STATE %d +++\n", call->state); + call->state = RXRPC_CALL_LOCALLY_ABORTED; + call->local_abort = RX_CALL_DEAD; + set_bit(RXRPC_CALL_EV_ABORT, &call->events); + rxrpc_queue_call(call); + } + write_unlock(&call->state_lock); + write_unlock_bh(&conn->lock); + + /* clean up the Rx queue */ + if (!skb_queue_empty(&call->rx_queue) || + !skb_queue_empty(&call->rx_oos_queue)) { + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + + _debug("purge Rx queues"); + + spin_lock_bh(&call->lock); + while ((skb = skb_dequeue(&call->rx_queue)) || + (skb = skb_dequeue(&call->rx_oos_queue))) { + sp = rxrpc_skb(skb); + if (sp->call) { + ASSERTCMP(sp->call, ==, call); + rxrpc_put_call(call); + sp->call = NULL; + } + skb->destructor = NULL; + spin_unlock_bh(&call->lock); + + _debug("- zap %s %%%u #%u", + rxrpc_pkts[sp->hdr.type], + sp->hdr.serial, sp->hdr.seq); + rxrpc_free_skb(skb); + spin_lock_bh(&call->lock); + } + spin_unlock_bh(&call->lock); + + ASSERTCMP(call->state, !=, RXRPC_CALL_COMPLETE); + } + + del_timer_sync(&call->resend_timer); + del_timer_sync(&call->ack_timer); + del_timer_sync(&call->lifetimer); + call->deadspan.expires = jiffies + rxrpc_dead_call_expiry; + add_timer(&call->deadspan); + + _leave(""); +} + +/* + * handle a dead call being ready for reaping + */ +static void rxrpc_dead_call_expired(unsigned long _call) +{ + struct rxrpc_call *call = (struct rxrpc_call *) _call; + + _enter("{%d}", call->debug_id); + + write_lock_bh(&call->state_lock); + call->state = RXRPC_CALL_DEAD; + write_unlock_bh(&call->state_lock); + rxrpc_put_call(call); +} + +/* + * mark a call as to be released, aborting it if it's still in progress + * - called with softirqs disabled + */ +static void rxrpc_mark_call_released(struct rxrpc_call *call) +{ + bool sched; + + write_lock(&call->state_lock); + if (call->state < RXRPC_CALL_DEAD) { + sched = false; + if (call->state < RXRPC_CALL_COMPLETE) { + _debug("abort call %p", call); + call->state = RXRPC_CALL_LOCALLY_ABORTED; + call->local_abort = RX_CALL_DEAD; + if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) + sched = true; + } + if (!test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) + sched = true; + if (sched) + rxrpc_queue_call(call); + } + write_unlock(&call->state_lock); +} + +/* + * release all the calls associated with a socket + */ +void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) +{ + struct rxrpc_call *call; + struct rb_node *p; + + _enter("%p", rx); + + read_lock_bh(&rx->call_lock); + + /* mark all the calls as no longer wanting incoming packets */ + for (p = rb_first(&rx->calls); p; p = rb_next(p)) { + call = rb_entry(p, struct rxrpc_call, sock_node); + rxrpc_mark_call_released(call); + } + + /* kill the not-yet-accepted incoming calls */ + list_for_each_entry(call, &rx->secureq, accept_link) { + rxrpc_mark_call_released(call); + } + + list_for_each_entry(call, &rx->acceptq, accept_link) { + rxrpc_mark_call_released(call); + } + + read_unlock_bh(&rx->call_lock); + _leave(""); +} + +/* + * release a call + */ +void __rxrpc_put_call(struct rxrpc_call *call) +{ + ASSERT(call != NULL); + + _enter("%p{u=%d}", call, atomic_read(&call->usage)); + + ASSERTCMP(atomic_read(&call->usage), >, 0); + + if (atomic_dec_and_test(&call->usage)) { + _debug("call %d dead", call->debug_id); + ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); + rxrpc_queue_work(&call->destroyer); + } + _leave(""); +} + +/* + * clean up a call + */ +static void rxrpc_cleanup_call(struct rxrpc_call *call) +{ + _net("DESTROY CALL %d", call->debug_id); + + ASSERT(call->socket); + + memset(&call->sock_node, 0xcd, sizeof(call->sock_node)); + + del_timer_sync(&call->lifetimer); + del_timer_sync(&call->deadspan); + del_timer_sync(&call->ack_timer); + del_timer_sync(&call->resend_timer); + + ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags)); + ASSERTCMP(call->events, ==, 0); + if (work_pending(&call->processor)) { + _debug("defer destroy"); + rxrpc_queue_work(&call->destroyer); + return; + } + + if (call->conn) { + spin_lock(&call->conn->trans->peer->lock); + list_del(&call->error_link); + spin_unlock(&call->conn->trans->peer->lock); + + write_lock_bh(&call->conn->lock); + rb_erase(&call->conn_node, &call->conn->calls); + write_unlock_bh(&call->conn->lock); + rxrpc_put_connection(call->conn); + } + + /* Remove the call from the hash */ + rxrpc_call_hash_del(call); + + if (call->acks_window) { + _debug("kill Tx window %d", + CIRC_CNT(call->acks_head, call->acks_tail, + call->acks_winsz)); + smp_mb(); + while (CIRC_CNT(call->acks_head, call->acks_tail, + call->acks_winsz) > 0) { + struct rxrpc_skb_priv *sp; + unsigned long _skb; + + _skb = call->acks_window[call->acks_tail] & ~1; + sp = rxrpc_skb((struct sk_buff *)_skb); + _debug("+++ clear Tx %u", sp->hdr.seq); + rxrpc_free_skb((struct sk_buff *)_skb); + call->acks_tail = + (call->acks_tail + 1) & (call->acks_winsz - 1); + } + + kfree(call->acks_window); + } + + rxrpc_free_skb(call->tx_pending); + + rxrpc_purge_queue(&call->rx_queue); + ASSERT(skb_queue_empty(&call->rx_oos_queue)); + sock_put(&call->socket->sk); + kmem_cache_free(rxrpc_call_jar, call); +} + +/* + * destroy a call + */ +static void rxrpc_destroy_call(struct work_struct *work) +{ + struct rxrpc_call *call = + container_of(work, struct rxrpc_call, destroyer); + + _enter("%p{%d,%d,%p}", + call, atomic_read(&call->usage), call->channel, call->conn); + + ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); + + write_lock_bh(&rxrpc_call_lock); + list_del_init(&call->link); + write_unlock_bh(&rxrpc_call_lock); + + rxrpc_cleanup_call(call); + _leave(""); +} + +/* + * preemptively destroy all the call records from a transport endpoint rather + * than waiting for them to time out + */ +void __exit rxrpc_destroy_all_calls(void) +{ + struct rxrpc_call *call; + + _enter(""); + write_lock_bh(&rxrpc_call_lock); + + while (!list_empty(&rxrpc_calls)) { + call = list_entry(rxrpc_calls.next, struct rxrpc_call, link); + _debug("Zapping call %p", call); + + list_del_init(&call->link); + + switch (atomic_read(&call->usage)) { + case 0: + ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); + break; + case 1: + if (del_timer_sync(&call->deadspan) != 0 && + call->state != RXRPC_CALL_DEAD) + rxrpc_dead_call_expired((unsigned long) call); + if (call->state != RXRPC_CALL_DEAD) + break; + default: + pr_err("Call %p still in use (%d,%d,%s,%lx,%lx)!\n", + call, atomic_read(&call->usage), + atomic_read(&call->ackr_not_idle), + rxrpc_call_states[call->state], + call->flags, call->events); + if (!skb_queue_empty(&call->rx_queue)) + pr_err("Rx queue occupied\n"); + if (!skb_queue_empty(&call->rx_oos_queue)) + pr_err("OOS queue occupied\n"); + break; + } + + write_unlock_bh(&rxrpc_call_lock); + cond_resched(); + write_lock_bh(&rxrpc_call_lock); + } + + write_unlock_bh(&rxrpc_call_lock); + _leave(""); +} + +/* + * handle call lifetime being exceeded + */ +static void rxrpc_call_life_expired(unsigned long _call) +{ + struct rxrpc_call *call = (struct rxrpc_call *) _call; + + if (call->state >= RXRPC_CALL_COMPLETE) + return; + + _enter("{%d}", call->debug_id); + read_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE) { + set_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events); + rxrpc_queue_call(call); + } + read_unlock_bh(&call->state_lock); +} + +/* + * handle resend timer expiry + * - may not take call->state_lock as this can deadlock against del_timer_sync() + */ +static void rxrpc_resend_time_expired(unsigned long _call) +{ + struct rxrpc_call *call = (struct rxrpc_call *) _call; + + _enter("{%d}", call->debug_id); + + if (call->state >= RXRPC_CALL_COMPLETE) + return; + + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + if (!test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) + rxrpc_queue_call(call); +} + +/* + * handle ACK timer expiry + */ +static void rxrpc_ack_time_expired(unsigned long _call) +{ + struct rxrpc_call *call = (struct rxrpc_call *) _call; + + _enter("{%d}", call->debug_id); + + if (call->state >= RXRPC_CALL_COMPLETE) + return; + + read_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE && + !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events)) + rxrpc_queue_call(call); + read_unlock_bh(&call->state_lock); +} diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c new file mode 100644 index 000000000000..8bdd692d4862 --- /dev/null +++ b/net/rxrpc/conn_event.c @@ -0,0 +1,403 @@ +/* connection-level event handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +/* + * pass a connection-level abort onto all calls on that connection + */ +static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state, + u32 abort_code) +{ + struct rxrpc_call *call; + struct rb_node *p; + + _enter("{%d},%x", conn->debug_id, abort_code); + + read_lock_bh(&conn->lock); + + for (p = rb_first(&conn->calls); p; p = rb_next(p)) { + call = rb_entry(p, struct rxrpc_call, conn_node); + write_lock(&call->state_lock); + if (call->state <= RXRPC_CALL_COMPLETE) { + call->state = state; + if (state == RXRPC_CALL_LOCALLY_ABORTED) { + call->local_abort = conn->local_abort; + set_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); + } else { + call->remote_abort = conn->remote_abort; + set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); + } + rxrpc_queue_call(call); + } + write_unlock(&call->state_lock); + } + + read_unlock_bh(&conn->lock); + _leave(""); +} + +/* + * generate a connection-level abort + */ +static int rxrpc_abort_connection(struct rxrpc_connection *conn, + u32 error, u32 abort_code) +{ + struct rxrpc_wire_header whdr; + struct msghdr msg; + struct kvec iov[2]; + __be32 word; + size_t len; + u32 serial; + int ret; + + _enter("%d,,%u,%u", conn->debug_id, error, abort_code); + + /* generate a connection-level abort */ + spin_lock_bh(&conn->state_lock); + if (conn->state < RXRPC_CONN_REMOTELY_ABORTED) { + conn->state = RXRPC_CONN_LOCALLY_ABORTED; + conn->error = error; + spin_unlock_bh(&conn->state_lock); + } else { + spin_unlock_bh(&conn->state_lock); + _leave(" = 0 [already dead]"); + return 0; + } + + rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, abort_code); + + msg.msg_name = &conn->trans->peer->srx.transport; + msg.msg_namelen = conn->trans->peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + whdr.epoch = htonl(conn->epoch); + whdr.cid = htonl(conn->cid); + whdr.callNumber = 0; + whdr.seq = 0; + whdr.type = RXRPC_PACKET_TYPE_ABORT; + whdr.flags = conn->out_clientflag; + whdr.userStatus = 0; + whdr.securityIndex = conn->security_ix; + whdr._rsvd = 0; + whdr.serviceId = htons(conn->service_id); + + word = htonl(conn->local_abort); + + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); + iov[1].iov_base = &word; + iov[1].iov_len = sizeof(word); + + len = iov[0].iov_len + iov[1].iov_len; + + serial = atomic_inc_return(&conn->serial); + whdr.serial = htonl(serial); + _proto("Tx CONN ABORT %%%u { %d }", serial, conn->local_abort); + + ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len); + if (ret < 0) { + _debug("sendmsg failed: %d", ret); + return -EAGAIN; + } + + _leave(" = 0"); + return 0; +} + +/* + * mark a call as being on a now-secured channel + * - must be called with softirqs disabled + */ +static void rxrpc_call_is_secure(struct rxrpc_call *call) +{ + _enter("%p", call); + if (call) { + read_lock(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE && + !test_and_set_bit(RXRPC_CALL_EV_SECURED, &call->events)) + rxrpc_queue_call(call); + read_unlock(&call->state_lock); + } +} + +/* + * connection-level Rx packet processor + */ +static int rxrpc_process_event(struct rxrpc_connection *conn, + struct sk_buff *skb, + u32 *_abort_code) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + __be32 wtmp; + u32 abort_code; + int loop, ret; + + if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { + kleave(" = -ECONNABORTED [%u]", conn->state); + return -ECONNABORTED; + } + + _enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, sp->hdr.serial); + + switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_ABORT: + if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0) + return -EPROTO; + abort_code = ntohl(wtmp); + _proto("Rx ABORT %%%u { ac=%d }", sp->hdr.serial, abort_code); + + conn->state = RXRPC_CONN_REMOTELY_ABORTED; + rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, + abort_code); + return -ECONNABORTED; + + case RXRPC_PACKET_TYPE_CHALLENGE: + return conn->security->respond_to_challenge(conn, skb, + _abort_code); + + case RXRPC_PACKET_TYPE_RESPONSE: + ret = conn->security->verify_response(conn, skb, _abort_code); + if (ret < 0) + return ret; + + ret = conn->security->init_connection_security(conn); + if (ret < 0) + return ret; + + conn->security->prime_packet_security(conn); + read_lock_bh(&conn->lock); + spin_lock(&conn->state_lock); + + if (conn->state == RXRPC_CONN_SERVER_CHALLENGING) { + conn->state = RXRPC_CONN_SERVER; + for (loop = 0; loop < RXRPC_MAXCALLS; loop++) + rxrpc_call_is_secure(conn->channels[loop]); + } + + spin_unlock(&conn->state_lock); + read_unlock_bh(&conn->lock); + return 0; + + default: + _leave(" = -EPROTO [%u]", sp->hdr.type); + return -EPROTO; + } +} + +/* + * set up security and issue a challenge + */ +static void rxrpc_secure_connection(struct rxrpc_connection *conn) +{ + u32 abort_code; + int ret; + + _enter("{%d}", conn->debug_id); + + ASSERT(conn->security_ix != 0); + + if (!conn->key) { + _debug("set up security"); + ret = rxrpc_init_server_conn_security(conn); + switch (ret) { + case 0: + break; + case -ENOENT: + abort_code = RX_CALL_DEAD; + goto abort; + default: + abort_code = RXKADNOAUTH; + goto abort; + } + } + + if (conn->security->issue_challenge(conn) < 0) { + abort_code = RX_CALL_DEAD; + ret = -ENOMEM; + goto abort; + } + + _leave(""); + return; + +abort: + _debug("abort %d, %d", ret, abort_code); + rxrpc_abort_connection(conn, -ret, abort_code); + _leave(" [aborted]"); +} + +/* + * connection-level event processor + */ +void rxrpc_process_connection(struct work_struct *work) +{ + struct rxrpc_connection *conn = + container_of(work, struct rxrpc_connection, processor); + struct sk_buff *skb; + u32 abort_code = RX_PROTOCOL_ERROR; + int ret; + + _enter("{%d}", conn->debug_id); + + atomic_inc(&conn->usage); + + if (test_and_clear_bit(RXRPC_CONN_CHALLENGE, &conn->events)) { + rxrpc_secure_connection(conn); + rxrpc_put_connection(conn); + } + + /* go through the conn-level event packets, releasing the ref on this + * connection that each one has when we've finished with it */ + while ((skb = skb_dequeue(&conn->rx_queue))) { + ret = rxrpc_process_event(conn, skb, &abort_code); + switch (ret) { + case -EPROTO: + case -EKEYEXPIRED: + case -EKEYREJECTED: + goto protocol_error; + case -EAGAIN: + goto requeue_and_leave; + case -ECONNABORTED: + default: + rxrpc_put_connection(conn); + rxrpc_free_skb(skb); + break; + } + } + +out: + rxrpc_put_connection(conn); + _leave(""); + return; + +requeue_and_leave: + skb_queue_head(&conn->rx_queue, skb); + goto out; + +protocol_error: + if (rxrpc_abort_connection(conn, -ret, abort_code) < 0) + goto requeue_and_leave; + rxrpc_put_connection(conn); + rxrpc_free_skb(skb); + _leave(" [EPROTO]"); + goto out; +} + +/* + * put a packet up for transport-level abort + */ +void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) +{ + CHECK_SLAB_OKAY(&local->usage); + + if (!atomic_inc_not_zero(&local->usage)) { + printk("resurrected on reject\n"); + BUG(); + } + + skb_queue_tail(&local->reject_queue, skb); + rxrpc_queue_work(&local->rejecter); +} + +/* + * reject packets through the local endpoint + */ +void rxrpc_reject_packets(struct work_struct *work) +{ + union { + struct sockaddr sa; + struct sockaddr_in sin; + } sa; + struct rxrpc_skb_priv *sp; + struct rxrpc_wire_header whdr; + struct rxrpc_local *local; + struct sk_buff *skb; + struct msghdr msg; + struct kvec iov[2]; + size_t size; + __be32 code; + + local = container_of(work, struct rxrpc_local, rejecter); + rxrpc_get_local(local); + + _enter("%d", local->debug_id); + + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); + iov[1].iov_base = &code; + iov[1].iov_len = sizeof(code); + size = sizeof(whdr) + sizeof(code); + + msg.msg_name = &sa; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + memset(&sa, 0, sizeof(sa)); + sa.sa.sa_family = local->srx.transport.family; + switch (sa.sa.sa_family) { + case AF_INET: + msg.msg_namelen = sizeof(sa.sin); + break; + default: + msg.msg_namelen = 0; + break; + } + + memset(&whdr, 0, sizeof(whdr)); + whdr.type = RXRPC_PACKET_TYPE_ABORT; + + while ((skb = skb_dequeue(&local->reject_queue))) { + sp = rxrpc_skb(skb); + switch (sa.sa.sa_family) { + case AF_INET: + sa.sin.sin_port = udp_hdr(skb)->source; + sa.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; + code = htonl(skb->priority); + + whdr.epoch = htonl(sp->hdr.epoch); + whdr.cid = htonl(sp->hdr.cid); + whdr.callNumber = htonl(sp->hdr.callNumber); + whdr.serviceId = htons(sp->hdr.serviceId); + whdr.flags = sp->hdr.flags; + whdr.flags ^= RXRPC_CLIENT_INITIATED; + whdr.flags &= RXRPC_CLIENT_INITIATED; + + kernel_sendmsg(local->socket, &msg, iov, 2, size); + break; + + default: + break; + } + + rxrpc_free_skb(skb); + rxrpc_put_local(local); + } + + rxrpc_put_local(local); + _leave(""); +} diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c new file mode 100644 index 000000000000..8ecde4b77b55 --- /dev/null +++ b/net/rxrpc/conn_object.c @@ -0,0 +1,912 @@ +/* RxRPC virtual connection handler + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +/* + * Time till a connection expires after last use (in seconds). + */ +unsigned int rxrpc_connection_expiry = 10 * 60; + +static void rxrpc_connection_reaper(struct work_struct *work); + +LIST_HEAD(rxrpc_connections); +DEFINE_RWLOCK(rxrpc_connection_lock); +static DECLARE_DELAYED_WORK(rxrpc_connection_reap, rxrpc_connection_reaper); + +/* + * allocate a new client connection bundle + */ +static struct rxrpc_conn_bundle *rxrpc_alloc_bundle(gfp_t gfp) +{ + struct rxrpc_conn_bundle *bundle; + + _enter(""); + + bundle = kzalloc(sizeof(struct rxrpc_conn_bundle), gfp); + if (bundle) { + INIT_LIST_HEAD(&bundle->unused_conns); + INIT_LIST_HEAD(&bundle->avail_conns); + INIT_LIST_HEAD(&bundle->busy_conns); + init_waitqueue_head(&bundle->chanwait); + atomic_set(&bundle->usage, 1); + } + + _leave(" = %p", bundle); + return bundle; +} + +/* + * compare bundle parameters with what we're looking for + * - return -ve, 0 or +ve + */ +static inline +int rxrpc_cmp_bundle(const struct rxrpc_conn_bundle *bundle, + struct key *key, u16 service_id) +{ + return (bundle->service_id - service_id) ?: + ((unsigned long)bundle->key - (unsigned long)key); +} + +/* + * get bundle of client connections that a client socket can make use of + */ +struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *rx, + struct rxrpc_transport *trans, + struct key *key, + u16 service_id, + gfp_t gfp) +{ + struct rxrpc_conn_bundle *bundle, *candidate; + struct rb_node *p, *parent, **pp; + + _enter("%p{%x},%x,%hx,", + rx, key_serial(key), trans->debug_id, service_id); + + /* search the extant bundles first for one that matches the specified + * user ID */ + spin_lock(&trans->client_lock); + + p = trans->bundles.rb_node; + while (p) { + bundle = rb_entry(p, struct rxrpc_conn_bundle, node); + + if (rxrpc_cmp_bundle(bundle, key, service_id) < 0) + p = p->rb_left; + else if (rxrpc_cmp_bundle(bundle, key, service_id) > 0) + p = p->rb_right; + else + goto found_extant_bundle; + } + + spin_unlock(&trans->client_lock); + + /* not yet present - create a candidate for a new record and then + * redo the search */ + candidate = rxrpc_alloc_bundle(gfp); + if (!candidate) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + candidate->key = key_get(key); + candidate->service_id = service_id; + + spin_lock(&trans->client_lock); + + pp = &trans->bundles.rb_node; + parent = NULL; + while (*pp) { + parent = *pp; + bundle = rb_entry(parent, struct rxrpc_conn_bundle, node); + + if (rxrpc_cmp_bundle(bundle, key, service_id) < 0) + pp = &(*pp)->rb_left; + else if (rxrpc_cmp_bundle(bundle, key, service_id) > 0) + pp = &(*pp)->rb_right; + else + goto found_extant_second; + } + + /* second search also failed; add the new bundle */ + bundle = candidate; + candidate = NULL; + + rb_link_node(&bundle->node, parent, pp); + rb_insert_color(&bundle->node, &trans->bundles); + spin_unlock(&trans->client_lock); + _net("BUNDLE new on trans %d", trans->debug_id); + _leave(" = %p [new]", bundle); + return bundle; + + /* we found the bundle in the list immediately */ +found_extant_bundle: + atomic_inc(&bundle->usage); + spin_unlock(&trans->client_lock); + _net("BUNDLE old on trans %d", trans->debug_id); + _leave(" = %p [extant %d]", bundle, atomic_read(&bundle->usage)); + return bundle; + + /* we found the bundle on the second time through the list */ +found_extant_second: + atomic_inc(&bundle->usage); + spin_unlock(&trans->client_lock); + kfree(candidate); + _net("BUNDLE old2 on trans %d", trans->debug_id); + _leave(" = %p [second %d]", bundle, atomic_read(&bundle->usage)); + return bundle; +} + +/* + * release a bundle + */ +void rxrpc_put_bundle(struct rxrpc_transport *trans, + struct rxrpc_conn_bundle *bundle) +{ + _enter("%p,%p{%d}",trans, bundle, atomic_read(&bundle->usage)); + + if (atomic_dec_and_lock(&bundle->usage, &trans->client_lock)) { + _debug("Destroy bundle"); + rb_erase(&bundle->node, &trans->bundles); + spin_unlock(&trans->client_lock); + ASSERT(list_empty(&bundle->unused_conns)); + ASSERT(list_empty(&bundle->avail_conns)); + ASSERT(list_empty(&bundle->busy_conns)); + ASSERTCMP(bundle->num_conns, ==, 0); + key_put(bundle->key); + kfree(bundle); + } + + _leave(""); +} + +/* + * allocate a new connection + */ +static struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) +{ + struct rxrpc_connection *conn; + + _enter(""); + + conn = kzalloc(sizeof(struct rxrpc_connection), gfp); + if (conn) { + INIT_WORK(&conn->processor, &rxrpc_process_connection); + INIT_LIST_HEAD(&conn->bundle_link); + conn->calls = RB_ROOT; + skb_queue_head_init(&conn->rx_queue); + conn->security = &rxrpc_no_security; + rwlock_init(&conn->lock); + spin_lock_init(&conn->state_lock); + atomic_set(&conn->usage, 1); + conn->debug_id = atomic_inc_return(&rxrpc_debug_id); + conn->avail_calls = RXRPC_MAXCALLS; + conn->size_align = 4; + conn->header_size = sizeof(struct rxrpc_wire_header); + } + + _leave(" = %p{%d}", conn, conn ? conn->debug_id : 0); + return conn; +} + +/* + * assign a connection ID to a connection and add it to the transport's + * connection lookup tree + * - called with transport client lock held + */ +static void rxrpc_assign_connection_id(struct rxrpc_connection *conn) +{ + struct rxrpc_connection *xconn; + struct rb_node *parent, **p; + __be32 epoch; + u32 cid; + + _enter(""); + + epoch = conn->epoch; + + write_lock_bh(&conn->trans->conn_lock); + + conn->trans->conn_idcounter += RXRPC_CID_INC; + if (conn->trans->conn_idcounter < RXRPC_CID_INC) + conn->trans->conn_idcounter = RXRPC_CID_INC; + cid = conn->trans->conn_idcounter; + +attempt_insertion: + parent = NULL; + p = &conn->trans->client_conns.rb_node; + + while (*p) { + parent = *p; + xconn = rb_entry(parent, struct rxrpc_connection, node); + + if (epoch < xconn->epoch) + p = &(*p)->rb_left; + else if (epoch > xconn->epoch) + p = &(*p)->rb_right; + else if (cid < xconn->cid) + p = &(*p)->rb_left; + else if (cid > xconn->cid) + p = &(*p)->rb_right; + else + goto id_exists; + } + + /* we've found a suitable hole - arrange for this connection to occupy + * it */ + rb_link_node(&conn->node, parent, p); + rb_insert_color(&conn->node, &conn->trans->client_conns); + + conn->cid = cid; + write_unlock_bh(&conn->trans->conn_lock); + _leave(" [CID %x]", cid); + return; + + /* we found a connection with the proposed ID - walk the tree from that + * point looking for the next unused ID */ +id_exists: + for (;;) { + cid += RXRPC_CID_INC; + if (cid < RXRPC_CID_INC) { + cid = RXRPC_CID_INC; + conn->trans->conn_idcounter = cid; + goto attempt_insertion; + } + + parent = rb_next(parent); + if (!parent) + goto attempt_insertion; + + xconn = rb_entry(parent, struct rxrpc_connection, node); + if (epoch < xconn->epoch || + cid < xconn->cid) + goto attempt_insertion; + } +} + +/* + * add a call to a connection's call-by-ID tree + */ +static void rxrpc_add_call_ID_to_conn(struct rxrpc_connection *conn, + struct rxrpc_call *call) +{ + struct rxrpc_call *xcall; + struct rb_node *parent, **p; + __be32 call_id; + + write_lock_bh(&conn->lock); + + call_id = call->call_id; + p = &conn->calls.rb_node; + parent = NULL; + while (*p) { + parent = *p; + xcall = rb_entry(parent, struct rxrpc_call, conn_node); + + if (call_id < xcall->call_id) + p = &(*p)->rb_left; + else if (call_id > xcall->call_id) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&call->conn_node, parent, p); + rb_insert_color(&call->conn_node, &conn->calls); + + write_unlock_bh(&conn->lock); +} + +/* + * connect a call on an exclusive connection + */ +static int rxrpc_connect_exclusive(struct rxrpc_sock *rx, + struct rxrpc_transport *trans, + u16 service_id, + struct rxrpc_call *call, + gfp_t gfp) +{ + struct rxrpc_connection *conn; + int chan, ret; + + _enter(""); + + conn = rx->conn; + if (!conn) { + /* not yet present - create a candidate for a new connection + * and then redo the check */ + conn = rxrpc_alloc_connection(gfp); + if (!conn) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + conn->trans = trans; + conn->bundle = NULL; + conn->service_id = service_id; + conn->epoch = rxrpc_epoch; + conn->in_clientflag = 0; + conn->out_clientflag = RXRPC_CLIENT_INITIATED; + conn->cid = 0; + conn->state = RXRPC_CONN_CLIENT; + conn->avail_calls = RXRPC_MAXCALLS - 1; + conn->security_level = rx->min_sec_level; + conn->key = key_get(rx->key); + + ret = rxrpc_init_client_conn_security(conn); + if (ret < 0) { + key_put(conn->key); + kfree(conn); + _leave(" = %d [key]", ret); + return ret; + } + + write_lock_bh(&rxrpc_connection_lock); + list_add_tail(&conn->link, &rxrpc_connections); + write_unlock_bh(&rxrpc_connection_lock); + + spin_lock(&trans->client_lock); + atomic_inc(&trans->usage); + + _net("CONNECT EXCL new %d on TRANS %d", + conn->debug_id, conn->trans->debug_id); + + rxrpc_assign_connection_id(conn); + rx->conn = conn; + } else { + spin_lock(&trans->client_lock); + } + + /* we've got a connection with a free channel and we can now attach the + * call to it + * - we're holding the transport's client lock + * - we're holding a reference on the connection + */ + for (chan = 0; chan < RXRPC_MAXCALLS; chan++) + if (!conn->channels[chan]) + goto found_channel; + goto no_free_channels; + +found_channel: + atomic_inc(&conn->usage); + conn->channels[chan] = call; + call->conn = conn; + call->channel = chan; + call->cid = conn->cid | chan; + call->call_id = ++conn->call_counter; + + _net("CONNECT client on conn %d chan %d as call %x", + conn->debug_id, chan, call->call_id); + + spin_unlock(&trans->client_lock); + + rxrpc_add_call_ID_to_conn(conn, call); + _leave(" = 0"); + return 0; + +no_free_channels: + spin_unlock(&trans->client_lock); + _leave(" = -ENOSR"); + return -ENOSR; +} + +/* + * find a connection for a call + * - called in process context with IRQs enabled + */ +int rxrpc_connect_call(struct rxrpc_sock *rx, + struct rxrpc_transport *trans, + struct rxrpc_conn_bundle *bundle, + struct rxrpc_call *call, + gfp_t gfp) +{ + struct rxrpc_connection *conn, *candidate; + int chan, ret; + + DECLARE_WAITQUEUE(myself, current); + + _enter("%p,%lx,", rx, call->user_call_ID); + + if (test_bit(RXRPC_SOCK_EXCLUSIVE_CONN, &rx->flags)) + return rxrpc_connect_exclusive(rx, trans, bundle->service_id, + call, gfp); + + spin_lock(&trans->client_lock); + for (;;) { + /* see if the bundle has a call slot available */ + if (!list_empty(&bundle->avail_conns)) { + _debug("avail"); + conn = list_entry(bundle->avail_conns.next, + struct rxrpc_connection, + bundle_link); + if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { + list_del_init(&conn->bundle_link); + bundle->num_conns--; + continue; + } + if (--conn->avail_calls == 0) + list_move(&conn->bundle_link, + &bundle->busy_conns); + ASSERTCMP(conn->avail_calls, <, RXRPC_MAXCALLS); + ASSERT(conn->channels[0] == NULL || + conn->channels[1] == NULL || + conn->channels[2] == NULL || + conn->channels[3] == NULL); + atomic_inc(&conn->usage); + break; + } + + if (!list_empty(&bundle->unused_conns)) { + _debug("unused"); + conn = list_entry(bundle->unused_conns.next, + struct rxrpc_connection, + bundle_link); + if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { + list_del_init(&conn->bundle_link); + bundle->num_conns--; + continue; + } + ASSERTCMP(conn->avail_calls, ==, RXRPC_MAXCALLS); + conn->avail_calls = RXRPC_MAXCALLS - 1; + ASSERT(conn->channels[0] == NULL && + conn->channels[1] == NULL && + conn->channels[2] == NULL && + conn->channels[3] == NULL); + atomic_inc(&conn->usage); + list_move(&conn->bundle_link, &bundle->avail_conns); + break; + } + + /* need to allocate a new connection */ + _debug("get new conn [%d]", bundle->num_conns); + + spin_unlock(&trans->client_lock); + + if (signal_pending(current)) + goto interrupted; + + if (bundle->num_conns >= 20) { + _debug("too many conns"); + + if (!gfpflags_allow_blocking(gfp)) { + _leave(" = -EAGAIN"); + return -EAGAIN; + } + + add_wait_queue(&bundle->chanwait, &myself); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (bundle->num_conns < 20 || + !list_empty(&bundle->unused_conns) || + !list_empty(&bundle->avail_conns)) + break; + if (signal_pending(current)) + goto interrupted_dequeue; + schedule(); + } + remove_wait_queue(&bundle->chanwait, &myself); + __set_current_state(TASK_RUNNING); + spin_lock(&trans->client_lock); + continue; + } + + /* not yet present - create a candidate for a new connection and then + * redo the check */ + candidate = rxrpc_alloc_connection(gfp); + if (!candidate) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + candidate->trans = trans; + candidate->bundle = bundle; + candidate->service_id = bundle->service_id; + candidate->epoch = rxrpc_epoch; + candidate->in_clientflag = 0; + candidate->out_clientflag = RXRPC_CLIENT_INITIATED; + candidate->cid = 0; + candidate->state = RXRPC_CONN_CLIENT; + candidate->avail_calls = RXRPC_MAXCALLS; + candidate->security_level = rx->min_sec_level; + candidate->key = key_get(bundle->key); + + ret = rxrpc_init_client_conn_security(candidate); + if (ret < 0) { + key_put(candidate->key); + kfree(candidate); + _leave(" = %d [key]", ret); + return ret; + } + + write_lock_bh(&rxrpc_connection_lock); + list_add_tail(&candidate->link, &rxrpc_connections); + write_unlock_bh(&rxrpc_connection_lock); + + spin_lock(&trans->client_lock); + + list_add(&candidate->bundle_link, &bundle->unused_conns); + bundle->num_conns++; + atomic_inc(&bundle->usage); + atomic_inc(&trans->usage); + + _net("CONNECT new %d on TRANS %d", + candidate->debug_id, candidate->trans->debug_id); + + rxrpc_assign_connection_id(candidate); + candidate->security->prime_packet_security(candidate); + + /* leave the candidate lurking in zombie mode attached to the + * bundle until we're ready for it */ + rxrpc_put_connection(candidate); + candidate = NULL; + } + + /* we've got a connection with a free channel and we can now attach the + * call to it + * - we're holding the transport's client lock + * - we're holding a reference on the connection + * - we're holding a reference on the bundle + */ + for (chan = 0; chan < RXRPC_MAXCALLS; chan++) + if (!conn->channels[chan]) + goto found_channel; + ASSERT(conn->channels[0] == NULL || + conn->channels[1] == NULL || + conn->channels[2] == NULL || + conn->channels[3] == NULL); + BUG(); + +found_channel: + conn->channels[chan] = call; + call->conn = conn; + call->channel = chan; + call->cid = conn->cid | chan; + call->call_id = ++conn->call_counter; + + _net("CONNECT client on conn %d chan %d as call %x", + conn->debug_id, chan, call->call_id); + + ASSERTCMP(conn->avail_calls, <, RXRPC_MAXCALLS); + spin_unlock(&trans->client_lock); + + rxrpc_add_call_ID_to_conn(conn, call); + + _leave(" = 0"); + return 0; + +interrupted_dequeue: + remove_wait_queue(&bundle->chanwait, &myself); + __set_current_state(TASK_RUNNING); +interrupted: + _leave(" = -ERESTARTSYS"); + return -ERESTARTSYS; +} + +/* + * get a record of an incoming connection + */ +struct rxrpc_connection * +rxrpc_incoming_connection(struct rxrpc_transport *trans, + struct rxrpc_host_header *hdr) +{ + struct rxrpc_connection *conn, *candidate = NULL; + struct rb_node *p, **pp; + const char *new = "old"; + __be32 epoch; + u32 cid; + + _enter(""); + + ASSERT(hdr->flags & RXRPC_CLIENT_INITIATED); + + epoch = hdr->epoch; + cid = hdr->cid & RXRPC_CIDMASK; + + /* search the connection list first */ + read_lock_bh(&trans->conn_lock); + + p = trans->server_conns.rb_node; + while (p) { + conn = rb_entry(p, struct rxrpc_connection, node); + + _debug("maybe %x", conn->cid); + + if (epoch < conn->epoch) + p = p->rb_left; + else if (epoch > conn->epoch) + p = p->rb_right; + else if (cid < conn->cid) + p = p->rb_left; + else if (cid > conn->cid) + p = p->rb_right; + else + goto found_extant_connection; + } + read_unlock_bh(&trans->conn_lock); + + /* not yet present - create a candidate for a new record and then + * redo the search */ + candidate = rxrpc_alloc_connection(GFP_NOIO); + if (!candidate) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + candidate->trans = trans; + candidate->epoch = hdr->epoch; + candidate->cid = hdr->cid & RXRPC_CIDMASK; + candidate->service_id = hdr->serviceId; + candidate->security_ix = hdr->securityIndex; + candidate->in_clientflag = RXRPC_CLIENT_INITIATED; + candidate->out_clientflag = 0; + candidate->state = RXRPC_CONN_SERVER; + if (candidate->service_id) + candidate->state = RXRPC_CONN_SERVER_UNSECURED; + + write_lock_bh(&trans->conn_lock); + + pp = &trans->server_conns.rb_node; + p = NULL; + while (*pp) { + p = *pp; + conn = rb_entry(p, struct rxrpc_connection, node); + + if (epoch < conn->epoch) + pp = &(*pp)->rb_left; + else if (epoch > conn->epoch) + pp = &(*pp)->rb_right; + else if (cid < conn->cid) + pp = &(*pp)->rb_left; + else if (cid > conn->cid) + pp = &(*pp)->rb_right; + else + goto found_extant_second; + } + + /* we can now add the new candidate to the list */ + conn = candidate; + candidate = NULL; + rb_link_node(&conn->node, p, pp); + rb_insert_color(&conn->node, &trans->server_conns); + atomic_inc(&conn->trans->usage); + + write_unlock_bh(&trans->conn_lock); + + write_lock_bh(&rxrpc_connection_lock); + list_add_tail(&conn->link, &rxrpc_connections); + write_unlock_bh(&rxrpc_connection_lock); + + new = "new"; + +success: + _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->cid); + + _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage)); + return conn; + + /* we found the connection in the list immediately */ +found_extant_connection: + if (hdr->securityIndex != conn->security_ix) { + read_unlock_bh(&trans->conn_lock); + goto security_mismatch; + } + atomic_inc(&conn->usage); + read_unlock_bh(&trans->conn_lock); + goto success; + + /* we found the connection on the second time through the list */ +found_extant_second: + if (hdr->securityIndex != conn->security_ix) { + write_unlock_bh(&trans->conn_lock); + goto security_mismatch; + } + atomic_inc(&conn->usage); + write_unlock_bh(&trans->conn_lock); + kfree(candidate); + goto success; + +security_mismatch: + kfree(candidate); + _leave(" = -EKEYREJECTED"); + return ERR_PTR(-EKEYREJECTED); +} + +/* + * find a connection based on transport and RxRPC connection ID for an incoming + * packet + */ +struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *trans, + struct rxrpc_host_header *hdr) +{ + struct rxrpc_connection *conn; + struct rb_node *p; + u32 epoch, cid; + + _enter(",{%x,%x}", hdr->cid, hdr->flags); + + read_lock_bh(&trans->conn_lock); + + cid = hdr->cid & RXRPC_CIDMASK; + epoch = hdr->epoch; + + if (hdr->flags & RXRPC_CLIENT_INITIATED) + p = trans->server_conns.rb_node; + else + p = trans->client_conns.rb_node; + + while (p) { + conn = rb_entry(p, struct rxrpc_connection, node); + + _debug("maybe %x", conn->cid); + + if (epoch < conn->epoch) + p = p->rb_left; + else if (epoch > conn->epoch) + p = p->rb_right; + else if (cid < conn->cid) + p = p->rb_left; + else if (cid > conn->cid) + p = p->rb_right; + else + goto found; + } + + read_unlock_bh(&trans->conn_lock); + _leave(" = NULL"); + return NULL; + +found: + atomic_inc(&conn->usage); + read_unlock_bh(&trans->conn_lock); + _leave(" = %p", conn); + return conn; +} + +/* + * release a virtual connection + */ +void rxrpc_put_connection(struct rxrpc_connection *conn) +{ + _enter("%p{u=%d,d=%d}", + conn, atomic_read(&conn->usage), conn->debug_id); + + ASSERTCMP(atomic_read(&conn->usage), >, 0); + + conn->put_time = ktime_get_seconds(); + if (atomic_dec_and_test(&conn->usage)) { + _debug("zombie"); + rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); + } + + _leave(""); +} + +/* + * destroy a virtual connection + */ +static void rxrpc_destroy_connection(struct rxrpc_connection *conn) +{ + _enter("%p{%d}", conn, atomic_read(&conn->usage)); + + ASSERTCMP(atomic_read(&conn->usage), ==, 0); + + _net("DESTROY CONN %d", conn->debug_id); + + if (conn->bundle) + rxrpc_put_bundle(conn->trans, conn->bundle); + + ASSERT(RB_EMPTY_ROOT(&conn->calls)); + rxrpc_purge_queue(&conn->rx_queue); + + conn->security->clear(conn); + key_put(conn->key); + key_put(conn->server_key); + + rxrpc_put_transport(conn->trans); + kfree(conn); + _leave(""); +} + +/* + * reap dead connections + */ +static void rxrpc_connection_reaper(struct work_struct *work) +{ + struct rxrpc_connection *conn, *_p; + unsigned long now, earliest, reap_time; + + LIST_HEAD(graveyard); + + _enter(""); + + now = ktime_get_seconds(); + earliest = ULONG_MAX; + + write_lock_bh(&rxrpc_connection_lock); + list_for_each_entry_safe(conn, _p, &rxrpc_connections, link) { + _debug("reap CONN %d { u=%d,t=%ld }", + conn->debug_id, atomic_read(&conn->usage), + (long) now - (long) conn->put_time); + + if (likely(atomic_read(&conn->usage) > 0)) + continue; + + spin_lock(&conn->trans->client_lock); + write_lock(&conn->trans->conn_lock); + reap_time = conn->put_time + rxrpc_connection_expiry; + + if (atomic_read(&conn->usage) > 0) { + ; + } else if (reap_time <= now) { + list_move_tail(&conn->link, &graveyard); + if (conn->out_clientflag) + rb_erase(&conn->node, + &conn->trans->client_conns); + else + rb_erase(&conn->node, + &conn->trans->server_conns); + if (conn->bundle) { + list_del_init(&conn->bundle_link); + conn->bundle->num_conns--; + } + + } else if (reap_time < earliest) { + earliest = reap_time; + } + + write_unlock(&conn->trans->conn_lock); + spin_unlock(&conn->trans->client_lock); + } + write_unlock_bh(&rxrpc_connection_lock); + + if (earliest != ULONG_MAX) { + _debug("reschedule reaper %ld", (long) earliest - now); + ASSERTCMP(earliest, >, now); + rxrpc_queue_delayed_work(&rxrpc_connection_reap, + (earliest - now) * HZ); + } + + /* then destroy all those pulled out */ + while (!list_empty(&graveyard)) { + conn = list_entry(graveyard.next, struct rxrpc_connection, + link); + list_del_init(&conn->link); + + ASSERTCMP(atomic_read(&conn->usage), ==, 0); + rxrpc_destroy_connection(conn); + } + + _leave(""); +} + +/* + * preemptively destroy all the connection records rather than waiting for them + * to time out + */ +void __exit rxrpc_destroy_all_connections(void) +{ + _enter(""); + + rxrpc_connection_expiry = 0; + cancel_delayed_work(&rxrpc_connection_reap); + rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); + + _leave(""); +} diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c new file mode 100644 index 000000000000..e0815a033999 --- /dev/null +++ b/net/rxrpc/input.c @@ -0,0 +1,800 @@ +/* RxRPC packet reception + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +/* + * queue a packet for recvmsg to pass to userspace + * - the caller must hold a lock on call->lock + * - must not be called with interrupts disabled (sk_filter() disables BH's) + * - eats the packet whether successful or not + * - there must be just one reference to the packet, which the caller passes to + * this function + */ +int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, + bool force, bool terminal) +{ + struct rxrpc_skb_priv *sp; + struct rxrpc_sock *rx = call->socket; + struct sock *sk; + int ret; + + _enter(",,%d,%d", force, terminal); + + ASSERT(!irqs_disabled()); + + sp = rxrpc_skb(skb); + ASSERTCMP(sp->call, ==, call); + + /* if we've already posted the terminal message for a call, then we + * don't post any more */ + if (test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) { + _debug("already terminated"); + ASSERTCMP(call->state, >=, RXRPC_CALL_COMPLETE); + skb->destructor = NULL; + sp->call = NULL; + rxrpc_put_call(call); + rxrpc_free_skb(skb); + return 0; + } + + sk = &rx->sk; + + if (!force) { + /* cast skb->rcvbuf to unsigned... It's pointless, but + * reduces number of warnings when compiling with -W + * --ANK */ +// ret = -ENOBUFS; +// if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= +// (unsigned int) sk->sk_rcvbuf) +// goto out; + + ret = sk_filter(sk, skb); + if (ret < 0) + goto out; + } + + spin_lock_bh(&sk->sk_receive_queue.lock); + if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags) && + !test_bit(RXRPC_CALL_RELEASED, &call->flags) && + call->socket->sk.sk_state != RXRPC_CLOSE) { + skb->destructor = rxrpc_packet_destructor; + skb->dev = NULL; + skb->sk = sk; + atomic_add(skb->truesize, &sk->sk_rmem_alloc); + + if (terminal) { + _debug("<<<< TERMINAL MESSAGE >>>>"); + set_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags); + } + + /* allow interception by a kernel service */ + if (rx->interceptor) { + rx->interceptor(sk, call->user_call_ID, skb); + spin_unlock_bh(&sk->sk_receive_queue.lock); + } else { + _net("post skb %p", skb); + __skb_queue_tail(&sk->sk_receive_queue, skb); + spin_unlock_bh(&sk->sk_receive_queue.lock); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk); + } + skb = NULL; + } else { + spin_unlock_bh(&sk->sk_receive_queue.lock); + } + ret = 0; + +out: + /* release the socket buffer */ + if (skb) { + skb->destructor = NULL; + sp->call = NULL; + rxrpc_put_call(call); + rxrpc_free_skb(skb); + } + + _leave(" = %d", ret); + return ret; +} + +/* + * process a DATA packet, posting the packet to the appropriate queue + * - eats the packet if successful + */ +static int rxrpc_fast_process_data(struct rxrpc_call *call, + struct sk_buff *skb, u32 seq) +{ + struct rxrpc_skb_priv *sp; + bool terminal; + int ret, ackbit, ack; + + _enter("{%u,%u},,{%u}", call->rx_data_post, call->rx_first_oos, seq); + + sp = rxrpc_skb(skb); + ASSERTCMP(sp->call, ==, NULL); + + spin_lock(&call->lock); + + if (call->state > RXRPC_CALL_COMPLETE) + goto discard; + + ASSERTCMP(call->rx_data_expect, >=, call->rx_data_post); + ASSERTCMP(call->rx_data_post, >=, call->rx_data_recv); + ASSERTCMP(call->rx_data_recv, >=, call->rx_data_eaten); + + if (seq < call->rx_data_post) { + _debug("dup #%u [-%u]", seq, call->rx_data_post); + ack = RXRPC_ACK_DUPLICATE; + ret = -ENOBUFS; + goto discard_and_ack; + } + + /* we may already have the packet in the out of sequence queue */ + ackbit = seq - (call->rx_data_eaten + 1); + ASSERTCMP(ackbit, >=, 0); + if (__test_and_set_bit(ackbit, call->ackr_window)) { + _debug("dup oos #%u [%u,%u]", + seq, call->rx_data_eaten, call->rx_data_post); + ack = RXRPC_ACK_DUPLICATE; + goto discard_and_ack; + } + + if (seq >= call->ackr_win_top) { + _debug("exceed #%u [%u]", seq, call->ackr_win_top); + __clear_bit(ackbit, call->ackr_window); + ack = RXRPC_ACK_EXCEEDS_WINDOW; + goto discard_and_ack; + } + + if (seq == call->rx_data_expect) { + clear_bit(RXRPC_CALL_EXPECT_OOS, &call->flags); + call->rx_data_expect++; + } else if (seq > call->rx_data_expect) { + _debug("oos #%u [%u]", seq, call->rx_data_expect); + call->rx_data_expect = seq + 1; + if (test_and_set_bit(RXRPC_CALL_EXPECT_OOS, &call->flags)) { + ack = RXRPC_ACK_OUT_OF_SEQUENCE; + goto enqueue_and_ack; + } + goto enqueue_packet; + } + + if (seq != call->rx_data_post) { + _debug("ahead #%u [%u]", seq, call->rx_data_post); + goto enqueue_packet; + } + + if (test_bit(RXRPC_CALL_RCVD_LAST, &call->flags)) + goto protocol_error; + + /* if the packet need security things doing to it, then it goes down + * the slow path */ + if (call->conn->security_ix) + goto enqueue_packet; + + sp->call = call; + rxrpc_get_call(call); + terminal = ((sp->hdr.flags & RXRPC_LAST_PACKET) && + !(sp->hdr.flags & RXRPC_CLIENT_INITIATED)); + ret = rxrpc_queue_rcv_skb(call, skb, false, terminal); + if (ret < 0) { + if (ret == -ENOMEM || ret == -ENOBUFS) { + __clear_bit(ackbit, call->ackr_window); + ack = RXRPC_ACK_NOSPACE; + goto discard_and_ack; + } + goto out; + } + + skb = NULL; + + _debug("post #%u", seq); + ASSERTCMP(call->rx_data_post, ==, seq); + call->rx_data_post++; + + if (sp->hdr.flags & RXRPC_LAST_PACKET) + set_bit(RXRPC_CALL_RCVD_LAST, &call->flags); + + /* if we've reached an out of sequence packet then we need to drain + * that queue into the socket Rx queue now */ + if (call->rx_data_post == call->rx_first_oos) { + _debug("drain rx oos now"); + read_lock(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE && + !test_and_set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events)) + rxrpc_queue_call(call); + read_unlock(&call->state_lock); + } + + spin_unlock(&call->lock); + atomic_inc(&call->ackr_not_idle); + rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, sp->hdr.serial, false); + _leave(" = 0 [posted]"); + return 0; + +protocol_error: + ret = -EBADMSG; +out: + spin_unlock(&call->lock); + _leave(" = %d", ret); + return ret; + +discard_and_ack: + _debug("discard and ACK packet %p", skb); + __rxrpc_propose_ACK(call, ack, sp->hdr.serial, true); +discard: + spin_unlock(&call->lock); + rxrpc_free_skb(skb); + _leave(" = 0 [discarded]"); + return 0; + +enqueue_and_ack: + __rxrpc_propose_ACK(call, ack, sp->hdr.serial, true); +enqueue_packet: + _net("defer skb %p", skb); + spin_unlock(&call->lock); + skb_queue_tail(&call->rx_queue, skb); + atomic_inc(&call->ackr_not_idle); + read_lock(&call->state_lock); + if (call->state < RXRPC_CALL_DEAD) + rxrpc_queue_call(call); + read_unlock(&call->state_lock); + _leave(" = 0 [queued]"); + return 0; +} + +/* + * assume an implicit ACKALL of the transmission phase of a client socket upon + * reception of the first reply packet + */ +static void rxrpc_assume_implicit_ackall(struct rxrpc_call *call, u32 serial) +{ + write_lock_bh(&call->state_lock); + + switch (call->state) { + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + call->state = RXRPC_CALL_CLIENT_RECV_REPLY; + call->acks_latest = serial; + + _debug("implicit ACKALL %%%u", call->acks_latest); + set_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events); + write_unlock_bh(&call->state_lock); + + if (try_to_del_timer_sync(&call->resend_timer) >= 0) { + clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); + clear_bit(RXRPC_CALL_EV_RESEND, &call->events); + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + } + break; + + default: + write_unlock_bh(&call->state_lock); + break; + } +} + +/* + * post an incoming packet to the nominated call to deal with + * - must get rid of the sk_buff, either by freeing it or by queuing it + */ +void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + __be32 wtmp; + u32 hi_serial, abort_code; + + _enter("%p,%p", call, skb); + + ASSERT(!irqs_disabled()); + +#if 0 // INJECT RX ERROR + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) { + static int skip = 0; + if (++skip == 3) { + printk("DROPPED 3RD PACKET!!!!!!!!!!!!!\n"); + skip = 0; + goto free_packet; + } + } +#endif + + /* track the latest serial number on this connection for ACK packet + * information */ + hi_serial = atomic_read(&call->conn->hi_serial); + while (sp->hdr.serial > hi_serial) + hi_serial = atomic_cmpxchg(&call->conn->hi_serial, hi_serial, + sp->hdr.serial); + + /* request ACK generation for any ACK or DATA packet that requests + * it */ + if (sp->hdr.flags & RXRPC_REQUEST_ACK) { + _proto("ACK Requested on %%%u", sp->hdr.serial); + rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, sp->hdr.serial, false); + } + + switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_ABORT: + _debug("abort"); + + if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0) + goto protocol_error; + + abort_code = ntohl(wtmp); + _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code); + + write_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE) { + call->state = RXRPC_CALL_REMOTELY_ABORTED; + call->remote_abort = abort_code; + set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); + rxrpc_queue_call(call); + } + goto free_packet_unlock; + + case RXRPC_PACKET_TYPE_BUSY: + _proto("Rx BUSY %%%u", sp->hdr.serial); + + if (call->conn->out_clientflag) + goto protocol_error; + + write_lock_bh(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_CLIENT_SEND_REQUEST: + call->state = RXRPC_CALL_SERVER_BUSY; + set_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events); + rxrpc_queue_call(call); + case RXRPC_CALL_SERVER_BUSY: + goto free_packet_unlock; + default: + goto protocol_error_locked; + } + + default: + _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial); + goto protocol_error; + + case RXRPC_PACKET_TYPE_DATA: + _proto("Rx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); + + if (sp->hdr.seq == 0) + goto protocol_error; + + call->ackr_prev_seq = sp->hdr.seq; + + /* received data implicitly ACKs all of the request packets we + * sent when we're acting as a client */ + if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) + rxrpc_assume_implicit_ackall(call, sp->hdr.serial); + + switch (rxrpc_fast_process_data(call, skb, sp->hdr.seq)) { + case 0: + skb = NULL; + goto done; + + default: + BUG(); + + /* data packet received beyond the last packet */ + case -EBADMSG: + goto protocol_error; + } + + case RXRPC_PACKET_TYPE_ACKALL: + case RXRPC_PACKET_TYPE_ACK: + /* ACK processing is done in process context */ + read_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_DEAD) { + skb_queue_tail(&call->rx_queue, skb); + rxrpc_queue_call(call); + skb = NULL; + } + read_unlock_bh(&call->state_lock); + goto free_packet; + } + +protocol_error: + _debug("protocol error"); + write_lock_bh(&call->state_lock); +protocol_error_locked: + if (call->state <= RXRPC_CALL_COMPLETE) { + call->state = RXRPC_CALL_LOCALLY_ABORTED; + call->local_abort = RX_PROTOCOL_ERROR; + set_bit(RXRPC_CALL_EV_ABORT, &call->events); + rxrpc_queue_call(call); + } +free_packet_unlock: + write_unlock_bh(&call->state_lock); +free_packet: + rxrpc_free_skb(skb); +done: + _leave(""); +} + +/* + * split up a jumbo data packet + */ +static void rxrpc_process_jumbo_packet(struct rxrpc_call *call, + struct sk_buff *jumbo) +{ + struct rxrpc_jumbo_header jhdr; + struct rxrpc_skb_priv *sp; + struct sk_buff *part; + + _enter(",{%u,%u}", jumbo->data_len, jumbo->len); + + sp = rxrpc_skb(jumbo); + + do { + sp->hdr.flags &= ~RXRPC_JUMBO_PACKET; + + /* make a clone to represent the first subpacket in what's left + * of the jumbo packet */ + part = skb_clone(jumbo, GFP_ATOMIC); + if (!part) { + /* simply ditch the tail in the event of ENOMEM */ + pskb_trim(jumbo, RXRPC_JUMBO_DATALEN); + break; + } + rxrpc_new_skb(part); + + pskb_trim(part, RXRPC_JUMBO_DATALEN); + + if (!pskb_pull(jumbo, RXRPC_JUMBO_DATALEN)) + goto protocol_error; + + if (skb_copy_bits(jumbo, 0, &jhdr, sizeof(jhdr)) < 0) + goto protocol_error; + if (!pskb_pull(jumbo, sizeof(jhdr))) + BUG(); + + sp->hdr.seq += 1; + sp->hdr.serial += 1; + sp->hdr.flags = jhdr.flags; + sp->hdr._rsvd = jhdr._rsvd; + + _proto("Rx DATA Jumbo %%%u", sp->hdr.serial - 1); + + rxrpc_fast_process_packet(call, part); + part = NULL; + + } while (sp->hdr.flags & RXRPC_JUMBO_PACKET); + + rxrpc_fast_process_packet(call, jumbo); + _leave(""); + return; + +protocol_error: + _debug("protocol error"); + rxrpc_free_skb(part); + rxrpc_free_skb(jumbo); + write_lock_bh(&call->state_lock); + if (call->state <= RXRPC_CALL_COMPLETE) { + call->state = RXRPC_CALL_LOCALLY_ABORTED; + call->local_abort = RX_PROTOCOL_ERROR; + set_bit(RXRPC_CALL_EV_ABORT, &call->events); + rxrpc_queue_call(call); + } + write_unlock_bh(&call->state_lock); + _leave(""); +} + +/* + * post an incoming packet to the appropriate call/socket to deal with + * - must get rid of the sk_buff, either by freeing it or by queuing it + */ +static void rxrpc_post_packet_to_call(struct rxrpc_call *call, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp; + + _enter("%p,%p", call, skb); + + sp = rxrpc_skb(skb); + + _debug("extant call [%d]", call->state); + + read_lock(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_LOCALLY_ABORTED: + if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) { + rxrpc_queue_call(call); + goto free_unlock; + } + case RXRPC_CALL_REMOTELY_ABORTED: + case RXRPC_CALL_NETWORK_ERROR: + case RXRPC_CALL_DEAD: + goto dead_call; + case RXRPC_CALL_COMPLETE: + case RXRPC_CALL_CLIENT_FINAL_ACK: + /* complete server call */ + if (call->conn->in_clientflag) + goto dead_call; + /* resend last packet of a completed call */ + _debug("final ack again"); + rxrpc_get_call(call); + set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); + rxrpc_queue_call(call); + goto free_unlock; + default: + break; + } + + read_unlock(&call->state_lock); + rxrpc_get_call(call); + + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && + sp->hdr.flags & RXRPC_JUMBO_PACKET) + rxrpc_process_jumbo_packet(call, skb); + else + rxrpc_fast_process_packet(call, skb); + + rxrpc_put_call(call); + goto done; + +dead_call: + if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) { + skb->priority = RX_CALL_DEAD; + rxrpc_reject_packet(call->conn->trans->local, skb); + goto unlock; + } +free_unlock: + rxrpc_free_skb(skb); +unlock: + read_unlock(&call->state_lock); +done: + _leave(""); +} + +/* + * post connection-level events to the connection + * - this includes challenges, responses and some aborts + */ +static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + _enter("%p,%p", conn, skb); + + atomic_inc(&conn->usage); + skb_queue_tail(&conn->rx_queue, skb); + rxrpc_queue_conn(conn); +} + +/* + * post endpoint-level events to the local endpoint + * - this includes debug and version messages + */ +static void rxrpc_post_packet_to_local(struct rxrpc_local *local, + struct sk_buff *skb) +{ + _enter("%p,%p", local, skb); + + atomic_inc(&local->usage); + skb_queue_tail(&local->event_queue, skb); + rxrpc_queue_work(&local->event_processor); +} + +/* + * Extract the wire header from a packet and translate the byte order. + */ +static noinline +int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) +{ + struct rxrpc_wire_header whdr; + + /* dig out the RxRPC connection details */ + if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0) + return -EBADMSG; + if (!pskb_pull(skb, sizeof(whdr))) + BUG(); + + memset(sp, 0, sizeof(*sp)); + sp->hdr.epoch = ntohl(whdr.epoch); + sp->hdr.cid = ntohl(whdr.cid); + sp->hdr.callNumber = ntohl(whdr.callNumber); + sp->hdr.seq = ntohl(whdr.seq); + sp->hdr.serial = ntohl(whdr.serial); + sp->hdr.flags = whdr.flags; + sp->hdr.type = whdr.type; + sp->hdr.userStatus = whdr.userStatus; + sp->hdr.securityIndex = whdr.securityIndex; + sp->hdr._rsvd = ntohs(whdr._rsvd); + sp->hdr.serviceId = ntohs(whdr.serviceId); + return 0; +} + +static struct rxrpc_connection *rxrpc_conn_from_local(struct rxrpc_local *local, + struct sk_buff *skb, + struct rxrpc_skb_priv *sp) +{ + struct rxrpc_peer *peer; + struct rxrpc_transport *trans; + struct rxrpc_connection *conn; + + peer = rxrpc_find_peer(local, ip_hdr(skb)->saddr, + udp_hdr(skb)->source); + if (IS_ERR(peer)) + goto cant_find_conn; + + trans = rxrpc_find_transport(local, peer); + rxrpc_put_peer(peer); + if (!trans) + goto cant_find_conn; + + conn = rxrpc_find_connection(trans, &sp->hdr); + rxrpc_put_transport(trans); + if (!conn) + goto cant_find_conn; + + return conn; +cant_find_conn: + return NULL; +} + +/* + * handle data received on the local endpoint + * - may be called in interrupt context + */ +void rxrpc_data_ready(struct sock *sk) +{ + struct rxrpc_skb_priv *sp; + struct rxrpc_local *local; + struct sk_buff *skb; + int ret; + + _enter("%p", sk); + + ASSERT(!irqs_disabled()); + + read_lock_bh(&rxrpc_local_lock); + local = sk->sk_user_data; + if (local && atomic_read(&local->usage) > 0) + rxrpc_get_local(local); + else + local = NULL; + read_unlock_bh(&rxrpc_local_lock); + if (!local) { + _leave(" [local dead]"); + return; + } + + skb = skb_recv_datagram(sk, 0, 1, &ret); + if (!skb) { + rxrpc_put_local(local); + if (ret == -EAGAIN) + return; + _debug("UDP socket error %d", ret); + return; + } + + rxrpc_new_skb(skb); + + _net("recv skb %p", skb); + + /* we'll probably need to checksum it (didn't call sock_recvmsg) */ + if (skb_checksum_complete(skb)) { + rxrpc_free_skb(skb); + rxrpc_put_local(local); + __UDP_INC_STATS(&init_net, UDP_MIB_INERRORS, 0); + _leave(" [CSUM failed]"); + return; + } + + __UDP_INC_STATS(&init_net, UDP_MIB_INDATAGRAMS, 0); + + /* The socket buffer we have is owned by UDP, with UDP's data all over + * it, but we really want our own data there. + */ + skb_orphan(skb); + sp = rxrpc_skb(skb); + + _net("Rx UDP packet from %08x:%04hu", + ntohl(ip_hdr(skb)->saddr), ntohs(udp_hdr(skb)->source)); + + /* dig out the RxRPC connection details */ + if (rxrpc_extract_header(sp, skb) < 0) + goto bad_message; + + _net("Rx RxRPC %s ep=%x call=%x:%x", + sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient", + sp->hdr.epoch, sp->hdr.cid, sp->hdr.callNumber); + + if (sp->hdr.type >= RXRPC_N_PACKET_TYPES || + !((RXRPC_SUPPORTED_PACKET_TYPES >> sp->hdr.type) & 1)) { + _proto("Rx Bad Packet Type %u", sp->hdr.type); + goto bad_message; + } + + if (sp->hdr.type == RXRPC_PACKET_TYPE_VERSION) { + rxrpc_post_packet_to_local(local, skb); + goto out; + } + + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && + (sp->hdr.callNumber == 0 || sp->hdr.seq == 0)) + goto bad_message; + + if (sp->hdr.callNumber == 0) { + /* This is a connection-level packet. These should be + * fairly rare, so the extra overhead of looking them up the + * old-fashioned way doesn't really hurt */ + struct rxrpc_connection *conn; + + conn = rxrpc_conn_from_local(local, skb, sp); + if (!conn) + goto cant_route_call; + + _debug("CONN %p {%d}", conn, conn->debug_id); + rxrpc_post_packet_to_conn(conn, skb); + rxrpc_put_connection(conn); + } else { + struct rxrpc_call *call; + + call = rxrpc_find_call_hash(&sp->hdr, local, + AF_INET, &ip_hdr(skb)->saddr); + if (call) + rxrpc_post_packet_to_call(call, skb); + else + goto cant_route_call; + } + +out: + rxrpc_put_local(local); + return; + +cant_route_call: + _debug("can't route call"); + if (sp->hdr.flags & RXRPC_CLIENT_INITIATED && + sp->hdr.type == RXRPC_PACKET_TYPE_DATA) { + if (sp->hdr.seq == 1) { + _debug("first packet"); + skb_queue_tail(&local->accept_queue, skb); + rxrpc_queue_work(&local->acceptor); + rxrpc_put_local(local); + _leave(" [incoming]"); + return; + } + skb->priority = RX_INVALID_OPERATION; + } else { + skb->priority = RX_CALL_DEAD; + } + + if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) { + _debug("reject type %d",sp->hdr.type); + rxrpc_reject_packet(local, skb); + } + rxrpc_put_local(local); + _leave(" [no call]"); + return; + +bad_message: + skb->priority = RX_PROTOCOL_ERROR; + rxrpc_reject_packet(local, skb); + rxrpc_put_local(local); + _leave(" [badmsg]"); +} diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c new file mode 100644 index 000000000000..4ad56fafe3a7 --- /dev/null +++ b/net/rxrpc/key.c @@ -0,0 +1,1237 @@ +/* RxRPC key management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * RxRPC keys should have a description of describing their purpose: + * "afs@CAMBRIDGE.REDHAT.COM> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +static int rxrpc_vet_description_s(const char *); +static int rxrpc_preparse(struct key_preparsed_payload *); +static int rxrpc_preparse_s(struct key_preparsed_payload *); +static void rxrpc_free_preparse(struct key_preparsed_payload *); +static void rxrpc_free_preparse_s(struct key_preparsed_payload *); +static void rxrpc_destroy(struct key *); +static void rxrpc_destroy_s(struct key *); +static void rxrpc_describe(const struct key *, struct seq_file *); +static long rxrpc_read(const struct key *, char __user *, size_t); + +/* + * rxrpc defined keys take an arbitrary string as the description and an + * arbitrary blob of data as the payload + */ +struct key_type key_type_rxrpc = { + .name = "rxrpc", + .preparse = rxrpc_preparse, + .free_preparse = rxrpc_free_preparse, + .instantiate = generic_key_instantiate, + .destroy = rxrpc_destroy, + .describe = rxrpc_describe, + .read = rxrpc_read, +}; +EXPORT_SYMBOL(key_type_rxrpc); + +/* + * rxrpc server defined keys take ":" as the + * description and an 8-byte decryption key as the payload + */ +struct key_type key_type_rxrpc_s = { + .name = "rxrpc_s", + .vet_description = rxrpc_vet_description_s, + .preparse = rxrpc_preparse_s, + .free_preparse = rxrpc_free_preparse_s, + .instantiate = generic_key_instantiate, + .destroy = rxrpc_destroy_s, + .describe = rxrpc_describe, +}; + +/* + * Vet the description for an RxRPC server key + */ +static int rxrpc_vet_description_s(const char *desc) +{ + unsigned long num; + char *p; + + num = simple_strtoul(desc, &p, 10); + if (*p != ':' || num > 65535) + return -EINVAL; + num = simple_strtoul(p + 1, &p, 10); + if (*p || num < 1 || num > 255) + return -EINVAL; + return 0; +} + +/* + * parse an RxKAD type XDR format token + * - the caller guarantees we have at least 4 words + */ +static int rxrpc_preparse_xdr_rxkad(struct key_preparsed_payload *prep, + size_t datalen, + const __be32 *xdr, unsigned int toklen) +{ + struct rxrpc_key_token *token, **pptoken; + size_t plen; + u32 tktlen; + + _enter(",{%x,%x,%x,%x},%u", + ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), + toklen); + + if (toklen <= 8 * 4) + return -EKEYREJECTED; + tktlen = ntohl(xdr[7]); + _debug("tktlen: %x", tktlen); + if (tktlen > AFSTOKEN_RK_TIX_MAX) + return -EKEYREJECTED; + if (toklen < 8 * 4 + tktlen) + return -EKEYREJECTED; + + plen = sizeof(*token) + sizeof(*token->kad) + tktlen; + prep->quotalen = datalen + plen; + + plen -= sizeof(*token); + token = kzalloc(sizeof(*token), GFP_KERNEL); + if (!token) + return -ENOMEM; + + token->kad = kzalloc(plen, GFP_KERNEL); + if (!token->kad) { + kfree(token); + return -ENOMEM; + } + + token->security_index = RXRPC_SECURITY_RXKAD; + token->kad->ticket_len = tktlen; + token->kad->vice_id = ntohl(xdr[0]); + token->kad->kvno = ntohl(xdr[1]); + token->kad->start = ntohl(xdr[4]); + token->kad->expiry = ntohl(xdr[5]); + token->kad->primary_flag = ntohl(xdr[6]); + memcpy(&token->kad->session_key, &xdr[2], 8); + memcpy(&token->kad->ticket, &xdr[8], tktlen); + + _debug("SCIX: %u", token->security_index); + _debug("TLEN: %u", token->kad->ticket_len); + _debug("EXPY: %x", token->kad->expiry); + _debug("KVNO: %u", token->kad->kvno); + _debug("PRIM: %u", token->kad->primary_flag); + _debug("SKEY: %02x%02x%02x%02x%02x%02x%02x%02x", + token->kad->session_key[0], token->kad->session_key[1], + token->kad->session_key[2], token->kad->session_key[3], + token->kad->session_key[4], token->kad->session_key[5], + token->kad->session_key[6], token->kad->session_key[7]); + if (token->kad->ticket_len >= 8) + _debug("TCKT: %02x%02x%02x%02x%02x%02x%02x%02x", + token->kad->ticket[0], token->kad->ticket[1], + token->kad->ticket[2], token->kad->ticket[3], + token->kad->ticket[4], token->kad->ticket[5], + token->kad->ticket[6], token->kad->ticket[7]); + + /* count the number of tokens attached */ + prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1); + + /* attach the data */ + for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0]; + *pptoken; + pptoken = &(*pptoken)->next) + continue; + *pptoken = token; + if (token->kad->expiry < prep->expiry) + prep->expiry = token->kad->expiry; + + _leave(" = 0"); + return 0; +} + +static void rxrpc_free_krb5_principal(struct krb5_principal *princ) +{ + int loop; + + if (princ->name_parts) { + for (loop = princ->n_name_parts - 1; loop >= 0; loop--) + kfree(princ->name_parts[loop]); + kfree(princ->name_parts); + } + kfree(princ->realm); +} + +static void rxrpc_free_krb5_tagged(struct krb5_tagged_data *td) +{ + kfree(td->data); +} + +/* + * free up an RxK5 token + */ +static void rxrpc_rxk5_free(struct rxk5_key *rxk5) +{ + int loop; + + rxrpc_free_krb5_principal(&rxk5->client); + rxrpc_free_krb5_principal(&rxk5->server); + rxrpc_free_krb5_tagged(&rxk5->session); + + if (rxk5->addresses) { + for (loop = rxk5->n_addresses - 1; loop >= 0; loop--) + rxrpc_free_krb5_tagged(&rxk5->addresses[loop]); + kfree(rxk5->addresses); + } + if (rxk5->authdata) { + for (loop = rxk5->n_authdata - 1; loop >= 0; loop--) + rxrpc_free_krb5_tagged(&rxk5->authdata[loop]); + kfree(rxk5->authdata); + } + + kfree(rxk5->ticket); + kfree(rxk5->ticket2); + kfree(rxk5); +} + +/* + * extract a krb5 principal + */ +static int rxrpc_krb5_decode_principal(struct krb5_principal *princ, + const __be32 **_xdr, + unsigned int *_toklen) +{ + const __be32 *xdr = *_xdr; + unsigned int toklen = *_toklen, n_parts, loop, tmp; + + /* there must be at least one name, and at least #names+1 length + * words */ + if (toklen <= 12) + return -EINVAL; + + _enter(",{%x,%x,%x},%u", + ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), toklen); + + n_parts = ntohl(*xdr++); + toklen -= 4; + if (n_parts <= 0 || n_parts > AFSTOKEN_K5_COMPONENTS_MAX) + return -EINVAL; + princ->n_name_parts = n_parts; + + if (toklen <= (n_parts + 1) * 4) + return -EINVAL; + + princ->name_parts = kcalloc(n_parts, sizeof(char *), GFP_KERNEL); + if (!princ->name_parts) + return -ENOMEM; + + for (loop = 0; loop < n_parts; loop++) { + if (toklen < 4) + return -EINVAL; + tmp = ntohl(*xdr++); + toklen -= 4; + if (tmp <= 0 || tmp > AFSTOKEN_STRING_MAX) + return -EINVAL; + if (tmp > toklen) + return -EINVAL; + princ->name_parts[loop] = kmalloc(tmp + 1, GFP_KERNEL); + if (!princ->name_parts[loop]) + return -ENOMEM; + memcpy(princ->name_parts[loop], xdr, tmp); + princ->name_parts[loop][tmp] = 0; + tmp = (tmp + 3) & ~3; + toklen -= tmp; + xdr += tmp >> 2; + } + + if (toklen < 4) + return -EINVAL; + tmp = ntohl(*xdr++); + toklen -= 4; + if (tmp <= 0 || tmp > AFSTOKEN_K5_REALM_MAX) + return -EINVAL; + if (tmp > toklen) + return -EINVAL; + princ->realm = kmalloc(tmp + 1, GFP_KERNEL); + if (!princ->realm) + return -ENOMEM; + memcpy(princ->realm, xdr, tmp); + princ->realm[tmp] = 0; + tmp = (tmp + 3) & ~3; + toklen -= tmp; + xdr += tmp >> 2; + + _debug("%s/...@%s", princ->name_parts[0], princ->realm); + + *_xdr = xdr; + *_toklen = toklen; + _leave(" = 0 [toklen=%u]", toklen); + return 0; +} + +/* + * extract a piece of krb5 tagged data + */ +static int rxrpc_krb5_decode_tagged_data(struct krb5_tagged_data *td, + size_t max_data_size, + const __be32 **_xdr, + unsigned int *_toklen) +{ + const __be32 *xdr = *_xdr; + unsigned int toklen = *_toklen, len; + + /* there must be at least one tag and one length word */ + if (toklen <= 8) + return -EINVAL; + + _enter(",%zu,{%x,%x},%u", + max_data_size, ntohl(xdr[0]), ntohl(xdr[1]), toklen); + + td->tag = ntohl(*xdr++); + len = ntohl(*xdr++); + toklen -= 8; + if (len > max_data_size) + return -EINVAL; + td->data_len = len; + + if (len > 0) { + td->data = kmemdup(xdr, len, GFP_KERNEL); + if (!td->data) + return -ENOMEM; + len = (len + 3) & ~3; + toklen -= len; + xdr += len >> 2; + } + + _debug("tag %x len %x", td->tag, td->data_len); + + *_xdr = xdr; + *_toklen = toklen; + _leave(" = 0 [toklen=%u]", toklen); + return 0; +} + +/* + * extract an array of tagged data + */ +static int rxrpc_krb5_decode_tagged_array(struct krb5_tagged_data **_td, + u8 *_n_elem, + u8 max_n_elem, + size_t max_elem_size, + const __be32 **_xdr, + unsigned int *_toklen) +{ + struct krb5_tagged_data *td; + const __be32 *xdr = *_xdr; + unsigned int toklen = *_toklen, n_elem, loop; + int ret; + + /* there must be at least one count */ + if (toklen < 4) + return -EINVAL; + + _enter(",,%u,%zu,{%x},%u", + max_n_elem, max_elem_size, ntohl(xdr[0]), toklen); + + n_elem = ntohl(*xdr++); + toklen -= 4; + if (n_elem > max_n_elem) + return -EINVAL; + *_n_elem = n_elem; + if (n_elem > 0) { + if (toklen <= (n_elem + 1) * 4) + return -EINVAL; + + _debug("n_elem %d", n_elem); + + td = kcalloc(n_elem, sizeof(struct krb5_tagged_data), + GFP_KERNEL); + if (!td) + return -ENOMEM; + *_td = td; + + for (loop = 0; loop < n_elem; loop++) { + ret = rxrpc_krb5_decode_tagged_data(&td[loop], + max_elem_size, + &xdr, &toklen); + if (ret < 0) + return ret; + } + } + + *_xdr = xdr; + *_toklen = toklen; + _leave(" = 0 [toklen=%u]", toklen); + return 0; +} + +/* + * extract a krb5 ticket + */ +static int rxrpc_krb5_decode_ticket(u8 **_ticket, u16 *_tktlen, + const __be32 **_xdr, unsigned int *_toklen) +{ + const __be32 *xdr = *_xdr; + unsigned int toklen = *_toklen, len; + + /* there must be at least one length word */ + if (toklen <= 4) + return -EINVAL; + + _enter(",{%x},%u", ntohl(xdr[0]), toklen); + + len = ntohl(*xdr++); + toklen -= 4; + if (len > AFSTOKEN_K5_TIX_MAX) + return -EINVAL; + *_tktlen = len; + + _debug("ticket len %u", len); + + if (len > 0) { + *_ticket = kmemdup(xdr, len, GFP_KERNEL); + if (!*_ticket) + return -ENOMEM; + len = (len + 3) & ~3; + toklen -= len; + xdr += len >> 2; + } + + *_xdr = xdr; + *_toklen = toklen; + _leave(" = 0 [toklen=%u]", toklen); + return 0; +} + +/* + * parse an RxK5 type XDR format token + * - the caller guarantees we have at least 4 words + */ +static int rxrpc_preparse_xdr_rxk5(struct key_preparsed_payload *prep, + size_t datalen, + const __be32 *xdr, unsigned int toklen) +{ + struct rxrpc_key_token *token, **pptoken; + struct rxk5_key *rxk5; + const __be32 *end_xdr = xdr + (toklen >> 2); + int ret; + + _enter(",{%x,%x,%x,%x},%u", + ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), + toklen); + + /* reserve some payload space for this subkey - the length of the token + * is a reasonable approximation */ + prep->quotalen = datalen + toklen; + + token = kzalloc(sizeof(*token), GFP_KERNEL); + if (!token) + return -ENOMEM; + + rxk5 = kzalloc(sizeof(*rxk5), GFP_KERNEL); + if (!rxk5) { + kfree(token); + return -ENOMEM; + } + + token->security_index = RXRPC_SECURITY_RXK5; + token->k5 = rxk5; + + /* extract the principals */ + ret = rxrpc_krb5_decode_principal(&rxk5->client, &xdr, &toklen); + if (ret < 0) + goto error; + ret = rxrpc_krb5_decode_principal(&rxk5->server, &xdr, &toklen); + if (ret < 0) + goto error; + + /* extract the session key and the encoding type (the tag field -> + * ENCTYPE_xxx) */ + ret = rxrpc_krb5_decode_tagged_data(&rxk5->session, AFSTOKEN_DATA_MAX, + &xdr, &toklen); + if (ret < 0) + goto error; + + if (toklen < 4 * 8 + 2 * 4) + goto inval; + rxk5->authtime = be64_to_cpup((const __be64 *) xdr); + xdr += 2; + rxk5->starttime = be64_to_cpup((const __be64 *) xdr); + xdr += 2; + rxk5->endtime = be64_to_cpup((const __be64 *) xdr); + xdr += 2; + rxk5->renew_till = be64_to_cpup((const __be64 *) xdr); + xdr += 2; + rxk5->is_skey = ntohl(*xdr++); + rxk5->flags = ntohl(*xdr++); + toklen -= 4 * 8 + 2 * 4; + + _debug("times: a=%llx s=%llx e=%llx rt=%llx", + rxk5->authtime, rxk5->starttime, rxk5->endtime, + rxk5->renew_till); + _debug("is_skey=%x flags=%x", rxk5->is_skey, rxk5->flags); + + /* extract the permitted client addresses */ + ret = rxrpc_krb5_decode_tagged_array(&rxk5->addresses, + &rxk5->n_addresses, + AFSTOKEN_K5_ADDRESSES_MAX, + AFSTOKEN_DATA_MAX, + &xdr, &toklen); + if (ret < 0) + goto error; + + ASSERTCMP((end_xdr - xdr) << 2, ==, toklen); + + /* extract the tickets */ + ret = rxrpc_krb5_decode_ticket(&rxk5->ticket, &rxk5->ticket_len, + &xdr, &toklen); + if (ret < 0) + goto error; + ret = rxrpc_krb5_decode_ticket(&rxk5->ticket2, &rxk5->ticket2_len, + &xdr, &toklen); + if (ret < 0) + goto error; + + ASSERTCMP((end_xdr - xdr) << 2, ==, toklen); + + /* extract the typed auth data */ + ret = rxrpc_krb5_decode_tagged_array(&rxk5->authdata, + &rxk5->n_authdata, + AFSTOKEN_K5_AUTHDATA_MAX, + AFSTOKEN_BDATALN_MAX, + &xdr, &toklen); + if (ret < 0) + goto error; + + ASSERTCMP((end_xdr - xdr) << 2, ==, toklen); + + if (toklen != 0) + goto inval; + + /* attach the payload */ + for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0]; + *pptoken; + pptoken = &(*pptoken)->next) + continue; + *pptoken = token; + if (token->kad->expiry < prep->expiry) + prep->expiry = token->kad->expiry; + + _leave(" = 0"); + return 0; + +inval: + ret = -EINVAL; +error: + rxrpc_rxk5_free(rxk5); + kfree(token); + _leave(" = %d", ret); + return ret; +} + +/* + * attempt to parse the data as the XDR format + * - the caller guarantees we have more than 7 words + */ +static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep) +{ + const __be32 *xdr = prep->data, *token; + const char *cp; + unsigned int len, tmp, loop, ntoken, toklen, sec_ix; + size_t datalen = prep->datalen; + int ret; + + _enter(",{%x,%x,%x,%x},%zu", + ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), + prep->datalen); + + if (datalen > AFSTOKEN_LENGTH_MAX) + goto not_xdr; + + /* XDR is an array of __be32's */ + if (datalen & 3) + goto not_xdr; + + /* the flags should be 0 (the setpag bit must be handled by + * userspace) */ + if (ntohl(*xdr++) != 0) + goto not_xdr; + datalen -= 4; + + /* check the cell name */ + len = ntohl(*xdr++); + if (len < 1 || len > AFSTOKEN_CELL_MAX) + goto not_xdr; + datalen -= 4; + tmp = (len + 3) & ~3; + if (tmp > datalen) + goto not_xdr; + + cp = (const char *) xdr; + for (loop = 0; loop < len; loop++) + if (!isprint(cp[loop])) + goto not_xdr; + if (len < tmp) + for (; loop < tmp; loop++) + if (cp[loop]) + goto not_xdr; + _debug("cellname: [%u/%u] '%*.*s'", + len, tmp, len, len, (const char *) xdr); + datalen -= tmp; + xdr += tmp >> 2; + + /* get the token count */ + if (datalen < 12) + goto not_xdr; + ntoken = ntohl(*xdr++); + datalen -= 4; + _debug("ntoken: %x", ntoken); + if (ntoken < 1 || ntoken > AFSTOKEN_MAX) + goto not_xdr; + + /* check each token wrapper */ + token = xdr; + loop = ntoken; + do { + if (datalen < 8) + goto not_xdr; + toklen = ntohl(*xdr++); + sec_ix = ntohl(*xdr); + datalen -= 4; + _debug("token: [%x/%zx] %x", toklen, datalen, sec_ix); + if (toklen < 20 || toklen > datalen) + goto not_xdr; + datalen -= (toklen + 3) & ~3; + xdr += (toklen + 3) >> 2; + + } while (--loop > 0); + + _debug("remainder: %zu", datalen); + if (datalen != 0) + goto not_xdr; + + /* okay: we're going to assume it's valid XDR format + * - we ignore the cellname, relying on the key to be correctly named + */ + do { + xdr = token; + toklen = ntohl(*xdr++); + token = xdr + ((toklen + 3) >> 2); + sec_ix = ntohl(*xdr++); + toklen -= 4; + + _debug("TOKEN type=%u [%p-%p]", sec_ix, xdr, token); + + switch (sec_ix) { + case RXRPC_SECURITY_RXKAD: + ret = rxrpc_preparse_xdr_rxkad(prep, datalen, xdr, toklen); + if (ret != 0) + goto error; + break; + + case RXRPC_SECURITY_RXK5: + ret = rxrpc_preparse_xdr_rxk5(prep, datalen, xdr, toklen); + if (ret != 0) + goto error; + break; + + default: + ret = -EPROTONOSUPPORT; + goto error; + } + + } while (--ntoken > 0); + + _leave(" = 0"); + return 0; + +not_xdr: + _leave(" = -EPROTO"); + return -EPROTO; +error: + _leave(" = %d", ret); + return ret; +} + +/* + * Preparse an rxrpc defined key. + * + * Data should be of the form: + * OFFSET LEN CONTENT + * 0 4 key interface version number + * 4 2 security index (type) + * 6 2 ticket length + * 8 4 key expiry time (time_t) + * 12 4 kvno + * 16 8 session key + * 24 [len] ticket + * + * if no data is provided, then a no-security key is made + */ +static int rxrpc_preparse(struct key_preparsed_payload *prep) +{ + const struct rxrpc_key_data_v1 *v1; + struct rxrpc_key_token *token, **pp; + size_t plen; + u32 kver; + int ret; + + _enter("%zu", prep->datalen); + + /* handle a no-security key */ + if (!prep->data && prep->datalen == 0) + return 0; + + /* determine if the XDR payload format is being used */ + if (prep->datalen > 7 * 4) { + ret = rxrpc_preparse_xdr(prep); + if (ret != -EPROTO) + return ret; + } + + /* get the key interface version number */ + ret = -EINVAL; + if (prep->datalen <= 4 || !prep->data) + goto error; + memcpy(&kver, prep->data, sizeof(kver)); + prep->data += sizeof(kver); + prep->datalen -= sizeof(kver); + + _debug("KEY I/F VERSION: %u", kver); + + ret = -EKEYREJECTED; + if (kver != 1) + goto error; + + /* deal with a version 1 key */ + ret = -EINVAL; + if (prep->datalen < sizeof(*v1)) + goto error; + + v1 = prep->data; + if (prep->datalen != sizeof(*v1) + v1->ticket_length) + goto error; + + _debug("SCIX: %u", v1->security_index); + _debug("TLEN: %u", v1->ticket_length); + _debug("EXPY: %x", v1->expiry); + _debug("KVNO: %u", v1->kvno); + _debug("SKEY: %02x%02x%02x%02x%02x%02x%02x%02x", + v1->session_key[0], v1->session_key[1], + v1->session_key[2], v1->session_key[3], + v1->session_key[4], v1->session_key[5], + v1->session_key[6], v1->session_key[7]); + if (v1->ticket_length >= 8) + _debug("TCKT: %02x%02x%02x%02x%02x%02x%02x%02x", + v1->ticket[0], v1->ticket[1], + v1->ticket[2], v1->ticket[3], + v1->ticket[4], v1->ticket[5], + v1->ticket[6], v1->ticket[7]); + + ret = -EPROTONOSUPPORT; + if (v1->security_index != RXRPC_SECURITY_RXKAD) + goto error; + + plen = sizeof(*token->kad) + v1->ticket_length; + prep->quotalen = plen + sizeof(*token); + + ret = -ENOMEM; + token = kzalloc(sizeof(*token), GFP_KERNEL); + if (!token) + goto error; + token->kad = kzalloc(plen, GFP_KERNEL); + if (!token->kad) + goto error_free; + + token->security_index = RXRPC_SECURITY_RXKAD; + token->kad->ticket_len = v1->ticket_length; + token->kad->expiry = v1->expiry; + token->kad->kvno = v1->kvno; + memcpy(&token->kad->session_key, &v1->session_key, 8); + memcpy(&token->kad->ticket, v1->ticket, v1->ticket_length); + + /* count the number of tokens attached */ + prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1); + + /* attach the data */ + pp = (struct rxrpc_key_token **)&prep->payload.data[0]; + while (*pp) + pp = &(*pp)->next; + *pp = token; + if (token->kad->expiry < prep->expiry) + prep->expiry = token->kad->expiry; + token = NULL; + ret = 0; + +error_free: + kfree(token); +error: + return ret; +} + +/* + * Free token list. + */ +static void rxrpc_free_token_list(struct rxrpc_key_token *token) +{ + struct rxrpc_key_token *next; + + for (; token; token = next) { + next = token->next; + switch (token->security_index) { + case RXRPC_SECURITY_RXKAD: + kfree(token->kad); + break; + case RXRPC_SECURITY_RXK5: + if (token->k5) + rxrpc_rxk5_free(token->k5); + break; + default: + pr_err("Unknown token type %x on rxrpc key\n", + token->security_index); + BUG(); + } + + kfree(token); + } +} + +/* + * Clean up preparse data. + */ +static void rxrpc_free_preparse(struct key_preparsed_payload *prep) +{ + rxrpc_free_token_list(prep->payload.data[0]); +} + +/* + * Preparse a server secret key. + * + * The data should be the 8-byte secret key. + */ +static int rxrpc_preparse_s(struct key_preparsed_payload *prep) +{ + struct crypto_skcipher *ci; + + _enter("%zu", prep->datalen); + + if (prep->datalen != 8) + return -EINVAL; + + memcpy(&prep->payload.data[2], prep->data, 8); + + ci = crypto_alloc_skcipher("pcbc(des)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(ci)) { + _leave(" = %ld", PTR_ERR(ci)); + return PTR_ERR(ci); + } + + if (crypto_skcipher_setkey(ci, prep->data, 8) < 0) + BUG(); + + prep->payload.data[0] = ci; + _leave(" = 0"); + return 0; +} + +/* + * Clean up preparse data. + */ +static void rxrpc_free_preparse_s(struct key_preparsed_payload *prep) +{ + if (prep->payload.data[0]) + crypto_free_skcipher(prep->payload.data[0]); +} + +/* + * dispose of the data dangling from the corpse of a rxrpc key + */ +static void rxrpc_destroy(struct key *key) +{ + rxrpc_free_token_list(key->payload.data[0]); +} + +/* + * dispose of the data dangling from the corpse of a rxrpc key + */ +static void rxrpc_destroy_s(struct key *key) +{ + if (key->payload.data[0]) { + crypto_free_skcipher(key->payload.data[0]); + key->payload.data[0] = NULL; + } +} + +/* + * describe the rxrpc key + */ +static void rxrpc_describe(const struct key *key, struct seq_file *m) +{ + seq_puts(m, key->description); +} + +/* + * grab the security key for a socket + */ +int rxrpc_request_key(struct rxrpc_sock *rx, char __user *optval, int optlen) +{ + struct key *key; + char *description; + + _enter(""); + + if (optlen <= 0 || optlen > PAGE_SIZE - 1) + return -EINVAL; + + description = memdup_user_nul(optval, optlen); + if (IS_ERR(description)) + return PTR_ERR(description); + + key = request_key(&key_type_rxrpc, description, NULL); + if (IS_ERR(key)) { + kfree(description); + _leave(" = %ld", PTR_ERR(key)); + return PTR_ERR(key); + } + + rx->key = key; + kfree(description); + _leave(" = 0 [key %x]", key->serial); + return 0; +} + +/* + * grab the security keyring for a server socket + */ +int rxrpc_server_keyring(struct rxrpc_sock *rx, char __user *optval, + int optlen) +{ + struct key *key; + char *description; + + _enter(""); + + if (optlen <= 0 || optlen > PAGE_SIZE - 1) + return -EINVAL; + + description = memdup_user_nul(optval, optlen); + if (IS_ERR(description)) + return PTR_ERR(description); + + key = request_key(&key_type_keyring, description, NULL); + if (IS_ERR(key)) { + kfree(description); + _leave(" = %ld", PTR_ERR(key)); + return PTR_ERR(key); + } + + rx->securities = key; + kfree(description); + _leave(" = 0 [key %x]", key->serial); + return 0; +} + +/* + * generate a server data key + */ +int rxrpc_get_server_data_key(struct rxrpc_connection *conn, + const void *session_key, + time_t expiry, + u32 kvno) +{ + const struct cred *cred = current_cred(); + struct key *key; + int ret; + + struct { + u32 kver; + struct rxrpc_key_data_v1 v1; + } data; + + _enter(""); + + key = key_alloc(&key_type_rxrpc, "x", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, 0, + KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(key)) { + _leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key)); + return -ENOMEM; + } + + _debug("key %d", key_serial(key)); + + data.kver = 1; + data.v1.security_index = RXRPC_SECURITY_RXKAD; + data.v1.ticket_length = 0; + data.v1.expiry = expiry; + data.v1.kvno = 0; + + memcpy(&data.v1.session_key, session_key, sizeof(data.v1.session_key)); + + ret = key_instantiate_and_link(key, &data, sizeof(data), NULL, NULL); + if (ret < 0) + goto error; + + conn->key = key; + _leave(" = 0 [%d]", key_serial(key)); + return 0; + +error: + key_revoke(key); + key_put(key); + _leave(" = -ENOMEM [ins %d]", ret); + return -ENOMEM; +} +EXPORT_SYMBOL(rxrpc_get_server_data_key); + +/** + * rxrpc_get_null_key - Generate a null RxRPC key + * @keyname: The name to give the key. + * + * Generate a null RxRPC key that can be used to indicate anonymous security is + * required for a particular domain. + */ +struct key *rxrpc_get_null_key(const char *keyname) +{ + const struct cred *cred = current_cred(); + struct key *key; + int ret; + + key = key_alloc(&key_type_rxrpc, keyname, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, + KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(key)) + return key; + + ret = key_instantiate_and_link(key, NULL, 0, NULL, NULL); + if (ret < 0) { + key_revoke(key); + key_put(key); + return ERR_PTR(ret); + } + + return key; +} +EXPORT_SYMBOL(rxrpc_get_null_key); + +/* + * read the contents of an rxrpc key + * - this returns the result in XDR form + */ +static long rxrpc_read(const struct key *key, + char __user *buffer, size_t buflen) +{ + const struct rxrpc_key_token *token; + const struct krb5_principal *princ; + size_t size; + __be32 __user *xdr, *oldxdr; + u32 cnlen, toksize, ntoks, tok, zero; + u16 toksizes[AFSTOKEN_MAX]; + int loop; + + _enter(""); + + /* we don't know what form we should return non-AFS keys in */ + if (memcmp(key->description, "afs@", 4) != 0) + return -EOPNOTSUPP; + cnlen = strlen(key->description + 4); + +#define RND(X) (((X) + 3) & ~3) + + /* AFS keys we return in XDR form, so we need to work out the size of + * the XDR */ + size = 2 * 4; /* flags, cellname len */ + size += RND(cnlen); /* cellname */ + size += 1 * 4; /* token count */ + + ntoks = 0; + for (token = key->payload.data[0]; token; token = token->next) { + toksize = 4; /* sec index */ + + switch (token->security_index) { + case RXRPC_SECURITY_RXKAD: + toksize += 8 * 4; /* viceid, kvno, key*2, begin, + * end, primary, tktlen */ + toksize += RND(token->kad->ticket_len); + break; + + case RXRPC_SECURITY_RXK5: + princ = &token->k5->client; + toksize += 4 + princ->n_name_parts * 4; + for (loop = 0; loop < princ->n_name_parts; loop++) + toksize += RND(strlen(princ->name_parts[loop])); + toksize += 4 + RND(strlen(princ->realm)); + + princ = &token->k5->server; + toksize += 4 + princ->n_name_parts * 4; + for (loop = 0; loop < princ->n_name_parts; loop++) + toksize += RND(strlen(princ->name_parts[loop])); + toksize += 4 + RND(strlen(princ->realm)); + + toksize += 8 + RND(token->k5->session.data_len); + + toksize += 4 * 8 + 2 * 4; + + toksize += 4 + token->k5->n_addresses * 8; + for (loop = 0; loop < token->k5->n_addresses; loop++) + toksize += RND(token->k5->addresses[loop].data_len); + + toksize += 4 + RND(token->k5->ticket_len); + toksize += 4 + RND(token->k5->ticket2_len); + + toksize += 4 + token->k5->n_authdata * 8; + for (loop = 0; loop < token->k5->n_authdata; loop++) + toksize += RND(token->k5->authdata[loop].data_len); + break; + + default: /* we have a ticket we can't encode */ + BUG(); + continue; + } + + _debug("token[%u]: toksize=%u", ntoks, toksize); + ASSERTCMP(toksize, <=, AFSTOKEN_LENGTH_MAX); + + toksizes[ntoks++] = toksize; + size += toksize + 4; /* each token has a length word */ + } + +#undef RND + + if (!buffer || buflen < size) + return size; + + xdr = (__be32 __user *) buffer; + zero = 0; +#define ENCODE(x) \ + do { \ + __be32 y = htonl(x); \ + if (put_user(y, xdr++) < 0) \ + goto fault; \ + } while(0) +#define ENCODE_DATA(l, s) \ + do { \ + u32 _l = (l); \ + ENCODE(l); \ + if (copy_to_user(xdr, (s), _l) != 0) \ + goto fault; \ + if (_l & 3 && \ + copy_to_user((u8 __user *)xdr + _l, &zero, 4 - (_l & 3)) != 0) \ + goto fault; \ + xdr += (_l + 3) >> 2; \ + } while(0) +#define ENCODE64(x) \ + do { \ + __be64 y = cpu_to_be64(x); \ + if (copy_to_user(xdr, &y, 8) != 0) \ + goto fault; \ + xdr += 8 >> 2; \ + } while(0) +#define ENCODE_STR(s) \ + do { \ + const char *_s = (s); \ + ENCODE_DATA(strlen(_s), _s); \ + } while(0) + + ENCODE(0); /* flags */ + ENCODE_DATA(cnlen, key->description + 4); /* cellname */ + ENCODE(ntoks); + + tok = 0; + for (token = key->payload.data[0]; token; token = token->next) { + toksize = toksizes[tok++]; + ENCODE(toksize); + oldxdr = xdr; + ENCODE(token->security_index); + + switch (token->security_index) { + case RXRPC_SECURITY_RXKAD: + ENCODE(token->kad->vice_id); + ENCODE(token->kad->kvno); + ENCODE_DATA(8, token->kad->session_key); + ENCODE(token->kad->start); + ENCODE(token->kad->expiry); + ENCODE(token->kad->primary_flag); + ENCODE_DATA(token->kad->ticket_len, token->kad->ticket); + break; + + case RXRPC_SECURITY_RXK5: + princ = &token->k5->client; + ENCODE(princ->n_name_parts); + for (loop = 0; loop < princ->n_name_parts; loop++) + ENCODE_STR(princ->name_parts[loop]); + ENCODE_STR(princ->realm); + + princ = &token->k5->server; + ENCODE(princ->n_name_parts); + for (loop = 0; loop < princ->n_name_parts; loop++) + ENCODE_STR(princ->name_parts[loop]); + ENCODE_STR(princ->realm); + + ENCODE(token->k5->session.tag); + ENCODE_DATA(token->k5->session.data_len, + token->k5->session.data); + + ENCODE64(token->k5->authtime); + ENCODE64(token->k5->starttime); + ENCODE64(token->k5->endtime); + ENCODE64(token->k5->renew_till); + ENCODE(token->k5->is_skey); + ENCODE(token->k5->flags); + + ENCODE(token->k5->n_addresses); + for (loop = 0; loop < token->k5->n_addresses; loop++) { + ENCODE(token->k5->addresses[loop].tag); + ENCODE_DATA(token->k5->addresses[loop].data_len, + token->k5->addresses[loop].data); + } + + ENCODE_DATA(token->k5->ticket_len, token->k5->ticket); + ENCODE_DATA(token->k5->ticket2_len, token->k5->ticket2); + + ENCODE(token->k5->n_authdata); + for (loop = 0; loop < token->k5->n_authdata; loop++) { + ENCODE(token->k5->authdata[loop].tag); + ENCODE_DATA(token->k5->authdata[loop].data_len, + token->k5->authdata[loop].data); + } + break; + + default: + BUG(); + break; + } + + ASSERTCMP((unsigned long)xdr - (unsigned long)oldxdr, ==, + toksize); + } + +#undef ENCODE_STR +#undef ENCODE_DATA +#undef ENCODE64 +#undef ENCODE + + ASSERTCMP(tok, ==, ntoks); + ASSERTCMP((char __user *) xdr - buffer, ==, size); + _leave(" = %zu", size); + return size; + +fault: + _leave(" = -EFAULT"); + return -EFAULT; +} diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c new file mode 100644 index 000000000000..111f250b045f --- /dev/null +++ b/net/rxrpc/local_object.c @@ -0,0 +1,417 @@ +/* AF_RXRPC local endpoint management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +static const char rxrpc_version_string[65] = "linux-" UTS_RELEASE " AF_RXRPC"; + +static LIST_HEAD(rxrpc_locals); +DEFINE_RWLOCK(rxrpc_local_lock); +static DECLARE_RWSEM(rxrpc_local_sem); +static DECLARE_WAIT_QUEUE_HEAD(rxrpc_local_wq); + +static void rxrpc_destroy_local(struct work_struct *work); +static void rxrpc_process_local_events(struct work_struct *work); + +/* + * allocate a new local + */ +static +struct rxrpc_local *rxrpc_alloc_local(struct sockaddr_rxrpc *srx) +{ + struct rxrpc_local *local; + + local = kzalloc(sizeof(struct rxrpc_local), GFP_KERNEL); + if (local) { + INIT_WORK(&local->destroyer, &rxrpc_destroy_local); + INIT_WORK(&local->acceptor, &rxrpc_accept_incoming_calls); + INIT_WORK(&local->rejecter, &rxrpc_reject_packets); + INIT_WORK(&local->event_processor, &rxrpc_process_local_events); + INIT_LIST_HEAD(&local->services); + INIT_LIST_HEAD(&local->link); + init_rwsem(&local->defrag_sem); + skb_queue_head_init(&local->accept_queue); + skb_queue_head_init(&local->reject_queue); + skb_queue_head_init(&local->event_queue); + spin_lock_init(&local->lock); + rwlock_init(&local->services_lock); + atomic_set(&local->usage, 1); + local->debug_id = atomic_inc_return(&rxrpc_debug_id); + memcpy(&local->srx, srx, sizeof(*srx)); + } + + _leave(" = %p", local); + return local; +} + +/* + * create the local socket + * - must be called with rxrpc_local_sem writelocked + */ +static int rxrpc_create_local(struct rxrpc_local *local) +{ + struct sock *sock; + int ret, opt; + + _enter("%p{%d}", local, local->srx.transport_type); + + /* create a socket to represent the local endpoint */ + ret = sock_create_kern(&init_net, PF_INET, local->srx.transport_type, + IPPROTO_UDP, &local->socket); + if (ret < 0) { + _leave(" = %d [socket]", ret); + return ret; + } + + /* if a local address was supplied then bind it */ + if (local->srx.transport_len > sizeof(sa_family_t)) { + _debug("bind"); + ret = kernel_bind(local->socket, + (struct sockaddr *) &local->srx.transport, + local->srx.transport_len); + if (ret < 0) { + _debug("bind failed"); + goto error; + } + } + + /* we want to receive ICMP errors */ + opt = 1; + ret = kernel_setsockopt(local->socket, SOL_IP, IP_RECVERR, + (char *) &opt, sizeof(opt)); + if (ret < 0) { + _debug("setsockopt failed"); + goto error; + } + + /* we want to set the don't fragment bit */ + opt = IP_PMTUDISC_DO; + ret = kernel_setsockopt(local->socket, SOL_IP, IP_MTU_DISCOVER, + (char *) &opt, sizeof(opt)); + if (ret < 0) { + _debug("setsockopt failed"); + goto error; + } + + write_lock_bh(&rxrpc_local_lock); + list_add(&local->link, &rxrpc_locals); + write_unlock_bh(&rxrpc_local_lock); + + /* set the socket up */ + sock = local->socket->sk; + sock->sk_user_data = local; + sock->sk_data_ready = rxrpc_data_ready; + sock->sk_error_report = rxrpc_UDP_error_report; + _leave(" = 0"); + return 0; + +error: + kernel_sock_shutdown(local->socket, SHUT_RDWR); + local->socket->sk->sk_user_data = NULL; + sock_release(local->socket); + local->socket = NULL; + + _leave(" = %d", ret); + return ret; +} + +/* + * create a new local endpoint using the specified UDP address + */ +struct rxrpc_local *rxrpc_lookup_local(struct sockaddr_rxrpc *srx) +{ + struct rxrpc_local *local; + int ret; + + _enter("{%d,%u,%pI4+%hu}", + srx->transport_type, + srx->transport.family, + &srx->transport.sin.sin_addr, + ntohs(srx->transport.sin.sin_port)); + + down_write(&rxrpc_local_sem); + + /* see if we have a suitable local local endpoint already */ + read_lock_bh(&rxrpc_local_lock); + + list_for_each_entry(local, &rxrpc_locals, link) { + _debug("CMP {%d,%u,%pI4+%hu}", + local->srx.transport_type, + local->srx.transport.family, + &local->srx.transport.sin.sin_addr, + ntohs(local->srx.transport.sin.sin_port)); + + if (local->srx.transport_type != srx->transport_type || + local->srx.transport.family != srx->transport.family) + continue; + + switch (srx->transport.family) { + case AF_INET: + if (local->srx.transport.sin.sin_port != + srx->transport.sin.sin_port) + continue; + if (memcmp(&local->srx.transport.sin.sin_addr, + &srx->transport.sin.sin_addr, + sizeof(struct in_addr)) != 0) + continue; + goto found_local; + + default: + BUG(); + } + } + + read_unlock_bh(&rxrpc_local_lock); + + /* we didn't find one, so we need to create one */ + local = rxrpc_alloc_local(srx); + if (!local) { + up_write(&rxrpc_local_sem); + return ERR_PTR(-ENOMEM); + } + + ret = rxrpc_create_local(local); + if (ret < 0) { + up_write(&rxrpc_local_sem); + kfree(local); + _leave(" = %d", ret); + return ERR_PTR(ret); + } + + up_write(&rxrpc_local_sem); + + _net("LOCAL new %d {%d,%u,%pI4+%hu}", + local->debug_id, + local->srx.transport_type, + local->srx.transport.family, + &local->srx.transport.sin.sin_addr, + ntohs(local->srx.transport.sin.sin_port)); + + _leave(" = %p [new]", local); + return local; + +found_local: + rxrpc_get_local(local); + read_unlock_bh(&rxrpc_local_lock); + up_write(&rxrpc_local_sem); + + _net("LOCAL old %d {%d,%u,%pI4+%hu}", + local->debug_id, + local->srx.transport_type, + local->srx.transport.family, + &local->srx.transport.sin.sin_addr, + ntohs(local->srx.transport.sin.sin_port)); + + _leave(" = %p [reuse]", local); + return local; +} + +/* + * release a local endpoint + */ +void rxrpc_put_local(struct rxrpc_local *local) +{ + _enter("%p{u=%d}", local, atomic_read(&local->usage)); + + ASSERTCMP(atomic_read(&local->usage), >, 0); + + /* to prevent a race, the decrement and the dequeue must be effectively + * atomic */ + write_lock_bh(&rxrpc_local_lock); + if (unlikely(atomic_dec_and_test(&local->usage))) { + _debug("destroy local"); + rxrpc_queue_work(&local->destroyer); + } + write_unlock_bh(&rxrpc_local_lock); + _leave(""); +} + +/* + * destroy a local endpoint + */ +static void rxrpc_destroy_local(struct work_struct *work) +{ + struct rxrpc_local *local = + container_of(work, struct rxrpc_local, destroyer); + + _enter("%p{%d}", local, atomic_read(&local->usage)); + + down_write(&rxrpc_local_sem); + + write_lock_bh(&rxrpc_local_lock); + if (atomic_read(&local->usage) > 0) { + write_unlock_bh(&rxrpc_local_lock); + up_read(&rxrpc_local_sem); + _leave(" [resurrected]"); + return; + } + + list_del(&local->link); + local->socket->sk->sk_user_data = NULL; + write_unlock_bh(&rxrpc_local_lock); + + downgrade_write(&rxrpc_local_sem); + + ASSERT(list_empty(&local->services)); + ASSERT(!work_pending(&local->acceptor)); + ASSERT(!work_pending(&local->rejecter)); + ASSERT(!work_pending(&local->event_processor)); + + /* finish cleaning up the local descriptor */ + rxrpc_purge_queue(&local->accept_queue); + rxrpc_purge_queue(&local->reject_queue); + rxrpc_purge_queue(&local->event_queue); + kernel_sock_shutdown(local->socket, SHUT_RDWR); + sock_release(local->socket); + + up_read(&rxrpc_local_sem); + + _net("DESTROY LOCAL %d", local->debug_id); + kfree(local); + + if (list_empty(&rxrpc_locals)) + wake_up_all(&rxrpc_local_wq); + + _leave(""); +} + +/* + * preemptively destroy all local local endpoint rather than waiting for + * them to be destroyed + */ +void __exit rxrpc_destroy_all_locals(void) +{ + DECLARE_WAITQUEUE(myself,current); + + _enter(""); + + /* we simply have to wait for them to go away */ + if (!list_empty(&rxrpc_locals)) { + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&rxrpc_local_wq, &myself); + + while (!list_empty(&rxrpc_locals)) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + + remove_wait_queue(&rxrpc_local_wq, &myself); + set_current_state(TASK_RUNNING); + } + + _leave(""); +} + +/* + * Reply to a version request + */ +static void rxrpc_send_version_request(struct rxrpc_local *local, + struct rxrpc_host_header *hdr, + struct sk_buff *skb) +{ + struct rxrpc_wire_header whdr; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct sockaddr_in sin; + struct msghdr msg; + struct kvec iov[2]; + size_t len; + int ret; + + _enter(""); + + sin.sin_family = AF_INET; + sin.sin_port = udp_hdr(skb)->source; + sin.sin_addr.s_addr = ip_hdr(skb)->saddr; + + msg.msg_name = &sin; + msg.msg_namelen = sizeof(sin); + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + whdr.epoch = htonl(sp->hdr.epoch); + whdr.cid = htonl(sp->hdr.cid); + whdr.callNumber = htonl(sp->hdr.callNumber); + whdr.seq = 0; + whdr.serial = 0; + whdr.type = RXRPC_PACKET_TYPE_VERSION; + whdr.flags = RXRPC_LAST_PACKET | (~hdr->flags & RXRPC_CLIENT_INITIATED); + whdr.userStatus = 0; + whdr.securityIndex = 0; + whdr._rsvd = 0; + whdr.serviceId = htons(sp->hdr.serviceId); + + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); + iov[1].iov_base = (char *)rxrpc_version_string; + iov[1].iov_len = sizeof(rxrpc_version_string); + + len = iov[0].iov_len + iov[1].iov_len; + + _proto("Tx VERSION (reply)"); + + ret = kernel_sendmsg(local->socket, &msg, iov, 2, len); + if (ret < 0) + _debug("sendmsg failed: %d", ret); + + _leave(""); +} + +/* + * Process event packets targetted at a local endpoint. + */ +static void rxrpc_process_local_events(struct work_struct *work) +{ + struct rxrpc_local *local = container_of(work, struct rxrpc_local, event_processor); + struct sk_buff *skb; + char v; + + _enter(""); + + atomic_inc(&local->usage); + + while ((skb = skb_dequeue(&local->event_queue))) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + _debug("{%d},{%u}", local->debug_id, sp->hdr.type); + + switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_VERSION: + if (skb_copy_bits(skb, 0, &v, 1) < 0) + return; + _proto("Rx VERSION { %02x }", v); + if (v == 0) + rxrpc_send_version_request(local, &sp->hdr, skb); + break; + + default: + /* Just ignore anything we don't understand */ + break; + } + + rxrpc_put_local(local); + rxrpc_free_skb(skb); + } + + rxrpc_put_local(local); + _leave(""); +} diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c new file mode 100644 index 000000000000..2e3c4064e29c --- /dev/null +++ b/net/rxrpc/output.c @@ -0,0 +1,724 @@ +/* RxRPC packet transmission + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +/* + * Time till packet resend (in jiffies). + */ +unsigned int rxrpc_resend_timeout = 4 * HZ; + +static int rxrpc_send_data(struct rxrpc_sock *rx, + struct rxrpc_call *call, + struct msghdr *msg, size_t len); + +/* + * extract control messages from the sendmsg() control buffer + */ +static int rxrpc_sendmsg_cmsg(struct msghdr *msg, + unsigned long *user_call_ID, + enum rxrpc_command *command, + u32 *abort_code) +{ + struct cmsghdr *cmsg; + bool got_user_ID = false; + int len; + + *command = RXRPC_CMD_SEND_DATA; + + if (msg->msg_controllen == 0) + return -EINVAL; + + for_each_cmsghdr(cmsg, msg) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + len = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); + _debug("CMSG %d, %d, %d", + cmsg->cmsg_level, cmsg->cmsg_type, len); + + if (cmsg->cmsg_level != SOL_RXRPC) + continue; + + switch (cmsg->cmsg_type) { + case RXRPC_USER_CALL_ID: + if (msg->msg_flags & MSG_CMSG_COMPAT) { + if (len != sizeof(u32)) + return -EINVAL; + *user_call_ID = *(u32 *) CMSG_DATA(cmsg); + } else { + if (len != sizeof(unsigned long)) + return -EINVAL; + *user_call_ID = *(unsigned long *) + CMSG_DATA(cmsg); + } + _debug("User Call ID %lx", *user_call_ID); + got_user_ID = true; + break; + + case RXRPC_ABORT: + if (*command != RXRPC_CMD_SEND_DATA) + return -EINVAL; + *command = RXRPC_CMD_SEND_ABORT; + if (len != sizeof(*abort_code)) + return -EINVAL; + *abort_code = *(unsigned int *) CMSG_DATA(cmsg); + _debug("Abort %x", *abort_code); + if (*abort_code == 0) + return -EINVAL; + break; + + case RXRPC_ACCEPT: + if (*command != RXRPC_CMD_SEND_DATA) + return -EINVAL; + *command = RXRPC_CMD_ACCEPT; + if (len != 0) + return -EINVAL; + break; + + default: + return -EINVAL; + } + } + + if (!got_user_ID) + return -EINVAL; + _leave(" = 0"); + return 0; +} + +/* + * abort a call, sending an ABORT packet to the peer + */ +static void rxrpc_send_abort(struct rxrpc_call *call, u32 abort_code) +{ + write_lock_bh(&call->state_lock); + + if (call->state <= RXRPC_CALL_COMPLETE) { + call->state = RXRPC_CALL_LOCALLY_ABORTED; + call->local_abort = abort_code; + set_bit(RXRPC_CALL_EV_ABORT, &call->events); + del_timer_sync(&call->resend_timer); + del_timer_sync(&call->ack_timer); + clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); + clear_bit(RXRPC_CALL_EV_ACK, &call->events); + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + rxrpc_queue_call(call); + } + + write_unlock_bh(&call->state_lock); +} + +/* + * Create a new client call for sendmsg(). + */ +static struct rxrpc_call * +rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, + unsigned long user_call_ID) +{ + struct rxrpc_conn_bundle *bundle; + struct rxrpc_transport *trans; + struct rxrpc_call *call; + struct key *key; + long ret; + + DECLARE_SOCKADDR(struct sockaddr_rxrpc *, srx, msg->msg_name); + + _enter(""); + + if (!msg->msg_name) + return ERR_PTR(-EDESTADDRREQ); + + trans = rxrpc_name_to_transport(rx, msg->msg_name, msg->msg_namelen, 0, + GFP_KERNEL); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + key = rx->key; + if (key && !rx->key->payload.data[0]) + key = NULL; + bundle = rxrpc_get_bundle(rx, trans, key, srx->srx_service, GFP_KERNEL); + if (IS_ERR(bundle)) { + ret = PTR_ERR(bundle); + goto out_trans; + } + + call = rxrpc_new_client_call(rx, trans, bundle, user_call_ID, + GFP_KERNEL); + rxrpc_put_bundle(trans, bundle); + rxrpc_put_transport(trans); + if (IS_ERR(call)) { + ret = PTR_ERR(call); + goto out_trans; + } + + _leave(" = %p\n", call); + return call; + +out_trans: + rxrpc_put_transport(trans); +out: + _leave(" = %ld", ret); + return ERR_PTR(ret); +} + +/* + * send a message forming part of a client call through an RxRPC socket + * - caller holds the socket locked + * - the socket may be either a client socket or a server socket + */ +int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) +{ + enum rxrpc_command cmd; + struct rxrpc_call *call; + unsigned long user_call_ID = 0; + u32 abort_code = 0; + int ret; + + _enter(""); + + ret = rxrpc_sendmsg_cmsg(msg, &user_call_ID, &cmd, &abort_code); + if (ret < 0) + return ret; + + if (cmd == RXRPC_CMD_ACCEPT) { + if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) + return -EINVAL; + call = rxrpc_accept_call(rx, user_call_ID); + if (IS_ERR(call)) + return PTR_ERR(call); + rxrpc_put_call(call); + return 0; + } + + call = rxrpc_find_call_by_user_ID(rx, user_call_ID); + if (!call) { + if (cmd != RXRPC_CMD_SEND_DATA) + return -EBADSLT; + call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID); + if (IS_ERR(call)) + return PTR_ERR(call); + } + + _debug("CALL %d USR %lx ST %d on CONN %p", + call->debug_id, call->user_call_ID, call->state, call->conn); + + if (call->state >= RXRPC_CALL_COMPLETE) { + /* it's too late for this call */ + ret = -ECONNRESET; + } else if (cmd == RXRPC_CMD_SEND_ABORT) { + rxrpc_send_abort(call, abort_code); + ret = 0; + } else if (cmd != RXRPC_CMD_SEND_DATA) { + ret = -EINVAL; + } else if (!call->in_clientflag && + call->state != RXRPC_CALL_CLIENT_SEND_REQUEST) { + /* request phase complete for this client call */ + ret = -EPROTO; + } else if (call->in_clientflag && + call->state != RXRPC_CALL_SERVER_ACK_REQUEST && + call->state != RXRPC_CALL_SERVER_SEND_REPLY) { + /* Reply phase not begun or not complete for service call. */ + ret = -EPROTO; + } else { + ret = rxrpc_send_data(rx, call, msg, len); + } + + rxrpc_put_call(call); + _leave(" = %d", ret); + return ret; +} + +/** + * rxrpc_kernel_send_data - Allow a kernel service to send data on a call + * @call: The call to send data through + * @msg: The data to send + * @len: The amount of data to send + * + * Allow a kernel service to send data on a call. The call must be in an state + * appropriate to sending data. No control data should be supplied in @msg, + * nor should an address be supplied. MSG_MORE should be flagged if there's + * more data to come, otherwise this data will end the transmission phase. + */ +int rxrpc_kernel_send_data(struct rxrpc_call *call, struct msghdr *msg, + size_t len) +{ + int ret; + + _enter("{%d,%s},", call->debug_id, rxrpc_call_states[call->state]); + + ASSERTCMP(msg->msg_name, ==, NULL); + ASSERTCMP(msg->msg_control, ==, NULL); + + lock_sock(&call->socket->sk); + + _debug("CALL %d USR %lx ST %d on CONN %p", + call->debug_id, call->user_call_ID, call->state, call->conn); + + if (call->state >= RXRPC_CALL_COMPLETE) { + ret = -ESHUTDOWN; /* it's too late for this call */ + } else if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST && + call->state != RXRPC_CALL_SERVER_ACK_REQUEST && + call->state != RXRPC_CALL_SERVER_SEND_REPLY) { + ret = -EPROTO; /* request phase complete for this client call */ + } else { + ret = rxrpc_send_data(call->socket, call, msg, len); + } + + release_sock(&call->socket->sk); + _leave(" = %d", ret); + return ret; +} + +EXPORT_SYMBOL(rxrpc_kernel_send_data); + +/** + * rxrpc_kernel_abort_call - Allow a kernel service to abort a call + * @call: The call to be aborted + * @abort_code: The abort code to stick into the ABORT packet + * + * Allow a kernel service to abort a call, if it's still in an abortable state. + */ +void rxrpc_kernel_abort_call(struct rxrpc_call *call, u32 abort_code) +{ + _enter("{%d},%d", call->debug_id, abort_code); + + lock_sock(&call->socket->sk); + + _debug("CALL %d USR %lx ST %d on CONN %p", + call->debug_id, call->user_call_ID, call->state, call->conn); + + if (call->state < RXRPC_CALL_COMPLETE) + rxrpc_send_abort(call, abort_code); + + release_sock(&call->socket->sk); + _leave(""); +} + +EXPORT_SYMBOL(rxrpc_kernel_abort_call); + +/* + * send a packet through the transport endpoint + */ +int rxrpc_send_packet(struct rxrpc_transport *trans, struct sk_buff *skb) +{ + struct kvec iov[1]; + struct msghdr msg; + int ret, opt; + + _enter(",{%d}", skb->len); + + iov[0].iov_base = skb->head; + iov[0].iov_len = skb->len; + + msg.msg_name = &trans->peer->srx.transport.sin; + msg.msg_namelen = sizeof(trans->peer->srx.transport.sin); + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + /* send the packet with the don't fragment bit set if we currently + * think it's small enough */ + if (skb->len - sizeof(struct rxrpc_wire_header) < trans->peer->maxdata) { + down_read(&trans->local->defrag_sem); + /* send the packet by UDP + * - returns -EMSGSIZE if UDP would have to fragment the packet + * to go out of the interface + * - in which case, we'll have processed the ICMP error + * message and update the peer record + */ + ret = kernel_sendmsg(trans->local->socket, &msg, iov, 1, + iov[0].iov_len); + + up_read(&trans->local->defrag_sem); + if (ret == -EMSGSIZE) + goto send_fragmentable; + + _leave(" = %d [%u]", ret, trans->peer->maxdata); + return ret; + } + +send_fragmentable: + /* attempt to send this message with fragmentation enabled */ + _debug("send fragment"); + + down_write(&trans->local->defrag_sem); + opt = IP_PMTUDISC_DONT; + ret = kernel_setsockopt(trans->local->socket, SOL_IP, IP_MTU_DISCOVER, + (char *) &opt, sizeof(opt)); + if (ret == 0) { + ret = kernel_sendmsg(trans->local->socket, &msg, iov, 1, + iov[0].iov_len); + + opt = IP_PMTUDISC_DO; + kernel_setsockopt(trans->local->socket, SOL_IP, + IP_MTU_DISCOVER, (char *) &opt, sizeof(opt)); + } + + up_write(&trans->local->defrag_sem); + _leave(" = %d [frag %u]", ret, trans->peer->maxdata); + return ret; +} + +/* + * wait for space to appear in the transmit/ACK window + * - caller holds the socket locked + */ +static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, + struct rxrpc_call *call, + long *timeo) +{ + DECLARE_WAITQUEUE(myself, current); + int ret; + + _enter(",{%d},%ld", + CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail), + call->acks_winsz), + *timeo); + + add_wait_queue(&call->tx_waitq, &myself); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + ret = 0; + if (CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail), + call->acks_winsz) > 0) + break; + if (signal_pending(current)) { + ret = sock_intr_errno(*timeo); + break; + } + + release_sock(&rx->sk); + *timeo = schedule_timeout(*timeo); + lock_sock(&rx->sk); + } + + remove_wait_queue(&call->tx_waitq, &myself); + set_current_state(TASK_RUNNING); + _leave(" = %d", ret); + return ret; +} + +/* + * attempt to schedule an instant Tx resend + */ +static inline void rxrpc_instant_resend(struct rxrpc_call *call) +{ + read_lock_bh(&call->state_lock); + if (try_to_del_timer_sync(&call->resend_timer) >= 0) { + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + if (call->state < RXRPC_CALL_COMPLETE && + !test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) + rxrpc_queue_call(call); + } + read_unlock_bh(&call->state_lock); +} + +/* + * queue a packet for transmission, set the resend timer and attempt + * to send the packet immediately + */ +static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, + bool last) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + int ret; + + _net("queue skb %p [%d]", skb, call->acks_head); + + ASSERT(call->acks_window != NULL); + call->acks_window[call->acks_head] = (unsigned long) skb; + smp_wmb(); + call->acks_head = (call->acks_head + 1) & (call->acks_winsz - 1); + + if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) { + _debug("________awaiting reply/ACK__________"); + write_lock_bh(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_CLIENT_SEND_REQUEST: + call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY; + break; + case RXRPC_CALL_SERVER_ACK_REQUEST: + call->state = RXRPC_CALL_SERVER_SEND_REPLY; + if (!last) + break; + case RXRPC_CALL_SERVER_SEND_REPLY: + call->state = RXRPC_CALL_SERVER_AWAIT_ACK; + break; + default: + break; + } + write_unlock_bh(&call->state_lock); + } + + _proto("Tx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); + + sp->need_resend = false; + sp->resend_at = jiffies + rxrpc_resend_timeout; + if (!test_and_set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags)) { + _debug("run timer"); + call->resend_timer.expires = sp->resend_at; + add_timer(&call->resend_timer); + } + + /* attempt to cancel the rx-ACK timer, deferring reply transmission if + * we're ACK'ing the request phase of an incoming call */ + ret = -EAGAIN; + if (try_to_del_timer_sync(&call->ack_timer) >= 0) { + /* the packet may be freed by rxrpc_process_call() before this + * returns */ + ret = rxrpc_send_packet(call->conn->trans, skb); + _net("sent skb %p", skb); + } else { + _debug("failed to delete ACK timer"); + } + + if (ret < 0) { + _debug("need instant resend %d", ret); + sp->need_resend = true; + rxrpc_instant_resend(call); + } + + _leave(""); +} + +/* + * Convert a host-endian header into a network-endian header. + */ +static void rxrpc_insert_header(struct sk_buff *skb) +{ + struct rxrpc_wire_header whdr; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + whdr.epoch = htonl(sp->hdr.epoch); + whdr.cid = htonl(sp->hdr.cid); + whdr.callNumber = htonl(sp->hdr.callNumber); + whdr.seq = htonl(sp->hdr.seq); + whdr.serial = htonl(sp->hdr.serial); + whdr.type = sp->hdr.type; + whdr.flags = sp->hdr.flags; + whdr.userStatus = sp->hdr.userStatus; + whdr.securityIndex = sp->hdr.securityIndex; + whdr._rsvd = htons(sp->hdr._rsvd); + whdr.serviceId = htons(sp->hdr.serviceId); + + memcpy(skb->head, &whdr, sizeof(whdr)); +} + +/* + * send data through a socket + * - must be called in process context + * - caller holds the socket locked + */ +static int rxrpc_send_data(struct rxrpc_sock *rx, + struct rxrpc_call *call, + struct msghdr *msg, size_t len) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + struct sock *sk = &rx->sk; + long timeo; + bool more; + int ret, copied; + + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + + /* this should be in poll */ + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); + + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) + return -EPIPE; + + more = msg->msg_flags & MSG_MORE; + + skb = call->tx_pending; + call->tx_pending = NULL; + + copied = 0; + do { + if (!skb) { + size_t size, chunk, max, space; + + _debug("alloc"); + + if (CIRC_SPACE(call->acks_head, + ACCESS_ONCE(call->acks_tail), + call->acks_winsz) <= 0) { + ret = -EAGAIN; + if (msg->msg_flags & MSG_DONTWAIT) + goto maybe_error; + ret = rxrpc_wait_for_tx_window(rx, call, + &timeo); + if (ret < 0) + goto maybe_error; + } + + max = call->conn->trans->peer->maxdata; + max -= call->conn->security_size; + max &= ~(call->conn->size_align - 1UL); + + chunk = max; + if (chunk > msg_data_left(msg) && !more) + chunk = msg_data_left(msg); + + space = chunk + call->conn->size_align; + space &= ~(call->conn->size_align - 1UL); + + size = space + call->conn->header_size; + + _debug("SIZE: %zu/%zu/%zu", chunk, space, size); + + /* create a buffer that we can retain until it's ACK'd */ + skb = sock_alloc_send_skb( + sk, size, msg->msg_flags & MSG_DONTWAIT, &ret); + if (!skb) + goto maybe_error; + + rxrpc_new_skb(skb); + + _debug("ALLOC SEND %p", skb); + + ASSERTCMP(skb->mark, ==, 0); + + _debug("HS: %u", call->conn->header_size); + skb_reserve(skb, call->conn->header_size); + skb->len += call->conn->header_size; + + sp = rxrpc_skb(skb); + sp->remain = chunk; + if (sp->remain > skb_tailroom(skb)) + sp->remain = skb_tailroom(skb); + + _net("skb: hr %d, tr %d, hl %d, rm %d", + skb_headroom(skb), + skb_tailroom(skb), + skb_headlen(skb), + sp->remain); + + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + + _debug("append"); + sp = rxrpc_skb(skb); + + /* append next segment of data to the current buffer */ + if (msg_data_left(msg) > 0) { + int copy = skb_tailroom(skb); + ASSERTCMP(copy, >, 0); + if (copy > msg_data_left(msg)) + copy = msg_data_left(msg); + if (copy > sp->remain) + copy = sp->remain; + + _debug("add"); + ret = skb_add_data(skb, &msg->msg_iter, copy); + _debug("added"); + if (ret < 0) + goto efault; + sp->remain -= copy; + skb->mark += copy; + copied += copy; + } + + /* check for the far side aborting the call or a network error + * occurring */ + if (call->state > RXRPC_CALL_COMPLETE) + goto call_aborted; + + /* add the packet to the send queue if it's now full */ + if (sp->remain <= 0 || + (msg_data_left(msg) == 0 && !more)) { + struct rxrpc_connection *conn = call->conn; + uint32_t seq; + size_t pad; + + /* pad out if we're using security */ + if (conn->security_ix) { + pad = conn->security_size + skb->mark; + pad = conn->size_align - pad; + pad &= conn->size_align - 1; + _debug("pad %zu", pad); + if (pad) + memset(skb_put(skb, pad), 0, pad); + } + + seq = atomic_inc_return(&call->sequence); + + sp->hdr.epoch = conn->epoch; + sp->hdr.cid = call->cid; + sp->hdr.callNumber = call->call_id; + sp->hdr.seq = seq; + sp->hdr.serial = atomic_inc_return(&conn->serial); + sp->hdr.type = RXRPC_PACKET_TYPE_DATA; + sp->hdr.userStatus = 0; + sp->hdr.securityIndex = conn->security_ix; + sp->hdr._rsvd = 0; + sp->hdr.serviceId = call->service_id; + + sp->hdr.flags = conn->out_clientflag; + if (msg_data_left(msg) == 0 && !more) + sp->hdr.flags |= RXRPC_LAST_PACKET; + else if (CIRC_SPACE(call->acks_head, + ACCESS_ONCE(call->acks_tail), + call->acks_winsz) > 1) + sp->hdr.flags |= RXRPC_MORE_PACKETS; + if (more && seq & 1) + sp->hdr.flags |= RXRPC_REQUEST_ACK; + + ret = conn->security->secure_packet( + call, skb, skb->mark, + skb->head + sizeof(struct rxrpc_wire_header)); + if (ret < 0) + goto out; + + rxrpc_insert_header(skb); + rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more); + skb = NULL; + } + } while (msg_data_left(msg) > 0); + +success: + ret = copied; +out: + call->tx_pending = skb; + _leave(" = %d", ret); + return ret; + +call_aborted: + rxrpc_free_skb(skb); + if (call->state == RXRPC_CALL_NETWORK_ERROR) + ret = call->conn->trans->peer->net_error; + else + ret = -ECONNABORTED; + _leave(" = %d", ret); + return ret; + +maybe_error: + if (copied) + goto success; + goto out; + +efault: + ret = -EFAULT; + goto out; +} diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c new file mode 100644 index 000000000000..3e82d6f0313c --- /dev/null +++ b/net/rxrpc/peer_event.c @@ -0,0 +1,230 @@ +/* Error message handling (ICMP) + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +/* + * handle an error received on the local endpoint + */ +void rxrpc_UDP_error_report(struct sock *sk) +{ + struct sock_exterr_skb *serr; + struct rxrpc_transport *trans; + struct rxrpc_local *local = sk->sk_user_data; + struct rxrpc_peer *peer; + struct sk_buff *skb; + __be32 addr; + __be16 port; + + _enter("%p{%d}", sk, local->debug_id); + + skb = sock_dequeue_err_skb(sk); + if (!skb) { + _leave("UDP socket errqueue empty"); + return; + } + serr = SKB_EXT_ERR(skb); + if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { + _leave("UDP empty message"); + kfree_skb(skb); + return; + } + + rxrpc_new_skb(skb); + + addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset); + port = serr->port; + + _net("Rx UDP Error from %pI4:%hu", &addr, ntohs(port)); + _debug("Msg l:%d d:%d", skb->len, skb->data_len); + + peer = rxrpc_find_peer(local, addr, port); + if (IS_ERR(peer)) { + rxrpc_free_skb(skb); + _leave(" [no peer]"); + return; + } + + trans = rxrpc_find_transport(local, peer); + if (!trans) { + rxrpc_put_peer(peer); + rxrpc_free_skb(skb); + _leave(" [no trans]"); + return; + } + + if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP && + serr->ee.ee_type == ICMP_DEST_UNREACH && + serr->ee.ee_code == ICMP_FRAG_NEEDED + ) { + u32 mtu = serr->ee.ee_info; + + _net("Rx Received ICMP Fragmentation Needed (%d)", mtu); + + /* wind down the local interface MTU */ + if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu) { + peer->if_mtu = mtu; + _net("I/F MTU %u", mtu); + } + + if (mtu == 0) { + /* they didn't give us a size, estimate one */ + mtu = peer->if_mtu; + if (mtu > 1500) { + mtu >>= 1; + if (mtu < 1500) + mtu = 1500; + } else { + mtu -= 100; + if (mtu < peer->hdrsize) + mtu = peer->hdrsize + 4; + } + } + + if (mtu < peer->mtu) { + spin_lock_bh(&peer->lock); + peer->mtu = mtu; + peer->maxdata = peer->mtu - peer->hdrsize; + spin_unlock_bh(&peer->lock); + _net("Net MTU %u (maxdata %u)", + peer->mtu, peer->maxdata); + } + } + + rxrpc_put_peer(peer); + + /* pass the transport ref to error_handler to release */ + skb_queue_tail(&trans->error_queue, skb); + rxrpc_queue_work(&trans->error_handler); + _leave(""); +} + +/* + * deal with UDP error messages + */ +void rxrpc_UDP_error_handler(struct work_struct *work) +{ + struct sock_extended_err *ee; + struct sock_exterr_skb *serr; + struct rxrpc_transport *trans = + container_of(work, struct rxrpc_transport, error_handler); + struct sk_buff *skb; + int err; + + _enter(""); + + skb = skb_dequeue(&trans->error_queue); + if (!skb) + return; + + serr = SKB_EXT_ERR(skb); + ee = &serr->ee; + + _net("Rx Error o=%d t=%d c=%d e=%d", + ee->ee_origin, ee->ee_type, ee->ee_code, ee->ee_errno); + + err = ee->ee_errno; + + switch (ee->ee_origin) { + case SO_EE_ORIGIN_ICMP: + switch (ee->ee_type) { + case ICMP_DEST_UNREACH: + switch (ee->ee_code) { + case ICMP_NET_UNREACH: + _net("Rx Received ICMP Network Unreachable"); + break; + case ICMP_HOST_UNREACH: + _net("Rx Received ICMP Host Unreachable"); + break; + case ICMP_PORT_UNREACH: + _net("Rx Received ICMP Port Unreachable"); + break; + case ICMP_NET_UNKNOWN: + _net("Rx Received ICMP Unknown Network"); + break; + case ICMP_HOST_UNKNOWN: + _net("Rx Received ICMP Unknown Host"); + break; + default: + _net("Rx Received ICMP DestUnreach code=%u", + ee->ee_code); + break; + } + break; + + case ICMP_TIME_EXCEEDED: + _net("Rx Received ICMP TTL Exceeded"); + break; + + default: + _proto("Rx Received ICMP error { type=%u code=%u }", + ee->ee_type, ee->ee_code); + break; + } + break; + + case SO_EE_ORIGIN_LOCAL: + _proto("Rx Received local error { error=%d }", + ee->ee_errno); + break; + + case SO_EE_ORIGIN_NONE: + case SO_EE_ORIGIN_ICMP6: + default: + _proto("Rx Received error report { orig=%u }", + ee->ee_origin); + break; + } + + /* terminate all the affected calls if there's an unrecoverable + * error */ + if (err) { + struct rxrpc_call *call, *_n; + + _debug("ISSUE ERROR %d", err); + + spin_lock_bh(&trans->peer->lock); + trans->peer->net_error = err; + + list_for_each_entry_safe(call, _n, &trans->peer->error_targets, + error_link) { + write_lock(&call->state_lock); + if (call->state != RXRPC_CALL_COMPLETE && + call->state < RXRPC_CALL_NETWORK_ERROR) { + call->state = RXRPC_CALL_NETWORK_ERROR; + set_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events); + rxrpc_queue_call(call); + } + write_unlock(&call->state_lock); + list_del_init(&call->error_link); + } + + spin_unlock_bh(&trans->peer->lock); + } + + if (!skb_queue_empty(&trans->error_queue)) + rxrpc_queue_work(&trans->error_handler); + + rxrpc_free_skb(skb); + rxrpc_put_transport(trans); + _leave(""); +} diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c new file mode 100644 index 000000000000..0b54cda3d8e5 --- /dev/null +++ b/net/rxrpc/peer_object.c @@ -0,0 +1,305 @@ +/* RxRPC remote transport endpoint management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +static LIST_HEAD(rxrpc_peers); +static DEFINE_RWLOCK(rxrpc_peer_lock); +static DECLARE_WAIT_QUEUE_HEAD(rxrpc_peer_wq); + +static void rxrpc_destroy_peer(struct work_struct *work); + +/* + * assess the MTU size for the network interface through which this peer is + * reached + */ +static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer) +{ + struct rtable *rt; + struct flowi4 fl4; + + peer->if_mtu = 1500; + + rt = ip_route_output_ports(&init_net, &fl4, NULL, + peer->srx.transport.sin.sin_addr.s_addr, 0, + htons(7000), htons(7001), + IPPROTO_UDP, 0, 0); + if (IS_ERR(rt)) { + _leave(" [route err %ld]", PTR_ERR(rt)); + return; + } + + peer->if_mtu = dst_mtu(&rt->dst); + dst_release(&rt->dst); + + _leave(" [if_mtu %u]", peer->if_mtu); +} + +/* + * allocate a new peer + */ +static struct rxrpc_peer *rxrpc_alloc_peer(struct sockaddr_rxrpc *srx, + gfp_t gfp) +{ + struct rxrpc_peer *peer; + + _enter(""); + + peer = kzalloc(sizeof(struct rxrpc_peer), gfp); + if (peer) { + INIT_WORK(&peer->destroyer, &rxrpc_destroy_peer); + INIT_LIST_HEAD(&peer->link); + INIT_LIST_HEAD(&peer->error_targets); + spin_lock_init(&peer->lock); + atomic_set(&peer->usage, 1); + peer->debug_id = atomic_inc_return(&rxrpc_debug_id); + memcpy(&peer->srx, srx, sizeof(*srx)); + + rxrpc_assess_MTU_size(peer); + peer->mtu = peer->if_mtu; + + if (srx->transport.family == AF_INET) { + peer->hdrsize = sizeof(struct iphdr); + switch (srx->transport_type) { + case SOCK_DGRAM: + peer->hdrsize += sizeof(struct udphdr); + break; + default: + BUG(); + break; + } + } else { + BUG(); + } + + peer->hdrsize += sizeof(struct rxrpc_wire_header); + peer->maxdata = peer->mtu - peer->hdrsize; + } + + _leave(" = %p", peer); + return peer; +} + +/* + * obtain a remote transport endpoint for the specified address + */ +struct rxrpc_peer *rxrpc_get_peer(struct sockaddr_rxrpc *srx, gfp_t gfp) +{ + struct rxrpc_peer *peer, *candidate; + const char *new = "old"; + int usage; + + _enter("{%d,%d,%pI4+%hu}", + srx->transport_type, + srx->transport_len, + &srx->transport.sin.sin_addr, + ntohs(srx->transport.sin.sin_port)); + + /* search the peer list first */ + read_lock_bh(&rxrpc_peer_lock); + list_for_each_entry(peer, &rxrpc_peers, link) { + _debug("check PEER %d { u=%d t=%d l=%d }", + peer->debug_id, + atomic_read(&peer->usage), + peer->srx.transport_type, + peer->srx.transport_len); + + if (atomic_read(&peer->usage) > 0 && + peer->srx.transport_type == srx->transport_type && + peer->srx.transport_len == srx->transport_len && + memcmp(&peer->srx.transport, + &srx->transport, + srx->transport_len) == 0) + goto found_extant_peer; + } + read_unlock_bh(&rxrpc_peer_lock); + + /* not yet present - create a candidate for a new record and then + * redo the search */ + candidate = rxrpc_alloc_peer(srx, gfp); + if (!candidate) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + write_lock_bh(&rxrpc_peer_lock); + + list_for_each_entry(peer, &rxrpc_peers, link) { + if (atomic_read(&peer->usage) > 0 && + peer->srx.transport_type == srx->transport_type && + peer->srx.transport_len == srx->transport_len && + memcmp(&peer->srx.transport, + &srx->transport, + srx->transport_len) == 0) + goto found_extant_second; + } + + /* we can now add the new candidate to the list */ + peer = candidate; + candidate = NULL; + usage = atomic_read(&peer->usage); + + list_add_tail(&peer->link, &rxrpc_peers); + write_unlock_bh(&rxrpc_peer_lock); + new = "new"; + +success: + _net("PEER %s %d {%d,%u,%pI4+%hu}", + new, + peer->debug_id, + peer->srx.transport_type, + peer->srx.transport.family, + &peer->srx.transport.sin.sin_addr, + ntohs(peer->srx.transport.sin.sin_port)); + + _leave(" = %p {u=%d}", peer, usage); + return peer; + + /* we found the peer in the list immediately */ +found_extant_peer: + usage = atomic_inc_return(&peer->usage); + read_unlock_bh(&rxrpc_peer_lock); + goto success; + + /* we found the peer on the second time through the list */ +found_extant_second: + usage = atomic_inc_return(&peer->usage); + write_unlock_bh(&rxrpc_peer_lock); + kfree(candidate); + goto success; +} + +/* + * find the peer associated with a packet + */ +struct rxrpc_peer *rxrpc_find_peer(struct rxrpc_local *local, + __be32 addr, __be16 port) +{ + struct rxrpc_peer *peer; + + _enter(""); + + /* search the peer list */ + read_lock_bh(&rxrpc_peer_lock); + + if (local->srx.transport.family == AF_INET && + local->srx.transport_type == SOCK_DGRAM + ) { + list_for_each_entry(peer, &rxrpc_peers, link) { + if (atomic_read(&peer->usage) > 0 && + peer->srx.transport_type == SOCK_DGRAM && + peer->srx.transport.family == AF_INET && + peer->srx.transport.sin.sin_port == port && + peer->srx.transport.sin.sin_addr.s_addr == addr) + goto found_UDP_peer; + } + + goto new_UDP_peer; + } + + read_unlock_bh(&rxrpc_peer_lock); + _leave(" = -EAFNOSUPPORT"); + return ERR_PTR(-EAFNOSUPPORT); + +found_UDP_peer: + _net("Rx UDP DGRAM from peer %d", peer->debug_id); + atomic_inc(&peer->usage); + read_unlock_bh(&rxrpc_peer_lock); + _leave(" = %p", peer); + return peer; + +new_UDP_peer: + _net("Rx UDP DGRAM from NEW peer"); + read_unlock_bh(&rxrpc_peer_lock); + _leave(" = -EBUSY [new]"); + return ERR_PTR(-EBUSY); +} + +/* + * release a remote transport endpoint + */ +void rxrpc_put_peer(struct rxrpc_peer *peer) +{ + _enter("%p{u=%d}", peer, atomic_read(&peer->usage)); + + ASSERTCMP(atomic_read(&peer->usage), >, 0); + + if (likely(!atomic_dec_and_test(&peer->usage))) { + _leave(" [in use]"); + return; + } + + rxrpc_queue_work(&peer->destroyer); + _leave(""); +} + +/* + * destroy a remote transport endpoint + */ +static void rxrpc_destroy_peer(struct work_struct *work) +{ + struct rxrpc_peer *peer = + container_of(work, struct rxrpc_peer, destroyer); + + _enter("%p{%d}", peer, atomic_read(&peer->usage)); + + write_lock_bh(&rxrpc_peer_lock); + list_del(&peer->link); + write_unlock_bh(&rxrpc_peer_lock); + + _net("DESTROY PEER %d", peer->debug_id); + kfree(peer); + + if (list_empty(&rxrpc_peers)) + wake_up_all(&rxrpc_peer_wq); + _leave(""); +} + +/* + * preemptively destroy all the peer records from a transport endpoint rather + * than waiting for them to time out + */ +void __exit rxrpc_destroy_all_peers(void) +{ + DECLARE_WAITQUEUE(myself,current); + + _enter(""); + + /* we simply have to wait for them to go away */ + if (!list_empty(&rxrpc_peers)) { + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&rxrpc_peer_wq, &myself); + + while (!list_empty(&rxrpc_peers)) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + + remove_wait_queue(&rxrpc_peer_wq, &myself); + set_current_state(TASK_RUNNING); + } + + _leave(""); +} diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c new file mode 100644 index 000000000000..225163bc658d --- /dev/null +++ b/net/rxrpc/proc.c @@ -0,0 +1,192 @@ +/* /proc/net/ support for AF_RXRPC + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include "ar-internal.h" + +static const char *const rxrpc_conn_states[] = { + [RXRPC_CONN_UNUSED] = "Unused ", + [RXRPC_CONN_CLIENT] = "Client ", + [RXRPC_CONN_SERVER_UNSECURED] = "SvUnsec ", + [RXRPC_CONN_SERVER_CHALLENGING] = "SvChall ", + [RXRPC_CONN_SERVER] = "SvSecure", + [RXRPC_CONN_REMOTELY_ABORTED] = "RmtAbort", + [RXRPC_CONN_LOCALLY_ABORTED] = "LocAbort", + [RXRPC_CONN_NETWORK_ERROR] = "NetError", +}; + +/* + * generate a list of extant and dead calls in /proc/net/rxrpc_calls + */ +static void *rxrpc_call_seq_start(struct seq_file *seq, loff_t *_pos) +{ + read_lock(&rxrpc_call_lock); + return seq_list_start_head(&rxrpc_calls, *_pos); +} + +static void *rxrpc_call_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return seq_list_next(v, &rxrpc_calls, pos); +} + +static void rxrpc_call_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&rxrpc_call_lock); +} + +static int rxrpc_call_seq_show(struct seq_file *seq, void *v) +{ + struct rxrpc_transport *trans; + struct rxrpc_call *call; + char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1]; + + if (v == &rxrpc_calls) { + seq_puts(seq, + "Proto Local Remote " + " SvID ConnID CallID End Use State Abort " + " UserID\n"); + return 0; + } + + call = list_entry(v, struct rxrpc_call, link); + trans = call->conn->trans; + + sprintf(lbuff, "%pI4:%u", + &trans->local->srx.transport.sin.sin_addr, + ntohs(trans->local->srx.transport.sin.sin_port)); + + sprintf(rbuff, "%pI4:%u", + &trans->peer->srx.transport.sin.sin_addr, + ntohs(trans->peer->srx.transport.sin.sin_port)); + + seq_printf(seq, + "UDP %-22.22s %-22.22s %4x %08x %08x %s %3u" + " %-8.8s %08x %lx\n", + lbuff, + rbuff, + call->conn->service_id, + call->cid, + call->call_id, + call->conn->in_clientflag ? "Svc" : "Clt", + atomic_read(&call->usage), + rxrpc_call_states[call->state], + call->remote_abort ?: call->local_abort, + call->user_call_ID); + + return 0; +} + +static const struct seq_operations rxrpc_call_seq_ops = { + .start = rxrpc_call_seq_start, + .next = rxrpc_call_seq_next, + .stop = rxrpc_call_seq_stop, + .show = rxrpc_call_seq_show, +}; + +static int rxrpc_call_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &rxrpc_call_seq_ops); +} + +const struct file_operations rxrpc_call_seq_fops = { + .owner = THIS_MODULE, + .open = rxrpc_call_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * generate a list of extant virtual connections in /proc/net/rxrpc_conns + */ +static void *rxrpc_connection_seq_start(struct seq_file *seq, loff_t *_pos) +{ + read_lock(&rxrpc_connection_lock); + return seq_list_start_head(&rxrpc_connections, *_pos); +} + +static void *rxrpc_connection_seq_next(struct seq_file *seq, void *v, + loff_t *pos) +{ + return seq_list_next(v, &rxrpc_connections, pos); +} + +static void rxrpc_connection_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&rxrpc_connection_lock); +} + +static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) +{ + struct rxrpc_connection *conn; + struct rxrpc_transport *trans; + char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1]; + + if (v == &rxrpc_connections) { + seq_puts(seq, + "Proto Local Remote " + " SvID ConnID Calls End Use State Key " + " Serial ISerial\n" + ); + return 0; + } + + conn = list_entry(v, struct rxrpc_connection, link); + trans = conn->trans; + + sprintf(lbuff, "%pI4:%u", + &trans->local->srx.transport.sin.sin_addr, + ntohs(trans->local->srx.transport.sin.sin_port)); + + sprintf(rbuff, "%pI4:%u", + &trans->peer->srx.transport.sin.sin_addr, + ntohs(trans->peer->srx.transport.sin.sin_port)); + + seq_printf(seq, + "UDP %-22.22s %-22.22s %4x %08x %08x %s %3u" + " %s %08x %08x %08x\n", + lbuff, + rbuff, + conn->service_id, + conn->cid, + conn->call_counter, + conn->in_clientflag ? "Svc" : "Clt", + atomic_read(&conn->usage), + rxrpc_conn_states[conn->state], + key_serial(conn->key), + atomic_read(&conn->serial), + atomic_read(&conn->hi_serial)); + + return 0; +} + +static const struct seq_operations rxrpc_connection_seq_ops = { + .start = rxrpc_connection_seq_start, + .next = rxrpc_connection_seq_next, + .stop = rxrpc_connection_seq_stop, + .show = rxrpc_connection_seq_show, +}; + + +static int rxrpc_connection_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &rxrpc_connection_seq_ops); +} + +const struct file_operations rxrpc_connection_seq_fops = { + .owner = THIS_MODULE, + .open = rxrpc_connection_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c new file mode 100644 index 000000000000..59706b9f2f7a --- /dev/null +++ b/net/rxrpc/recvmsg.c @@ -0,0 +1,436 @@ +/* RxRPC recvmsg() implementation + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include "ar-internal.h" + +/* + * removal a call's user ID from the socket tree to make the user ID available + * again and so that it won't be seen again in association with that call + */ +void rxrpc_remove_user_ID(struct rxrpc_sock *rx, struct rxrpc_call *call) +{ + _debug("RELEASE CALL %d", call->debug_id); + + if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { + write_lock_bh(&rx->call_lock); + rb_erase(&call->sock_node, &call->socket->calls); + clear_bit(RXRPC_CALL_HAS_USERID, &call->flags); + write_unlock_bh(&rx->call_lock); + } + + read_lock_bh(&call->state_lock); + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && + !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) + rxrpc_queue_call(call); + read_unlock_bh(&call->state_lock); +} + +/* + * receive a message from an RxRPC socket + * - we need to be careful about two or more threads calling recvmsg + * simultaneously + */ +int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) +{ + struct rxrpc_skb_priv *sp; + struct rxrpc_call *call = NULL, *continue_call = NULL; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct sk_buff *skb; + long timeo; + int copy, ret, ullen, offset, copied = 0; + u32 abort_code; + + DEFINE_WAIT(wait); + + _enter(",,,%zu,%d", len, flags); + + if (flags & (MSG_OOB | MSG_TRUNC)) + return -EOPNOTSUPP; + + ullen = msg->msg_flags & MSG_CMSG_COMPAT ? 4 : sizeof(unsigned long); + + timeo = sock_rcvtimeo(&rx->sk, flags & MSG_DONTWAIT); + msg->msg_flags |= MSG_MORE; + + lock_sock(&rx->sk); + + for (;;) { + /* return immediately if a client socket has no outstanding + * calls */ + if (RB_EMPTY_ROOT(&rx->calls)) { + if (copied) + goto out; + if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) { + release_sock(&rx->sk); + if (continue_call) + rxrpc_put_call(continue_call); + return -ENODATA; + } + } + + /* get the next message on the Rx queue */ + skb = skb_peek(&rx->sk.sk_receive_queue); + if (!skb) { + /* nothing remains on the queue */ + if (copied && + (flags & MSG_PEEK || timeo == 0)) + goto out; + + /* wait for a message to turn up */ + release_sock(&rx->sk); + prepare_to_wait_exclusive(sk_sleep(&rx->sk), &wait, + TASK_INTERRUPTIBLE); + ret = sock_error(&rx->sk); + if (ret) + goto wait_error; + + if (skb_queue_empty(&rx->sk.sk_receive_queue)) { + if (signal_pending(current)) + goto wait_interrupted; + timeo = schedule_timeout(timeo); + } + finish_wait(sk_sleep(&rx->sk), &wait); + lock_sock(&rx->sk); + continue; + } + + peek_next_packet: + sp = rxrpc_skb(skb); + call = sp->call; + ASSERT(call != NULL); + + _debug("next pkt %s", rxrpc_pkts[sp->hdr.type]); + + /* make sure we wait for the state to be updated in this call */ + spin_lock_bh(&call->lock); + spin_unlock_bh(&call->lock); + + if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) { + _debug("packet from released call"); + if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) + BUG(); + rxrpc_free_skb(skb); + continue; + } + + /* determine whether to continue last data receive */ + if (continue_call) { + _debug("maybe cont"); + if (call != continue_call || + skb->mark != RXRPC_SKB_MARK_DATA) { + release_sock(&rx->sk); + rxrpc_put_call(continue_call); + _leave(" = %d [noncont]", copied); + return copied; + } + } + + rxrpc_get_call(call); + + /* copy the peer address and timestamp */ + if (!continue_call) { + if (msg->msg_name) { + size_t len = + sizeof(call->conn->trans->peer->srx); + memcpy(msg->msg_name, + &call->conn->trans->peer->srx, len); + msg->msg_namelen = len; + } + sock_recv_timestamp(msg, &rx->sk, skb); + } + + /* receive the message */ + if (skb->mark != RXRPC_SKB_MARK_DATA) + goto receive_non_data_message; + + _debug("recvmsg DATA #%u { %d, %d }", + sp->hdr.seq, skb->len, sp->offset); + + if (!continue_call) { + /* only set the control data once per recvmsg() */ + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + ullen, &call->user_call_ID); + if (ret < 0) + goto copy_error; + ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); + } + + ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv); + ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1); + call->rx_data_recv = sp->hdr.seq; + + ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten); + + offset = sp->offset; + copy = skb->len - offset; + if (copy > len - copied) + copy = len - copied; + + ret = skb_copy_datagram_msg(skb, offset, msg, copy); + + if (ret < 0) + goto copy_error; + + /* handle piecemeal consumption of data packets */ + _debug("copied %d+%d", copy, copied); + + offset += copy; + copied += copy; + + if (!(flags & MSG_PEEK)) + sp->offset = offset; + + if (sp->offset < skb->len) { + _debug("buffer full"); + ASSERTCMP(copied, ==, len); + break; + } + + /* we transferred the whole data packet */ + if (sp->hdr.flags & RXRPC_LAST_PACKET) { + _debug("last"); + if (call->conn->out_clientflag) { + /* last byte of reply received */ + ret = copied; + goto terminal_message; + } + + /* last bit of request received */ + if (!(flags & MSG_PEEK)) { + _debug("eat packet"); + if (skb_dequeue(&rx->sk.sk_receive_queue) != + skb) + BUG(); + rxrpc_free_skb(skb); + } + msg->msg_flags &= ~MSG_MORE; + break; + } + + /* move on to the next data message */ + _debug("next"); + if (!continue_call) + continue_call = sp->call; + else + rxrpc_put_call(call); + call = NULL; + + if (flags & MSG_PEEK) { + _debug("peek next"); + skb = skb->next; + if (skb == (struct sk_buff *) &rx->sk.sk_receive_queue) + break; + goto peek_next_packet; + } + + _debug("eat packet"); + if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) + BUG(); + rxrpc_free_skb(skb); + } + + /* end of non-terminal data packet reception for the moment */ + _debug("end rcv data"); +out: + release_sock(&rx->sk); + if (call) + rxrpc_put_call(call); + if (continue_call) + rxrpc_put_call(continue_call); + _leave(" = %d [data]", copied); + return copied; + + /* handle non-DATA messages such as aborts, incoming connections and + * final ACKs */ +receive_non_data_message: + _debug("non-data"); + + if (skb->mark == RXRPC_SKB_MARK_NEW_CALL) { + _debug("RECV NEW CALL"); + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NEW_CALL, 0, &abort_code); + if (ret < 0) + goto copy_error; + if (!(flags & MSG_PEEK)) { + if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) + BUG(); + rxrpc_free_skb(skb); + } + goto out; + } + + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + ullen, &call->user_call_ID); + if (ret < 0) + goto copy_error; + ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); + + switch (skb->mark) { + case RXRPC_SKB_MARK_DATA: + BUG(); + case RXRPC_SKB_MARK_FINAL_ACK: + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ACK, 0, &abort_code); + break; + case RXRPC_SKB_MARK_BUSY: + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_BUSY, 0, &abort_code); + break; + case RXRPC_SKB_MARK_REMOTE_ABORT: + abort_code = call->remote_abort; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code); + break; + case RXRPC_SKB_MARK_LOCAL_ABORT: + abort_code = call->local_abort; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code); + break; + case RXRPC_SKB_MARK_NET_ERROR: + _debug("RECV NET ERROR %d", sp->error); + abort_code = sp->error; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &abort_code); + break; + case RXRPC_SKB_MARK_LOCAL_ERROR: + _debug("RECV LOCAL ERROR %d", sp->error); + abort_code = sp->error; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4, + &abort_code); + break; + default: + pr_err("Unknown packet mark %u\n", skb->mark); + BUG(); + break; + } + + if (ret < 0) + goto copy_error; + +terminal_message: + _debug("terminal"); + msg->msg_flags &= ~MSG_MORE; + msg->msg_flags |= MSG_EOR; + + if (!(flags & MSG_PEEK)) { + _net("free terminal skb %p", skb); + if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) + BUG(); + rxrpc_free_skb(skb); + rxrpc_remove_user_ID(rx, call); + } + + release_sock(&rx->sk); + rxrpc_put_call(call); + if (continue_call) + rxrpc_put_call(continue_call); + _leave(" = %d", ret); + return ret; + +copy_error: + _debug("copy error"); + release_sock(&rx->sk); + rxrpc_put_call(call); + if (continue_call) + rxrpc_put_call(continue_call); + _leave(" = %d", ret); + return ret; + +wait_interrupted: + ret = sock_intr_errno(timeo); +wait_error: + finish_wait(sk_sleep(&rx->sk), &wait); + if (continue_call) + rxrpc_put_call(continue_call); + if (copied) + copied = ret; + _leave(" = %d [waitfail %d]", copied, ret); + return copied; + +} + +/** + * rxrpc_kernel_data_delivered - Record delivery of data message + * @skb: Message holding data + * + * Record the delivery of a data message. This permits RxRPC to keep its + * tracking correct. The socket buffer will be deleted. + */ +void rxrpc_kernel_data_delivered(struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_call *call = sp->call; + + ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv); + ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1); + call->rx_data_recv = sp->hdr.seq; + + ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten); + rxrpc_free_skb(skb); +} + +EXPORT_SYMBOL(rxrpc_kernel_data_delivered); + +/** + * rxrpc_kernel_is_data_last - Determine if data message is last one + * @skb: Message holding data + * + * Determine if data message is last one for the parent call. + */ +bool rxrpc_kernel_is_data_last(struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + ASSERTCMP(skb->mark, ==, RXRPC_SKB_MARK_DATA); + + return sp->hdr.flags & RXRPC_LAST_PACKET; +} + +EXPORT_SYMBOL(rxrpc_kernel_is_data_last); + +/** + * rxrpc_kernel_get_abort_code - Get the abort code from an RxRPC abort message + * @skb: Message indicating an abort + * + * Get the abort code from an RxRPC abort message. + */ +u32 rxrpc_kernel_get_abort_code(struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + switch (skb->mark) { + case RXRPC_SKB_MARK_REMOTE_ABORT: + return sp->call->remote_abort; + case RXRPC_SKB_MARK_LOCAL_ABORT: + return sp->call->local_abort; + default: + BUG(); + } +} + +EXPORT_SYMBOL(rxrpc_kernel_get_abort_code); + +/** + * rxrpc_kernel_get_error - Get the error number from an RxRPC error message + * @skb: Message indicating an error + * + * Get the error number from an RxRPC error message. + */ +int rxrpc_kernel_get_error_number(struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + return sp->error; +} + +EXPORT_SYMBOL(rxrpc_kernel_get_error_number); diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c new file mode 100644 index 000000000000..d223253b22fa --- /dev/null +++ b/net/rxrpc/security.c @@ -0,0 +1,168 @@ +/* RxRPC security handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +static LIST_HEAD(rxrpc_security_methods); +static DECLARE_RWSEM(rxrpc_security_sem); + +static const struct rxrpc_security *rxrpc_security_types[] = { + [RXRPC_SECURITY_NONE] = &rxrpc_no_security, +#ifdef CONFIG_RXKAD + [RXRPC_SECURITY_RXKAD] = &rxkad, +#endif +}; + +int __init rxrpc_init_security(void) +{ + int i, ret; + + for (i = 0; i < ARRAY_SIZE(rxrpc_security_types); i++) { + if (rxrpc_security_types[i]) { + ret = rxrpc_security_types[i]->init(); + if (ret < 0) + goto failed; + } + } + + return 0; + +failed: + for (i--; i >= 0; i--) + if (rxrpc_security_types[i]) + rxrpc_security_types[i]->exit(); + return ret; +} + +void rxrpc_exit_security(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(rxrpc_security_types); i++) + if (rxrpc_security_types[i]) + rxrpc_security_types[i]->exit(); +} + +/* + * look up an rxrpc security module + */ +static const struct rxrpc_security *rxrpc_security_lookup(u8 security_index) +{ + if (security_index >= ARRAY_SIZE(rxrpc_security_types)) + return NULL; + return rxrpc_security_types[security_index]; +} + +/* + * initialise the security on a client connection + */ +int rxrpc_init_client_conn_security(struct rxrpc_connection *conn) +{ + const struct rxrpc_security *sec; + struct rxrpc_key_token *token; + struct key *key = conn->key; + int ret; + + _enter("{%d},{%x}", conn->debug_id, key_serial(key)); + + if (!key) + return 0; + + ret = key_validate(key); + if (ret < 0) + return ret; + + token = key->payload.data[0]; + if (!token) + return -EKEYREJECTED; + + sec = rxrpc_security_lookup(token->security_index); + if (!sec) + return -EKEYREJECTED; + conn->security = sec; + + ret = conn->security->init_connection_security(conn); + if (ret < 0) { + conn->security = &rxrpc_no_security; + return ret; + } + + _leave(" = 0"); + return 0; +} + +/* + * initialise the security on a server connection + */ +int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) +{ + const struct rxrpc_security *sec; + struct rxrpc_local *local = conn->trans->local; + struct rxrpc_sock *rx; + struct key *key; + key_ref_t kref; + char kdesc[5 + 1 + 3 + 1]; + + _enter(""); + + sprintf(kdesc, "%u:%u", conn->service_id, conn->security_ix); + + sec = rxrpc_security_lookup(conn->security_ix); + if (!sec) { + _leave(" = -ENOKEY [lookup]"); + return -ENOKEY; + } + + /* find the service */ + read_lock_bh(&local->services_lock); + list_for_each_entry(rx, &local->services, listen_link) { + if (rx->srx.srx_service == conn->service_id) + goto found_service; + } + + /* the service appears to have died */ + read_unlock_bh(&local->services_lock); + _leave(" = -ENOENT"); + return -ENOENT; + +found_service: + if (!rx->securities) { + read_unlock_bh(&local->services_lock); + _leave(" = -ENOKEY"); + return -ENOKEY; + } + + /* look through the service's keyring */ + kref = keyring_search(make_key_ref(rx->securities, 1UL), + &key_type_rxrpc_s, kdesc); + if (IS_ERR(kref)) { + read_unlock_bh(&local->services_lock); + _leave(" = %ld [search]", PTR_ERR(kref)); + return PTR_ERR(kref); + } + + key = key_ref_to_ptr(kref); + read_unlock_bh(&local->services_lock); + + conn->server_key = key; + conn->security = sec; + + _leave(" = 0"); + return 0; +} diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c new file mode 100644 index 000000000000..eee0cfd9ac8c --- /dev/null +++ b/net/rxrpc/skbuff.c @@ -0,0 +1,138 @@ +/* ar-skbuff.c: socket buffer destruction handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include "ar-internal.h" + +/* + * set up for the ACK at the end of the receive phase when we discard the final + * receive phase data packet + * - called with softirqs disabled + */ +static void rxrpc_request_final_ACK(struct rxrpc_call *call) +{ + /* the call may be aborted before we have a chance to ACK it */ + write_lock(&call->state_lock); + + switch (call->state) { + case RXRPC_CALL_CLIENT_RECV_REPLY: + call->state = RXRPC_CALL_CLIENT_FINAL_ACK; + _debug("request final ACK"); + + /* get an extra ref on the call for the final-ACK generator to + * release */ + rxrpc_get_call(call); + set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); + if (try_to_del_timer_sync(&call->ack_timer) >= 0) + rxrpc_queue_call(call); + break; + + case RXRPC_CALL_SERVER_RECV_REQUEST: + call->state = RXRPC_CALL_SERVER_ACK_REQUEST; + default: + break; + } + + write_unlock(&call->state_lock); +} + +/* + * drop the bottom ACK off of the call ACK window and advance the window + */ +static void rxrpc_hard_ACK_data(struct rxrpc_call *call, + struct rxrpc_skb_priv *sp) +{ + int loop; + u32 seq; + + spin_lock_bh(&call->lock); + + _debug("hard ACK #%u", sp->hdr.seq); + + for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) { + call->ackr_window[loop] >>= 1; + call->ackr_window[loop] |= + call->ackr_window[loop + 1] << (BITS_PER_LONG - 1); + } + + seq = sp->hdr.seq; + ASSERTCMP(seq, ==, call->rx_data_eaten + 1); + call->rx_data_eaten = seq; + + if (call->ackr_win_top < UINT_MAX) + call->ackr_win_top++; + + ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE, + call->rx_data_post, >=, call->rx_data_recv); + ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE, + call->rx_data_recv, >=, call->rx_data_eaten); + + if (sp->hdr.flags & RXRPC_LAST_PACKET) { + rxrpc_request_final_ACK(call); + } else if (atomic_dec_and_test(&call->ackr_not_idle) && + test_and_clear_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags)) { + /* We previously soft-ACK'd some received packets that have now + * been consumed, so send a hard-ACK if no more packets are + * immediately forthcoming to allow the transmitter to free up + * its Tx bufferage. + */ + _debug("send Rx idle ACK"); + __rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, sp->hdr.serial, + false); + } + + spin_unlock_bh(&call->lock); +} + +/* + * destroy a packet that has an RxRPC control buffer + * - advance the hard-ACK state of the parent call (done here in case something + * in the kernel bypasses recvmsg() and steals the packet directly off of the + * socket receive queue) + */ +void rxrpc_packet_destructor(struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_call *call = sp->call; + + _enter("%p{%p}", skb, call); + + if (call) { + /* send the final ACK on a client call */ + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) + rxrpc_hard_ACK_data(call, sp); + rxrpc_put_call(call); + sp->call = NULL; + } + + if (skb->sk) + sock_rfree(skb); + _leave(""); +} + +/** + * rxrpc_kernel_free_skb - Free an RxRPC socket buffer + * @skb: The socket buffer to be freed + * + * Let RxRPC free its own socket buffer, permitting it to maintain debug + * accounting. + */ +void rxrpc_kernel_free_skb(struct sk_buff *skb) +{ + rxrpc_free_skb(skb); +} +EXPORT_SYMBOL(rxrpc_kernel_free_skb); diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c new file mode 100644 index 000000000000..a1b65183b07d --- /dev/null +++ b/net/rxrpc/transport.c @@ -0,0 +1,286 @@ +/* RxRPC point-to-point transport session management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +/* + * Time after last use at which transport record is cleaned up. + */ +unsigned int rxrpc_transport_expiry = 3600 * 24; + +static void rxrpc_transport_reaper(struct work_struct *work); + +static LIST_HEAD(rxrpc_transports); +static DEFINE_RWLOCK(rxrpc_transport_lock); +static DECLARE_DELAYED_WORK(rxrpc_transport_reap, rxrpc_transport_reaper); + +/* + * allocate a new transport session manager + */ +static struct rxrpc_transport *rxrpc_alloc_transport(struct rxrpc_local *local, + struct rxrpc_peer *peer, + gfp_t gfp) +{ + struct rxrpc_transport *trans; + + _enter(""); + + trans = kzalloc(sizeof(struct rxrpc_transport), gfp); + if (trans) { + trans->local = local; + trans->peer = peer; + INIT_LIST_HEAD(&trans->link); + trans->bundles = RB_ROOT; + trans->client_conns = RB_ROOT; + trans->server_conns = RB_ROOT; + skb_queue_head_init(&trans->error_queue); + spin_lock_init(&trans->client_lock); + rwlock_init(&trans->conn_lock); + atomic_set(&trans->usage, 1); + trans->conn_idcounter = peer->srx.srx_service << 16; + trans->debug_id = atomic_inc_return(&rxrpc_debug_id); + + if (peer->srx.transport.family == AF_INET) { + switch (peer->srx.transport_type) { + case SOCK_DGRAM: + INIT_WORK(&trans->error_handler, + rxrpc_UDP_error_handler); + break; + default: + BUG(); + break; + } + } else { + BUG(); + } + } + + _leave(" = %p", trans); + return trans; +} + +/* + * obtain a transport session for the nominated endpoints + */ +struct rxrpc_transport *rxrpc_get_transport(struct rxrpc_local *local, + struct rxrpc_peer *peer, + gfp_t gfp) +{ + struct rxrpc_transport *trans, *candidate; + const char *new = "old"; + int usage; + + _enter("{%pI4+%hu},{%pI4+%hu},", + &local->srx.transport.sin.sin_addr, + ntohs(local->srx.transport.sin.sin_port), + &peer->srx.transport.sin.sin_addr, + ntohs(peer->srx.transport.sin.sin_port)); + + /* search the transport list first */ + read_lock_bh(&rxrpc_transport_lock); + list_for_each_entry(trans, &rxrpc_transports, link) { + if (trans->local == local && trans->peer == peer) + goto found_extant_transport; + } + read_unlock_bh(&rxrpc_transport_lock); + + /* not yet present - create a candidate for a new record and then + * redo the search */ + candidate = rxrpc_alloc_transport(local, peer, gfp); + if (!candidate) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + write_lock_bh(&rxrpc_transport_lock); + + list_for_each_entry(trans, &rxrpc_transports, link) { + if (trans->local == local && trans->peer == peer) + goto found_extant_second; + } + + /* we can now add the new candidate to the list */ + trans = candidate; + candidate = NULL; + usage = atomic_read(&trans->usage); + + rxrpc_get_local(trans->local); + atomic_inc(&trans->peer->usage); + list_add_tail(&trans->link, &rxrpc_transports); + write_unlock_bh(&rxrpc_transport_lock); + new = "new"; + +success: + _net("TRANSPORT %s %d local %d -> peer %d", + new, + trans->debug_id, + trans->local->debug_id, + trans->peer->debug_id); + + _leave(" = %p {u=%d}", trans, usage); + return trans; + + /* we found the transport in the list immediately */ +found_extant_transport: + usage = atomic_inc_return(&trans->usage); + read_unlock_bh(&rxrpc_transport_lock); + goto success; + + /* we found the transport on the second time through the list */ +found_extant_second: + usage = atomic_inc_return(&trans->usage); + write_unlock_bh(&rxrpc_transport_lock); + kfree(candidate); + goto success; +} + +/* + * find the transport connecting two endpoints + */ +struct rxrpc_transport *rxrpc_find_transport(struct rxrpc_local *local, + struct rxrpc_peer *peer) +{ + struct rxrpc_transport *trans; + + _enter("{%pI4+%hu},{%pI4+%hu},", + &local->srx.transport.sin.sin_addr, + ntohs(local->srx.transport.sin.sin_port), + &peer->srx.transport.sin.sin_addr, + ntohs(peer->srx.transport.sin.sin_port)); + + /* search the transport list */ + read_lock_bh(&rxrpc_transport_lock); + + list_for_each_entry(trans, &rxrpc_transports, link) { + if (trans->local == local && trans->peer == peer) + goto found_extant_transport; + } + + read_unlock_bh(&rxrpc_transport_lock); + _leave(" = NULL"); + return NULL; + +found_extant_transport: + atomic_inc(&trans->usage); + read_unlock_bh(&rxrpc_transport_lock); + _leave(" = %p", trans); + return trans; +} + +/* + * release a transport session + */ +void rxrpc_put_transport(struct rxrpc_transport *trans) +{ + _enter("%p{u=%d}", trans, atomic_read(&trans->usage)); + + ASSERTCMP(atomic_read(&trans->usage), >, 0); + + trans->put_time = ktime_get_seconds(); + if (unlikely(atomic_dec_and_test(&trans->usage))) { + _debug("zombie"); + /* let the reaper determine the timeout to avoid a race with + * overextending the timeout if the reaper is running at the + * same time */ + rxrpc_queue_delayed_work(&rxrpc_transport_reap, 0); + } + _leave(""); +} + +/* + * clean up a transport session + */ +static void rxrpc_cleanup_transport(struct rxrpc_transport *trans) +{ + _net("DESTROY TRANS %d", trans->debug_id); + + rxrpc_purge_queue(&trans->error_queue); + + rxrpc_put_local(trans->local); + rxrpc_put_peer(trans->peer); + kfree(trans); +} + +/* + * reap dead transports that have passed their expiry date + */ +static void rxrpc_transport_reaper(struct work_struct *work) +{ + struct rxrpc_transport *trans, *_p; + unsigned long now, earliest, reap_time; + + LIST_HEAD(graveyard); + + _enter(""); + + now = ktime_get_seconds(); + earliest = ULONG_MAX; + + /* extract all the transports that have been dead too long */ + write_lock_bh(&rxrpc_transport_lock); + list_for_each_entry_safe(trans, _p, &rxrpc_transports, link) { + _debug("reap TRANS %d { u=%d t=%ld }", + trans->debug_id, atomic_read(&trans->usage), + (long) now - (long) trans->put_time); + + if (likely(atomic_read(&trans->usage) > 0)) + continue; + + reap_time = trans->put_time + rxrpc_transport_expiry; + if (reap_time <= now) + list_move_tail(&trans->link, &graveyard); + else if (reap_time < earliest) + earliest = reap_time; + } + write_unlock_bh(&rxrpc_transport_lock); + + if (earliest != ULONG_MAX) { + _debug("reschedule reaper %ld", (long) earliest - now); + ASSERTCMP(earliest, >, now); + rxrpc_queue_delayed_work(&rxrpc_transport_reap, + (earliest - now) * HZ); + } + + /* then destroy all those pulled out */ + while (!list_empty(&graveyard)) { + trans = list_entry(graveyard.next, struct rxrpc_transport, + link); + list_del_init(&trans->link); + + ASSERTCMP(atomic_read(&trans->usage), ==, 0); + rxrpc_cleanup_transport(trans); + } + + _leave(""); +} + +/* + * preemptively destroy all the transport session records rather than waiting + * for them to time out + */ +void __exit rxrpc_destroy_all_transports(void) +{ + _enter(""); + + rxrpc_transport_expiry = 0; + cancel_delayed_work(&rxrpc_transport_reap); + rxrpc_queue_delayed_work(&rxrpc_transport_reap, 0); + + _leave(""); +} -- cgit v1.2.3 From 0d81a51ab94a536a9fb742a5546b6d011ed26c7f Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 13 Jun 2016 13:30:30 +0100 Subject: rxrpc: Update the comments in ar-internal.h to reflect renames Update the section comments in ar-internal.h that indicate the locations of the referenced items to reflect the renames done to the .c files in net/rxrpc/. This also involves some rearrangement to reflect keep the sections in order of filename. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 138 ++++++++++++++++++++++++------------------------ 1 file changed, 69 insertions(+), 69 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index f715cca767cd..03919b9a8a31 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -482,21 +482,21 @@ extern struct rxrpc_transport *rxrpc_name_to_transport(struct rxrpc_sock *, int, int, gfp_t); /* - * ar-accept.c + * call_accept.c */ void rxrpc_accept_incoming_calls(struct work_struct *); struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, unsigned long); int rxrpc_reject_call(struct rxrpc_sock *); /* - * ar-ack.c + * call_event.c */ void __rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool); void rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool); void rxrpc_process_call(struct work_struct *); /* - * ar-call.c + * call_object.c */ extern unsigned int rxrpc_max_call_lifetime; extern unsigned int rxrpc_dead_call_expiry; @@ -520,7 +520,14 @@ void __rxrpc_put_call(struct rxrpc_call *); void __exit rxrpc_destroy_all_calls(void); /* - * ar-connection.c + * conn_event.c + */ +void rxrpc_process_connection(struct work_struct *); +void rxrpc_reject_packet(struct rxrpc_local *, struct sk_buff *); +void rxrpc_reject_packets(struct work_struct *); + +/* + * conn_object.c */ extern unsigned int rxrpc_connection_expiry; extern struct list_head rxrpc_connections; @@ -540,27 +547,30 @@ extern struct rxrpc_connection * rxrpc_incoming_connection(struct rxrpc_transport *, struct rxrpc_host_header *); /* - * ar-connevent.c + * input.c */ -void rxrpc_process_connection(struct work_struct *); -void rxrpc_reject_packet(struct rxrpc_local *, struct sk_buff *); -void rxrpc_reject_packets(struct work_struct *); +void rxrpc_data_ready(struct sock *); +int rxrpc_queue_rcv_skb(struct rxrpc_call *, struct sk_buff *, bool, bool); +void rxrpc_fast_process_packet(struct rxrpc_call *, struct sk_buff *); /* - * ar-error.c + * insecure.c */ -void rxrpc_UDP_error_report(struct sock *); -void rxrpc_UDP_error_handler(struct work_struct *); +extern const struct rxrpc_security rxrpc_no_security; /* - * ar-input.c + * key.c */ -void rxrpc_data_ready(struct sock *); -int rxrpc_queue_rcv_skb(struct rxrpc_call *, struct sk_buff *, bool, bool); -void rxrpc_fast_process_packet(struct rxrpc_call *, struct sk_buff *); +extern struct key_type key_type_rxrpc; +extern struct key_type key_type_rxrpc_s; + +int rxrpc_request_key(struct rxrpc_sock *, char __user *, int); +int rxrpc_server_keyring(struct rxrpc_sock *, char __user *, int); +int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time_t, + u32); /* - * ar-local.c + * local_object.c */ extern rwlock_t rxrpc_local_lock; @@ -569,18 +579,23 @@ void rxrpc_put_local(struct rxrpc_local *); void __exit rxrpc_destroy_all_locals(void); /* - * ar-key.c + * misc.c */ -extern struct key_type key_type_rxrpc; -extern struct key_type key_type_rxrpc_s; +extern unsigned int rxrpc_max_backlog __read_mostly; +extern unsigned int rxrpc_requested_ack_delay; +extern unsigned int rxrpc_soft_ack_delay; +extern unsigned int rxrpc_idle_ack_delay; +extern unsigned int rxrpc_rx_window_size; +extern unsigned int rxrpc_rx_mtu; +extern unsigned int rxrpc_rx_jumbo_max; -int rxrpc_request_key(struct rxrpc_sock *, char __user *, int); -int rxrpc_server_keyring(struct rxrpc_sock *, char __user *, int); -int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time_t, - u32); +extern const char *const rxrpc_pkts[]; +extern const s8 rxrpc_ack_priority[]; + +extern const char *rxrpc_acks(u8 reason); /* - * ar-output.c + * output.c */ extern unsigned int rxrpc_resend_timeout; @@ -588,7 +603,13 @@ int rxrpc_send_packet(struct rxrpc_transport *, struct sk_buff *); int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t); /* - * ar-peer.c + * peer_error.c + */ +void rxrpc_UDP_error_report(struct sock *); +void rxrpc_UDP_error_handler(struct work_struct *); + +/* + * peer_object.c */ struct rxrpc_peer *rxrpc_get_peer(struct sockaddr_rxrpc *, gfp_t); void rxrpc_put_peer(struct rxrpc_peer *); @@ -596,20 +617,27 @@ struct rxrpc_peer *rxrpc_find_peer(struct rxrpc_local *, __be32, __be16); void __exit rxrpc_destroy_all_peers(void); /* - * ar-proc.c + * proc.c */ extern const char *const rxrpc_call_states[]; extern const struct file_operations rxrpc_call_seq_fops; extern const struct file_operations rxrpc_connection_seq_fops; /* - * ar-recvmsg.c + * recvmsg.c */ void rxrpc_remove_user_ID(struct rxrpc_sock *, struct rxrpc_call *); int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int); /* - * ar-security.c + * rxkad.c + */ +#ifdef CONFIG_RXKAD +extern const struct rxrpc_security rxkad; +#endif + +/* + * security.c */ int __init rxrpc_init_security(void); void rxrpc_exit_security(void); @@ -617,50 +645,10 @@ int rxrpc_init_client_conn_security(struct rxrpc_connection *); int rxrpc_init_server_conn_security(struct rxrpc_connection *); /* - * ar-skbuff.c + * skbuff.c */ void rxrpc_packet_destructor(struct sk_buff *); -/* - * ar-transport.c - */ -extern unsigned int rxrpc_transport_expiry; - -struct rxrpc_transport *rxrpc_get_transport(struct rxrpc_local *, - struct rxrpc_peer *, gfp_t); -void rxrpc_put_transport(struct rxrpc_transport *); -void __exit rxrpc_destroy_all_transports(void); -struct rxrpc_transport *rxrpc_find_transport(struct rxrpc_local *, - struct rxrpc_peer *); - -/* - * insecure.c - */ -extern const struct rxrpc_security rxrpc_no_security; - -/* - * misc.c - */ -extern unsigned int rxrpc_max_backlog __read_mostly; -extern unsigned int rxrpc_requested_ack_delay; -extern unsigned int rxrpc_soft_ack_delay; -extern unsigned int rxrpc_idle_ack_delay; -extern unsigned int rxrpc_rx_window_size; -extern unsigned int rxrpc_rx_mtu; -extern unsigned int rxrpc_rx_jumbo_max; - -extern const char *const rxrpc_pkts[]; -extern const s8 rxrpc_ack_priority[]; - -extern const char *rxrpc_acks(u8 reason); - -/* - * rxkad.c - */ -#ifdef CONFIG_RXKAD -extern const struct rxrpc_security rxkad; -#endif - /* * sysctl.c */ @@ -672,6 +660,18 @@ static inline int __init rxrpc_sysctl_init(void) { return 0; } static inline void rxrpc_sysctl_exit(void) {} #endif +/* + * transport.c + */ +extern unsigned int rxrpc_transport_expiry; + +struct rxrpc_transport *rxrpc_get_transport(struct rxrpc_local *, + struct rxrpc_peer *, gfp_t); +void rxrpc_put_transport(struct rxrpc_transport *); +void __exit rxrpc_destroy_all_transports(void); +struct rxrpc_transport *rxrpc_find_transport(struct rxrpc_local *, + struct rxrpc_peer *); + /* * debug tracing */ -- cgit v1.2.3 From e8eb36cd8ca93f52f738c6087073202c44ac7746 Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Mon, 13 Jun 2016 12:06:39 +0300 Subject: net/sched: flower: Return error when hw can't offload and skip_sw is set When skip_sw is set and hardware fails to apply filter, return error to user. This will make error propagation logic similar to the one currently used in u32 classifier. Also, changed code to use tc_skip_sw() utility function. Signed-off-by: Amir Vadai Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 1ea6f76e64b6..5060801a2f6d 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -140,7 +140,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, f = rhashtable_lookup_fast(&head->ht, fl_key_get_start(&skb_mkey, &head->mask), head->ht_params); - if (f && !(f->flags & TCA_CLS_FLAGS_SKIP_SW)) { + if (f && !tc_skip_sw(f->flags)) { *res = f->res; return tcf_exts_exec(skb, &f->exts, res); } @@ -187,19 +187,20 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie) dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); } -static void fl_hw_replace_filter(struct tcf_proto *tp, - struct flow_dissector *dissector, - struct fl_flow_key *mask, - struct fl_flow_key *key, - struct tcf_exts *actions, - unsigned long cookie, u32 flags) +static int fl_hw_replace_filter(struct tcf_proto *tp, + struct flow_dissector *dissector, + struct fl_flow_key *mask, + struct fl_flow_key *key, + struct tcf_exts *actions, + unsigned long cookie, u32 flags) { struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_flower_offload offload = {0}; struct tc_to_netdev tc; + int err; if (!tc_should_offload(dev, tp, flags)) - return; + return tc_skip_sw(flags) ? -EINVAL : 0; offload.command = TC_CLSFLOWER_REPLACE; offload.cookie = cookie; @@ -211,7 +212,12 @@ static void fl_hw_replace_filter(struct tcf_proto *tp, tc.type = TC_SETUP_CLSFLOWER; tc.cls_flower = &offload; - dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); + err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); + + if (tc_skip_sw(flags)) + return err; + + return 0; } static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) @@ -572,20 +578,22 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (err) goto errout; - if (!(fnew->flags & TCA_CLS_FLAGS_SKIP_SW)) { + if (!tc_skip_sw(fnew->flags)) { err = rhashtable_insert_fast(&head->ht, &fnew->ht_node, head->ht_params); if (err) goto errout; } - fl_hw_replace_filter(tp, - &head->dissector, - &mask.key, - &fnew->key, - &fnew->exts, - (unsigned long)fnew, - fnew->flags); + err = fl_hw_replace_filter(tp, + &head->dissector, + &mask.key, + &fnew->key, + &fnew->exts, + (unsigned long)fnew, + fnew->flags); + if (err) + goto errout; if (fold) { rhashtable_remove_fast(&head->ht, &fold->ht_node, -- cgit v1.2.3 From a5e27d18fe64561a467b706f70cfc89ba6323f87 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 13 Jun 2016 23:08:26 +0800 Subject: sctp: fix error return code in sctp_init() Fix to return a negative error code from the error handling case instead of 0, as done elsewhere in this function. Signed-off-by: Wei Yongjun Acked-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/protocol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 40022ee885d7..3b56ae55aba3 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1479,7 +1479,8 @@ static __init int sctp_init(void) INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain); } - if (sctp_transport_hashtable_init()) + status = sctp_transport_hashtable_init(); + if (status) goto err_thash_alloc; pr_info("Hash tables configured (bind %d/%d)\n", sctp_port_hashsize, -- cgit v1.2.3 From dcf1158b275f9d51d6a742cf7166edc764ee4718 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 13 Jun 2016 11:20:35 -0400 Subject: tcp: return sizeof tcp_dctcp_info in dctcp_get_info() Make sure that dctcp_get_info() returns only the size of the info->dctcp struct that it zeroes out and fills in. Previously it had been returning the size of the enclosing tcp_cc_info union, sizeof(*info). There is no problem yet, but that union that may one day be larger than struct tcp_dctcp_info, in which case the TCP_CC_INFO code might accidentally copy uninitialized bytes from the stack. Signed-off-by: Neal Cardwell Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Eric Dumazet Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/ipv4/tcp_dctcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 7e538f71f5fb..10d728b6804c 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -293,7 +293,7 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr, */ if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) || ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - memset(info, 0, sizeof(struct tcp_dctcp_info)); + memset(&info->dctcp, 0, sizeof(info->dctcp)); if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) { info->dctcp.dctcp_enabled = 1; info->dctcp.dctcp_ce_state = (u16) ca->ce_state; @@ -303,7 +303,7 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr, } *attr = INET_DIAG_DCTCPINFO; - return sizeof(*info); + return sizeof(info->dctcp); } return 0; } -- cgit v1.2.3 From 0cb43965d42a21a7af41f88f1021b478dc102425 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 13 Jun 2016 09:44:26 -0700 Subject: RDS: split out connection specific state from rds_connection to rds_conn_path In preparation for multipath RDS, split the rds_connection structure into a base structure, and a per-path struct rds_conn_path. The base structure tracks information and locks common to all paths. The workqs for send/recv/shutdown etc are tracked per rds_conn_path. Thus the workq callbacks now work with rds_conn_path. This commit allows for one rds_conn_path per rds_connection, and will be extended into multiple conn_paths in subsequent commits. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/cong.c | 3 +- net/rds/connection.c | 18 +++++-- net/rds/ib.c | 1 + net/rds/ib_cm.c | 1 + net/rds/ib_rdma.c | 1 + net/rds/ib_recv.c | 1 + net/rds/ib_send.c | 1 + net/rds/loop.c | 1 + net/rds/rdma_transport.c | 1 + net/rds/rds.h | 122 ++++++++++++++++++++++++++++++---------------- net/rds/rds_single_path.h | 30 ++++++++++++ net/rds/recv.c | 1 + net/rds/send.c | 1 + net/rds/tcp.c | 1 + net/rds/tcp_connect.c | 4 +- net/rds/tcp_listen.c | 11 +++-- net/rds/tcp_recv.c | 1 + net/rds/tcp_send.c | 1 + net/rds/threads.c | 92 +++++++++++++++++++--------------- 19 files changed, 199 insertions(+), 93 deletions(-) create mode 100644 net/rds/rds_single_path.h (limited to 'net') diff --git a/net/rds/cong.c b/net/rds/cong.c index 6641bcf7c185..8398fee7c866 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c @@ -235,7 +235,8 @@ void rds_cong_queue_updates(struct rds_cong_map *map) * therefore trigger warnings. * Defer the xmit to rds_send_worker() instead. */ - queue_delayed_work(rds_wq, &conn->c_send_w, 0); + queue_delayed_work(rds_wq, + &conn->c_path[0].cp_send_w, 0); } } diff --git a/net/rds/connection.c b/net/rds/connection.c index e3b118cae81d..6fa2074044b9 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -36,6 +36,7 @@ #include #include +#include "rds_single_path.h" #include "rds.h" #include "loop.h" @@ -155,6 +156,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, conn->c_faddr = faddr; spin_lock_init(&conn->c_lock); conn->c_next_tx_seq = 1; + conn->c_path[0].cp_conn = conn; rds_conn_net_set(conn, net); init_waitqueue_head(&conn->c_waitq); @@ -197,7 +199,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, atomic_set(&conn->c_state, RDS_CONN_DOWN); conn->c_send_gen = 0; - conn->c_outgoing = (is_outgoing ? 1 : 0); + conn->c_path[0].cp_outgoing = (is_outgoing ? 1 : 0); conn->c_reconnect_jiffies = 0; INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker); INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker); @@ -320,8 +322,8 @@ void rds_conn_shutdown(struct rds_connection *conn) if (!hlist_unhashed(&conn->c_hash_node)) { rcu_read_unlock(); if (conn->c_trans->t_type != RDS_TRANS_TCP || - conn->c_outgoing == 1) - rds_queue_reconnect(conn); + conn->c_path[0].cp_outgoing == 1) + rds_queue_reconnect(&conn->c_path[0]); } else { rcu_read_unlock(); } @@ -553,10 +555,16 @@ void rds_conn_exit(void) /* * Force a disconnect */ +void rds_conn_path_drop(struct rds_conn_path *cp) +{ + atomic_set(&cp->cp_state, RDS_CONN_ERROR); + queue_work(rds_wq, &cp->cp_down_w); +} +EXPORT_SYMBOL_GPL(rds_conn_path_drop); + void rds_conn_drop(struct rds_connection *conn) { - atomic_set(&conn->c_state, RDS_CONN_ERROR); - queue_work(rds_wq, &conn->c_down_w); + rds_conn_path_drop(&conn->c_path[0]); } EXPORT_SYMBOL_GPL(rds_conn_drop); diff --git a/net/rds/ib.c b/net/rds/ib.c index b5342fddaf98..44946a681a8c 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -40,6 +40,7 @@ #include #include +#include "rds_single_path.h" #include "rds.h" #include "ib.h" #include "ib_mr.h" diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 310cabce2311..4de5a35f5c40 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -36,6 +36,7 @@ #include #include +#include "rds_single_path.h" #include "rds.h" #include "ib.h" diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index a0f21b65a83c..977f69886c00 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -35,6 +35,7 @@ #include #include +#include "rds_single_path.h" #include "ib_mr.h" struct workqueue_struct *rds_ib_mr_wq; diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index abc8cc805e8d..4ea8cb17cc7a 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -36,6 +36,7 @@ #include #include +#include "rds_single_path.h" #include "rds.h" #include "ib.h" diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index f27d2c82b036..6e4110aa5135 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -36,6 +36,7 @@ #include #include +#include "rds_single_path.h" #include "rds.h" #include "ib.h" diff --git a/net/rds/loop.c b/net/rds/loop.c index 6b12b68541ae..268f07faaa1a 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -34,6 +34,7 @@ #include #include +#include "rds_single_path.h" #include "rds.h" #include "loop.h" diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 7220bebcf558..345f09059e9f 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -33,6 +33,7 @@ #include #include +#include "rds_single_path.h" #include "rdma_transport.h" #include "ib.h" diff --git a/net/rds/rds.h b/net/rds/rds.h index 387df5f32e49..ca31a07f70f5 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -84,56 +84,69 @@ enum { #define RDS_IN_XMIT 2 #define RDS_RECV_REFILL 3 +/* Max number of multipaths per RDS connection. Must be a power of 2 */ +#define RDS_MPATH_WORKERS 1 + +/* Per mpath connection state */ +struct rds_conn_path { + struct rds_connection *cp_conn; + struct rds_message *cp_xmit_rm; + unsigned long cp_xmit_sg; + unsigned int cp_xmit_hdr_off; + unsigned int cp_xmit_data_off; + unsigned int cp_xmit_atomic_sent; + unsigned int cp_xmit_rdma_sent; + unsigned int cp_xmit_data_sent; + + spinlock_t cp_lock; /* protect msg queues */ + u64 cp_next_tx_seq; + struct list_head cp_send_queue; + struct list_head cp_retrans; + + u64 cp_next_rx_seq; + + void *cp_transport_data; + + atomic_t cp_state; + unsigned long cp_send_gen; + unsigned long cp_flags; + unsigned long cp_reconnect_jiffies; + struct delayed_work cp_send_w; + struct delayed_work cp_recv_w; + struct delayed_work cp_conn_w; + struct work_struct cp_down_w; + struct mutex cp_cm_lock; /* protect cp_state & cm */ + wait_queue_head_t cp_waitq; + + unsigned int cp_unacked_packets; + unsigned int cp_unacked_bytes; + unsigned int cp_outgoing:1, + cp_pad_to_32:31; + unsigned int cp_index; +}; + +/* One rds_connection per RDS address pair */ struct rds_connection { struct hlist_node c_hash_node; __be32 c_laddr; __be32 c_faddr; unsigned int c_loopback:1, - c_outgoing:1, - c_pad_to_32:30; + c_pad_to_32:31; + int c_npaths; struct rds_connection *c_passive; + struct rds_transport *c_trans; struct rds_cong_map *c_lcong; struct rds_cong_map *c_fcong; - struct rds_message *c_xmit_rm; - unsigned long c_xmit_sg; - unsigned int c_xmit_hdr_off; - unsigned int c_xmit_data_off; - unsigned int c_xmit_atomic_sent; - unsigned int c_xmit_rdma_sent; - unsigned int c_xmit_data_sent; - - spinlock_t c_lock; /* protect msg queues */ - u64 c_next_tx_seq; - struct list_head c_send_queue; - struct list_head c_retrans; - - u64 c_next_rx_seq; - - struct rds_transport *c_trans; - void *c_transport_data; - - atomic_t c_state; - unsigned long c_send_gen; - unsigned long c_flags; - unsigned long c_reconnect_jiffies; - struct delayed_work c_send_w; - struct delayed_work c_recv_w; - struct delayed_work c_conn_w; - struct work_struct c_down_w; - struct mutex c_cm_lock; /* protect conn state & cm */ - wait_queue_head_t c_waitq; + /* Protocol version */ + unsigned int c_version; + possible_net_t c_net; struct list_head c_map_item; unsigned long c_map_queued; - unsigned int c_unacked_packets; - unsigned int c_unacked_bytes; - - /* Protocol version */ - unsigned int c_version; - possible_net_t c_net; + struct rds_conn_path c_path[RDS_MPATH_WORKERS]; }; static inline @@ -639,6 +652,7 @@ struct rds_connection *rds_conn_create_outgoing(struct net *net, void rds_conn_shutdown(struct rds_connection *conn); void rds_conn_destroy(struct rds_connection *conn); void rds_conn_drop(struct rds_connection *conn); +void rds_conn_path_drop(struct rds_conn_path *cpath); void rds_conn_connect_if_down(struct rds_connection *conn); void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, @@ -650,28 +664,52 @@ void __rds_conn_error(struct rds_connection *conn, const char *, ...); #define rds_conn_error(conn, fmt...) \ __rds_conn_error(conn, KERN_WARNING "RDS: " fmt) +static inline int +rds_conn_path_transition(struct rds_conn_path *cp, int old, int new) +{ + return atomic_cmpxchg(&cp->cp_state, old, new) == old; +} + static inline int rds_conn_transition(struct rds_connection *conn, int old, int new) { - return atomic_cmpxchg(&conn->c_state, old, new) == old; + return rds_conn_path_transition(&conn->c_path[0], old, new); +} + +static inline int +rds_conn_path_state(struct rds_conn_path *cp) +{ + return atomic_read(&cp->cp_state); } static inline int rds_conn_state(struct rds_connection *conn) { - return atomic_read(&conn->c_state); + return rds_conn_path_state(&conn->c_path[0]); +} + +static inline int +rds_conn_path_up(struct rds_conn_path *cp) +{ + return atomic_read(&cp->cp_state) == RDS_CONN_UP; } static inline int rds_conn_up(struct rds_connection *conn) { - return atomic_read(&conn->c_state) == RDS_CONN_UP; + return rds_conn_path_up(&conn->c_path[0]); +} + +static inline int +rds_conn_path_connecting(struct rds_conn_path *cp) +{ + return atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING; } static inline int rds_conn_connecting(struct rds_connection *conn) { - return atomic_read(&conn->c_state) == RDS_CONN_CONNECTING; + return rds_conn_path_connecting(&conn->c_path[0]); } /* message.c */ @@ -809,12 +847,12 @@ extern unsigned int rds_sysctl_trace_level; int rds_threads_init(void); void rds_threads_exit(void); extern struct workqueue_struct *rds_wq; -void rds_queue_reconnect(struct rds_connection *conn); +void rds_queue_reconnect(struct rds_conn_path *cp); void rds_connect_worker(struct work_struct *); void rds_shutdown_worker(struct work_struct *); void rds_send_worker(struct work_struct *); void rds_recv_worker(struct work_struct *); -void rds_connect_path_complete(struct rds_connection *conn, int curr); +void rds_connect_path_complete(struct rds_conn_path *conn, int curr); void rds_connect_complete(struct rds_connection *conn); /* transport.c */ diff --git a/net/rds/rds_single_path.h b/net/rds/rds_single_path.h new file mode 100644 index 000000000000..e1241af7c1ad --- /dev/null +++ b/net/rds/rds_single_path.h @@ -0,0 +1,30 @@ +#ifndef _RDS_RDS_SINGLE_H +#define _RDS_RDS_SINGLE_H + +#define c_xmit_rm c_path[0].cp_xmit_rm +#define c_xmit_sg c_path[0].cp_xmit_sg +#define c_xmit_hdr_off c_path[0].cp_xmit_hdr_off +#define c_xmit_data_off c_path[0].cp_xmit_data_off +#define c_xmit_atomic_sent c_path[0].cp_xmit_atomic_sent +#define c_xmit_rdma_sent c_path[0].cp_xmit_rdma_sent +#define c_xmit_data_sent c_path[0].cp_xmit_data_sent +#define c_lock c_path[0].cp_lock +#define c_next_tx_seq c_path[0].cp_next_tx_seq +#define c_send_queue c_path[0].cp_send_queue +#define c_retrans c_path[0].cp_retrans +#define c_next_rx_seq c_path[0].cp_next_rx_seq +#define c_transport_data c_path[0].cp_transport_data +#define c_state c_path[0].cp_state +#define c_send_gen c_path[0].cp_send_gen +#define c_flags c_path[0].cp_flags +#define c_reconnect_jiffies c_path[0].cp_reconnect_jiffies +#define c_send_w c_path[0].cp_send_w +#define c_recv_w c_path[0].cp_recv_w +#define c_conn_w c_path[0].cp_conn_w +#define c_down_w c_path[0].cp_down_w +#define c_cm_lock c_path[0].cp_cm_lock +#define c_waitq c_path[0].cp_waitq +#define c_unacked_packets c_path[0].cp_unacked_packets +#define c_unacked_bytes c_path[0].cp_unacked_bytes + +#endif /* _RDS_RDS_SINGLE_H */ diff --git a/net/rds/recv.c b/net/rds/recv.c index 8413f6c99e13..78b5c430324d 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -38,6 +38,7 @@ #include #include +#include "rds_single_path.h" #include "rds.h" void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, diff --git a/net/rds/send.c b/net/rds/send.c index b1962f8e30f7..a3b3b35ad57a 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -40,6 +40,7 @@ #include #include +#include "rds_single_path.h" #include "rds.h" /* When transmitting messages in rds_send_xmit, we need to emerge from diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 74ee126a6fe6..4bc1c153e93a 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -38,6 +38,7 @@ #include #include +#include "rds_single_path.h" #include "rds.h" #include "tcp.h" diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index fba13d0305fb..ba9ec67f4e41 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -34,6 +34,7 @@ #include #include +#include "rds_single_path.h" #include "rds.h" #include "tcp.h" @@ -60,7 +61,8 @@ void rds_tcp_state_change(struct sock *sk) case TCP_SYN_RECV: break; case TCP_ESTABLISHED: - rds_connect_path_complete(conn, RDS_CONN_CONNECTING); + rds_connect_path_complete(&conn->c_path[0], + RDS_CONN_CONNECTING); break; case TCP_CLOSE_WAIT: case TCP_CLOSE: diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 686b1d03a558..22d9bb15f731 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -35,6 +35,7 @@ #include #include +#include "rds_single_path.h" #include "rds.h" #include "tcp.h" @@ -132,17 +133,19 @@ int rds_tcp_accept_one(struct socket *sock) * c_transport_data. */ if (ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr) || - !conn->c_outgoing) { + !conn->c_path[0].cp_outgoing) { goto rst_nsk; } else { rds_tcp_reset_callbacks(new_sock, conn); - conn->c_outgoing = 0; + conn->c_path[0].cp_outgoing = 0; /* rds_connect_path_complete() marks RDS_CONN_UP */ - rds_connect_path_complete(conn, RDS_CONN_DISCONNECTING); + rds_connect_path_complete(&conn->c_path[0], + RDS_CONN_DISCONNECTING); } } else { rds_tcp_set_callbacks(new_sock, conn); - rds_connect_path_complete(conn, RDS_CONN_CONNECTING); + rds_connect_path_complete(&conn->c_path[0], + RDS_CONN_CONNECTING); } new_sock = NULL; ret = 0; diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index c3196f9d070a..3f8fb38996c7 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -34,6 +34,7 @@ #include #include +#include "rds_single_path.h" #include "rds.h" #include "tcp.h" diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 22d0f2020a79..2b3414f3c45c 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -34,6 +34,7 @@ #include #include +#include "rds_single_path.h" #include "rds.h" #include "tcp.h" diff --git a/net/rds/threads.c b/net/rds/threads.c index 4a323045719b..6d0979b8dc63 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -71,30 +71,30 @@ struct workqueue_struct *rds_wq; EXPORT_SYMBOL_GPL(rds_wq); -void rds_connect_path_complete(struct rds_connection *conn, int curr) +void rds_connect_path_complete(struct rds_conn_path *cp, int curr) { - if (!rds_conn_transition(conn, curr, RDS_CONN_UP)) { + if (!rds_conn_path_transition(cp, curr, RDS_CONN_UP)) { printk(KERN_WARNING "%s: Cannot transition to state UP, " "current state is %d\n", __func__, - atomic_read(&conn->c_state)); - rds_conn_drop(conn); + atomic_read(&cp->cp_state)); + rds_conn_path_drop(cp); return; } rdsdebug("conn %p for %pI4 to %pI4 complete\n", - conn, &conn->c_laddr, &conn->c_faddr); + cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr); - conn->c_reconnect_jiffies = 0; - set_bit(0, &conn->c_map_queued); - queue_delayed_work(rds_wq, &conn->c_send_w, 0); - queue_delayed_work(rds_wq, &conn->c_recv_w, 0); + cp->cp_reconnect_jiffies = 0; + set_bit(0, &cp->cp_conn->c_map_queued); + queue_delayed_work(rds_wq, &cp->cp_send_w, 0); + queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); } EXPORT_SYMBOL_GPL(rds_connect_path_complete); void rds_connect_complete(struct rds_connection *conn) { - rds_connect_path_complete(conn, RDS_CONN_CONNECTING); + rds_connect_path_complete(&conn->c_path[0], RDS_CONN_CONNECTING); } EXPORT_SYMBOL_GPL(rds_connect_complete); @@ -116,46 +116,52 @@ EXPORT_SYMBOL_GPL(rds_connect_complete); * We should *always* start with a random backoff; otherwise a broken connection * will always take several iterations to be re-established. */ -void rds_queue_reconnect(struct rds_connection *conn) +void rds_queue_reconnect(struct rds_conn_path *cp) { unsigned long rand; + struct rds_connection *conn = cp->cp_conn; rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n", conn, &conn->c_laddr, &conn->c_faddr, - conn->c_reconnect_jiffies); + cp->cp_reconnect_jiffies); - set_bit(RDS_RECONNECT_PENDING, &conn->c_flags); - if (conn->c_reconnect_jiffies == 0) { - conn->c_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies; - queue_delayed_work(rds_wq, &conn->c_conn_w, 0); + set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); + if (cp->cp_reconnect_jiffies == 0) { + cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies; + queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); return; } get_random_bytes(&rand, sizeof(rand)); rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n", - rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies, + rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies, conn, &conn->c_laddr, &conn->c_faddr); - queue_delayed_work(rds_wq, &conn->c_conn_w, - rand % conn->c_reconnect_jiffies); + queue_delayed_work(rds_wq, &cp->cp_conn_w, + rand % cp->cp_reconnect_jiffies); - conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2, + cp->cp_reconnect_jiffies = min(cp->cp_reconnect_jiffies * 2, rds_sysctl_reconnect_max_jiffies); } void rds_connect_worker(struct work_struct *work) { - struct rds_connection *conn = container_of(work, struct rds_connection, c_conn_w.work); + struct rds_conn_path *cp = container_of(work, + struct rds_conn_path, + cp_conn_w.work); + struct rds_connection *conn = cp->cp_conn; int ret; - clear_bit(RDS_RECONNECT_PENDING, &conn->c_flags); - if (rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { + clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); + if (rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { ret = conn->c_trans->conn_connect(conn); rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n", conn, &conn->c_laddr, &conn->c_faddr, ret); if (ret) { - if (rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_DOWN)) - rds_queue_reconnect(conn); + if (rds_conn_path_transition(cp, + RDS_CONN_CONNECTING, + RDS_CONN_DOWN)) + rds_queue_reconnect(cp); else rds_conn_error(conn, "RDS: connect failed\n"); } @@ -164,22 +170,24 @@ void rds_connect_worker(struct work_struct *work) void rds_send_worker(struct work_struct *work) { - struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work); + struct rds_conn_path *cp = container_of(work, + struct rds_conn_path, + cp_send_w.work); int ret; - if (rds_conn_state(conn) == RDS_CONN_UP) { - clear_bit(RDS_LL_SEND_FULL, &conn->c_flags); - ret = rds_send_xmit(conn); + if (rds_conn_path_state(cp) == RDS_CONN_UP) { + clear_bit(RDS_LL_SEND_FULL, &cp->cp_flags); + ret = rds_send_xmit(cp->cp_conn); cond_resched(); - rdsdebug("conn %p ret %d\n", conn, ret); + rdsdebug("conn %p ret %d\n", cp->cp_conn, ret); switch (ret) { case -EAGAIN: rds_stats_inc(s_send_immediate_retry); - queue_delayed_work(rds_wq, &conn->c_send_w, 0); + queue_delayed_work(rds_wq, &cp->cp_send_w, 0); break; case -ENOMEM: rds_stats_inc(s_send_delayed_retry); - queue_delayed_work(rds_wq, &conn->c_send_w, 2); + queue_delayed_work(rds_wq, &cp->cp_send_w, 2); default: break; } @@ -188,20 +196,22 @@ void rds_send_worker(struct work_struct *work) void rds_recv_worker(struct work_struct *work) { - struct rds_connection *conn = container_of(work, struct rds_connection, c_recv_w.work); + struct rds_conn_path *cp = container_of(work, + struct rds_conn_path, + cp_recv_w.work); int ret; - if (rds_conn_state(conn) == RDS_CONN_UP) { - ret = conn->c_trans->recv(conn); - rdsdebug("conn %p ret %d\n", conn, ret); + if (rds_conn_path_state(cp) == RDS_CONN_UP) { + ret = cp->cp_conn->c_trans->recv(cp->cp_conn); + rdsdebug("conn %p ret %d\n", cp->cp_conn, ret); switch (ret) { case -EAGAIN: rds_stats_inc(s_recv_immediate_retry); - queue_delayed_work(rds_wq, &conn->c_recv_w, 0); + queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); break; case -ENOMEM: rds_stats_inc(s_recv_delayed_retry); - queue_delayed_work(rds_wq, &conn->c_recv_w, 2); + queue_delayed_work(rds_wq, &cp->cp_recv_w, 2); default: break; } @@ -210,9 +220,11 @@ void rds_recv_worker(struct work_struct *work) void rds_shutdown_worker(struct work_struct *work) { - struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w); + struct rds_conn_path *cp = container_of(work, + struct rds_conn_path, + cp_down_w); - rds_conn_shutdown(conn); + rds_conn_shutdown(cp->cp_conn); } void rds_threads_exit(void) -- cgit v1.2.3 From 7e8f4413d7861efcb332ebce8d9b000a17eaa0e5 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 13 Jun 2016 09:44:27 -0700 Subject: RDS: add t_mp_capable bit to be set by MP capable transports The t_mp_capable bit will be used in the core rds module to support multipathing logic when the transport supports it. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/rds.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/rds.h b/net/rds/rds.h index ca31a07f70f5..28f001cbc893 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -446,7 +446,8 @@ struct rds_transport { char t_name[TRANSNAMSIZ]; struct list_head t_item; struct module *t_owner; - unsigned int t_prefer_loopback:1; + unsigned int t_prefer_loopback:1, + t_mp_capable:1; unsigned int t_type; int (*laddr_check)(struct net *net, __be32 addr); @@ -673,6 +674,7 @@ rds_conn_path_transition(struct rds_conn_path *cp, int old, int new) static inline int rds_conn_transition(struct rds_connection *conn, int old, int new) { + WARN_ON(conn->c_trans->t_mp_capable); return rds_conn_path_transition(&conn->c_path[0], old, new); } @@ -685,6 +687,7 @@ rds_conn_path_state(struct rds_conn_path *cp) static inline int rds_conn_state(struct rds_connection *conn) { + WARN_ON(conn->c_trans->t_mp_capable); return rds_conn_path_state(&conn->c_path[0]); } @@ -697,6 +700,7 @@ rds_conn_path_up(struct rds_conn_path *cp) static inline int rds_conn_up(struct rds_connection *conn) { + WARN_ON(conn->c_trans->t_mp_capable); return rds_conn_path_up(&conn->c_path[0]); } @@ -709,6 +713,7 @@ rds_conn_path_connecting(struct rds_conn_path *cp) static inline int rds_conn_connecting(struct rds_connection *conn) { + WARN_ON(conn->c_trans->t_mp_capable); return rds_conn_path_connecting(&conn->c_path[0]); } -- cgit v1.2.3 From ef9e62c2e5087cb9bc713e3d9776336e1bb40df1 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 13 Jun 2016 09:44:28 -0700 Subject: RDS: recv path gets the conn_path from rds_incoming for MP capable transports Transports that are t_mp_capable should set the rds_conn_path on which the datagram was recived in the ->i_conn_path field of struct rds_incoming. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/rds.h | 1 + net/rds/recv.c | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rds/rds.h b/net/rds/rds.h index 28f001cbc893..7c85b2d792b6 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -231,6 +231,7 @@ struct rds_incoming { atomic_t i_refcount; struct list_head i_item; struct rds_connection *i_conn; + struct rds_conn_path *i_conn_path; struct rds_header i_hdr; unsigned long i_rx_jiffies; __be32 i_saddr; diff --git a/net/rds/recv.c b/net/rds/recv.c index 78b5c430324d..e36652cfbd35 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -38,7 +38,6 @@ #include #include -#include "rds_single_path.h" #include "rds.h" void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, @@ -165,13 +164,18 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, struct rds_sock *rs = NULL; struct sock *sk; unsigned long flags; + struct rds_conn_path *cp; inc->i_conn = conn; inc->i_rx_jiffies = jiffies; + if (conn->c_trans->t_mp_capable) + cp = inc->i_conn_path; + else + cp = &conn->c_path[0]; rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u " "flags 0x%x rx_jiffies %lu\n", conn, - (unsigned long long)conn->c_next_rx_seq, + (unsigned long long)cp->cp_next_rx_seq, inc, (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence), be32_to_cpu(inc->i_hdr.h_len), @@ -200,12 +204,12 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, * XXX we could spend more on the wire to get more robust failure * detection, arguably worth it to avoid data corruption. */ - if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq && + if (be64_to_cpu(inc->i_hdr.h_sequence) < cp->cp_next_rx_seq && (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) { rds_stats_inc(s_recv_drop_old_seq); goto out; } - conn->c_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1; + cp->cp_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1; if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) { rds_stats_inc(s_recv_ping); -- cgit v1.2.3 From 5e833e025d9dc3f61c04e74936a14419efb6a032 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 13 Jun 2016 09:44:29 -0700 Subject: RDS: rds_inc_path_init() helper function for MP capable transports t_mp_capable transports can use rds_inc_path_init to initialize all fields in struct rds_incoming, including the i_conn_path. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/rds.h | 2 ++ net/rds/recv.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'net') diff --git a/net/rds/rds.h b/net/rds/rds.h index 7c85b2d792b6..c3b14ccd7037 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -764,6 +764,8 @@ void rds_page_exit(void); /* recv.c */ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, __be32 saddr); +void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *conn, + __be32 saddr); void rds_inc_put(struct rds_incoming *inc); void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, struct rds_incoming *inc, gfp_t gfp); diff --git a/net/rds/recv.c b/net/rds/recv.c index e36652cfbd35..6d7bd63121fc 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -53,6 +53,20 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, } EXPORT_SYMBOL_GPL(rds_inc_init); +void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp, + __be32 saddr) +{ + atomic_set(&inc->i_refcount, 1); + INIT_LIST_HEAD(&inc->i_item); + inc->i_conn = cp->cp_conn; + inc->i_conn_path = cp; + inc->i_saddr = saddr; + inc->i_rdma_cookie = 0; + inc->i_rx_tstamp.tv_sec = 0; + inc->i_rx_tstamp.tv_usec = 0; +} +EXPORT_SYMBOL_GPL(rds_inc_path_init); + static void rds_inc_addref(struct rds_incoming *inc) { rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount)); -- cgit v1.2.3 From 4e9b551c14560399776c05f4234650c6d3729458 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 13 Jun 2016 09:44:30 -0700 Subject: RDS: Add rds_send_path_reset() rds_send_path_reset() is the path specific version of rds_send_reset() intended for MP capable callers. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/send.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index a3b3b35ad57a..bfb3e0530213 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -63,14 +63,14 @@ static void rds_send_remove_from_sock(struct list_head *messages, int status); * Reset the send state. Callers must ensure that this doesn't race with * rds_send_xmit(). */ -void rds_send_reset(struct rds_connection *conn) +static void rds_send_path_reset(struct rds_conn_path *cp) { struct rds_message *rm, *tmp; unsigned long flags; - if (conn->c_xmit_rm) { - rm = conn->c_xmit_rm; - conn->c_xmit_rm = NULL; + if (cp->cp_xmit_rm) { + rm = cp->cp_xmit_rm; + cp->cp_xmit_rm = NULL; /* Tell the user the RDMA op is no longer mapped by the * transport. This isn't entirely true (it's flushed out * independently) but as the connection is down, there's @@ -79,26 +79,31 @@ void rds_send_reset(struct rds_connection *conn) rds_message_put(rm); } - conn->c_xmit_sg = 0; - conn->c_xmit_hdr_off = 0; - conn->c_xmit_data_off = 0; - conn->c_xmit_atomic_sent = 0; - conn->c_xmit_rdma_sent = 0; - conn->c_xmit_data_sent = 0; + cp->cp_xmit_sg = 0; + cp->cp_xmit_hdr_off = 0; + cp->cp_xmit_data_off = 0; + cp->cp_xmit_atomic_sent = 0; + cp->cp_xmit_rdma_sent = 0; + cp->cp_xmit_data_sent = 0; - conn->c_map_queued = 0; + cp->cp_conn->c_map_queued = 0; - conn->c_unacked_packets = rds_sysctl_max_unacked_packets; - conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes; + cp->cp_unacked_packets = rds_sysctl_max_unacked_packets; + cp->cp_unacked_bytes = rds_sysctl_max_unacked_bytes; /* Mark messages as retransmissions, and move them to the send q */ - spin_lock_irqsave(&conn->c_lock, flags); - list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { + spin_lock_irqsave(&cp->cp_lock, flags); + list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) { set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags); } - list_splice_init(&conn->c_retrans, &conn->c_send_queue); - spin_unlock_irqrestore(&conn->c_lock, flags); + list_splice_init(&cp->cp_retrans, &cp->cp_send_queue); + spin_unlock_irqrestore(&cp->cp_lock, flags); +} + +void rds_send_reset(struct rds_connection *conn) +{ + rds_send_path_reset(&conn->c_path[0]); } EXPORT_SYMBOL_GPL(rds_send_reset); -- cgit v1.2.3 From 5c3d274c75fbcee8e1c919acf25c7feb19a31492 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 13 Jun 2016 09:44:31 -0700 Subject: RDS: Add rds_send_path_drop_acked() rds_send_path_drop_acked() is the path-specific version of rds_send_drop_acked() to be invoked by MP capable callers. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/rds.h | 2 ++ net/rds/send.c | 18 +++++++++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/rds/rds.h b/net/rds/rds.h index c3b14ccd7037..d94aa36cab93 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -786,6 +786,8 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); void rds_send_drop_acked(struct rds_connection *conn, u64 ack, is_acked_func is_acked); +void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack, + is_acked_func is_acked); int rds_send_pong(struct rds_connection *conn, __be16 dport); struct rds_message *rds_send_get_message(struct rds_connection *, struct rm_rdma_op *); diff --git a/net/rds/send.c b/net/rds/send.c index bfb3e0530213..3f6a96cb3b94 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -691,16 +691,16 @@ unlock_and_drop: * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked * checks the RDS_MSG_HAS_ACK_SEQ bit. */ -void rds_send_drop_acked(struct rds_connection *conn, u64 ack, - is_acked_func is_acked) +void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack, + is_acked_func is_acked) { struct rds_message *rm, *tmp; unsigned long flags; LIST_HEAD(list); - spin_lock_irqsave(&conn->c_lock, flags); + spin_lock_irqsave(&cp->cp_lock, flags); - list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { + list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) { if (!rds_send_is_acked(rm, ack, is_acked)) break; @@ -712,11 +712,19 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack, if (!list_empty(&list)) smp_mb__after_atomic(); - spin_unlock_irqrestore(&conn->c_lock, flags); + spin_unlock_irqrestore(&cp->cp_lock, flags); /* now remove the messages from the sock list as needed */ rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS); } +EXPORT_SYMBOL_GPL(rds_send_path_drop_acked); + +void rds_send_drop_acked(struct rds_connection *conn, u64 ack, + is_acked_func is_acked) +{ + WARN_ON(conn->c_trans->t_mp_capable); + rds_send_path_drop_acked(&conn->c_path[0], ack, is_acked); +} EXPORT_SYMBOL_GPL(rds_send_drop_acked); void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) -- cgit v1.2.3 From 7d885d0fc69abe22382fae5dddd84684333ab29b Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 13 Jun 2016 09:44:32 -0700 Subject: RDS: Remove stale function rds_send_get_message() The only caller of rds_send_get_message() was rds_iw_send_cq_comp_handler() which was removed as part of commit dcdede0406d3 ("RDS: Drop stale iWARP RDMA transport"), so remove rds_send_get_message() for the same reason. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/rds.h | 2 -- net/rds/send.c | 36 ------------------------------------ 2 files changed, 38 deletions(-) (limited to 'net') diff --git a/net/rds/rds.h b/net/rds/rds.h index d94aa36cab93..2cffd37a550f 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -789,8 +789,6 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack, void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack, is_acked_func is_acked); int rds_send_pong(struct rds_connection *conn, __be16 dport); -struct rds_message *rds_send_get_message(struct rds_connection *, - struct rm_rdma_op *); /* rdma.c */ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); diff --git a/net/rds/send.c b/net/rds/send.c index 3f6a96cb3b94..3fb280b75160 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -565,42 +565,6 @@ __rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) /* No need to wake the app - caller does this */ } -/* - * This is called from the IB send completion when we detect - * a RDMA operation that failed with remote access error. - * So speed is not an issue here. - */ -struct rds_message *rds_send_get_message(struct rds_connection *conn, - struct rm_rdma_op *op) -{ - struct rds_message *rm, *tmp, *found = NULL; - unsigned long flags; - - spin_lock_irqsave(&conn->c_lock, flags); - - list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { - if (&rm->rdma == op) { - atomic_inc(&rm->m_refcount); - found = rm; - goto out; - } - } - - list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { - if (&rm->rdma == op) { - atomic_inc(&rm->m_refcount); - found = rm; - break; - } - } - -out: - spin_unlock_irqrestore(&conn->c_lock, flags); - - return found; -} -EXPORT_SYMBOL_GPL(rds_send_get_message); - /* * This removes messages from the socket's list if they're on it. The list * argument must be private to the caller, we must be able to modify it -- cgit v1.2.3 From 780a6d9e16d1827eb97c2497d7814fe34d280c15 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 13 Jun 2016 09:44:33 -0700 Subject: RDS: Make rds_send_queue_rm() rds_conn_path aware Pass the rds_conn_path to rds_send_queue_rm, and use it to initialize the i_conn_path field in struct rds_incoming. This commit also makes rds_send_queue_rm() MP capable, because it now takes locks specific to the rds_conn_path passed in, instead of defaulting to the c_path[0] based defines from rds_single_path.h Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/send.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index 3fb280b75160..076ee413d21c 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -787,6 +787,7 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) * message from the flow with RDS_CANCEL_SENT_TO. */ static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn, + struct rds_conn_path *cp, struct rds_message *rm, __be16 sport, __be16 dport, int *queued) { @@ -830,13 +831,14 @@ static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn, trying to minimize the time we hold c_lock */ rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0); rm->m_inc.i_conn = conn; + rm->m_inc.i_conn_path = cp; rds_message_addref(rm); - spin_lock(&conn->c_lock); - rm->m_inc.i_hdr.h_sequence = cpu_to_be64(conn->c_next_tx_seq++); - list_add_tail(&rm->m_conn_item, &conn->c_send_queue); + spin_lock(&cp->cp_lock); + rm->m_inc.i_hdr.h_sequence = cpu_to_be64(cp->cp_next_tx_seq++); + list_add_tail(&rm->m_conn_item, &cp->cp_send_queue); set_bit(RDS_MSG_ON_CONN, &rm->m_flags); - spin_unlock(&conn->c_lock); + spin_unlock(&cp->cp_lock); rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n", rm, len, rs, rs->rs_snd_bytes, @@ -968,6 +970,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) int queued = 0, allocated_mr = 0; int nonblock = msg->msg_flags & MSG_DONTWAIT; long timeo = sock_sndtimeo(sk, nonblock); + struct rds_conn_path *cpath; /* Mirror Linux UDP mirror of BSD error message compatibility */ /* XXX: Perhaps MSG_MORE someday */ @@ -1074,7 +1077,9 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) goto out; } - while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port, + cpath = &conn->c_path[0]; + + while (!rds_send_queue_rm(rs, conn, cpath, rm, rs->rs_bound_port, dport, &queued)) { rds_stats_inc(s_send_queue_full); @@ -1084,7 +1089,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) } timeo = wait_event_interruptible_timeout(*sk_sleep(sk), - rds_send_queue_rm(rs, conn, rm, + rds_send_queue_rm(rs, conn, cpath, rm, rs->rs_bound_port, dport, &queued), -- cgit v1.2.3 From 1f9ecd7eacfd9ee52a114b87292bfe885aafdb1f Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 13 Jun 2016 09:44:34 -0700 Subject: RDS: Pass rds_conn_path to rds_send_xmit() Pass a struct rds_conn_path to rds_send_xmit so that MP capable transports can transmit packets on something other than c_path[0]. The eventual goal for MP capable transports is to hash the rds socket to a path based on the bound local address/port, and use this path as the argument to rds_send_xmit() Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/ib_cm.c | 2 +- net/rds/rds.h | 4 +- net/rds/send.c | 149 ++++++++++++++++++++++++++++++------------------------ net/rds/threads.c | 2 +- 4 files changed, 87 insertions(+), 70 deletions(-) (limited to 'net') diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 4de5a35f5c40..334287602b78 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -274,7 +274,7 @@ static void rds_ib_tasklet_fn_send(unsigned long data) if (rds_conn_up(conn) && (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) || test_bit(0, &conn->c_map_queued))) - rds_send_xmit(ic->conn); + rds_send_xmit(&ic->conn->c_path[0]); } static void poll_rcq(struct rds_ib_connection *ic, struct ib_cq *cq, diff --git a/net/rds/rds.h b/net/rds/rds.h index 2cffd37a550f..b6072eb05fb6 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -457,7 +457,9 @@ struct rds_transport { int (*conn_connect)(struct rds_connection *conn); void (*conn_shutdown)(struct rds_connection *conn); void (*xmit_prepare)(struct rds_connection *conn); + void (*xmit_path_prepare)(struct rds_conn_path *cp); void (*xmit_complete)(struct rds_connection *conn); + void (*xmit_path_complete)(struct rds_conn_path *cp); int (*xmit)(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op); @@ -780,7 +782,7 @@ void rds_inc_info_copy(struct rds_incoming *inc, /* send.c */ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len); void rds_send_reset(struct rds_connection *conn); -int rds_send_xmit(struct rds_connection *conn); +int rds_send_xmit(struct rds_conn_path *cp); struct sockaddr_in; void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); diff --git a/net/rds/send.c b/net/rds/send.c index 076ee413d21c..966311d135af 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -107,14 +107,14 @@ void rds_send_reset(struct rds_connection *conn) } EXPORT_SYMBOL_GPL(rds_send_reset); -static int acquire_in_xmit(struct rds_connection *conn) +static int acquire_in_xmit(struct rds_conn_path *cp) { - return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0; + return test_and_set_bit(RDS_IN_XMIT, &cp->cp_flags) == 0; } -static void release_in_xmit(struct rds_connection *conn) +static void release_in_xmit(struct rds_conn_path *cp) { - clear_bit(RDS_IN_XMIT, &conn->c_flags); + clear_bit(RDS_IN_XMIT, &cp->cp_flags); smp_mb__after_atomic(); /* * We don't use wait_on_bit()/wake_up_bit() because our waking is in a @@ -122,8 +122,8 @@ static void release_in_xmit(struct rds_connection *conn) * the system-wide hashed waitqueue buckets in the fast path only to * almost never find waiters. */ - if (waitqueue_active(&conn->c_waitq)) - wake_up_all(&conn->c_waitq); + if (waitqueue_active(&cp->cp_waitq)) + wake_up_all(&cp->cp_waitq); } /* @@ -140,8 +140,9 @@ static void release_in_xmit(struct rds_connection *conn) * - small message latency is higher behind queued large messages * - large message latency isn't starved by intervening small sends */ -int rds_send_xmit(struct rds_connection *conn) +int rds_send_xmit(struct rds_conn_path *cp) { + struct rds_connection *conn = cp->cp_conn; struct rds_message *rm; unsigned long flags; unsigned int tmp; @@ -161,7 +162,7 @@ restart: * avoids blocking the caller and trading per-connection data between * caches per message. */ - if (!acquire_in_xmit(conn)) { + if (!acquire_in_xmit(cp)) { rds_stats_inc(s_send_lock_contention); ret = -ENOMEM; goto out; @@ -175,21 +176,25 @@ restart: * The acquire_in_xmit() check above ensures that only one * caller can increment c_send_gen at any time. */ - conn->c_send_gen++; - send_gen = conn->c_send_gen; + cp->cp_send_gen++; + send_gen = cp->cp_send_gen; /* * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT, * we do the opposite to avoid races. */ - if (!rds_conn_up(conn)) { - release_in_xmit(conn); + if (!rds_conn_path_up(cp)) { + release_in_xmit(cp); ret = 0; goto out; } - if (conn->c_trans->xmit_prepare) + if (conn->c_trans->t_mp_capable) { + if (conn->c_trans->xmit_path_prepare) + conn->c_trans->xmit_path_prepare(cp); + } else if (conn->c_trans->xmit_prepare) { conn->c_trans->xmit_prepare(conn); + } /* * spin trying to push headers and data down the connection until @@ -197,7 +202,7 @@ restart: */ while (1) { - rm = conn->c_xmit_rm; + rm = cp->cp_xmit_rm; /* * If between sending messages, we can send a pending congestion @@ -210,14 +215,16 @@ restart: break; } rm->data.op_active = 1; + rm->m_inc.i_conn_path = cp; + rm->m_inc.i_conn = cp->cp_conn; - conn->c_xmit_rm = rm; + cp->cp_xmit_rm = rm; } /* * If not already working on one, grab the next message. * - * c_xmit_rm holds a ref while we're sending this message down + * cp_xmit_rm holds a ref while we're sending this message down * the connction. We can use this ref while holding the * send_sem.. rds_send_reset() is serialized with it. */ @@ -234,10 +241,10 @@ restart: if (batch_count >= send_batch_count) goto over_batch; - spin_lock_irqsave(&conn->c_lock, flags); + spin_lock_irqsave(&cp->cp_lock, flags); - if (!list_empty(&conn->c_send_queue)) { - rm = list_entry(conn->c_send_queue.next, + if (!list_empty(&cp->cp_send_queue)) { + rm = list_entry(cp->cp_send_queue.next, struct rds_message, m_conn_item); rds_message_addref(rm); @@ -246,10 +253,11 @@ restart: * Move the message from the send queue to the retransmit * list right away. */ - list_move_tail(&rm->m_conn_item, &conn->c_retrans); + list_move_tail(&rm->m_conn_item, + &cp->cp_retrans); } - spin_unlock_irqrestore(&conn->c_lock, flags); + spin_unlock_irqrestore(&cp->cp_lock, flags); if (!rm) break; @@ -263,32 +271,34 @@ restart: */ if (rm->rdma.op_active && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { - spin_lock_irqsave(&conn->c_lock, flags); + spin_lock_irqsave(&cp->cp_lock, flags); if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) list_move(&rm->m_conn_item, &to_be_dropped); - spin_unlock_irqrestore(&conn->c_lock, flags); + spin_unlock_irqrestore(&cp->cp_lock, flags); continue; } /* Require an ACK every once in a while */ len = ntohl(rm->m_inc.i_hdr.h_len); - if (conn->c_unacked_packets == 0 || - conn->c_unacked_bytes < len) { + if (cp->cp_unacked_packets == 0 || + cp->cp_unacked_bytes < len) { __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); - conn->c_unacked_packets = rds_sysctl_max_unacked_packets; - conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes; + cp->cp_unacked_packets = + rds_sysctl_max_unacked_packets; + cp->cp_unacked_bytes = + rds_sysctl_max_unacked_bytes; rds_stats_inc(s_send_ack_required); } else { - conn->c_unacked_bytes -= len; - conn->c_unacked_packets--; + cp->cp_unacked_bytes -= len; + cp->cp_unacked_packets--; } - conn->c_xmit_rm = rm; + cp->cp_xmit_rm = rm; } /* The transport either sends the whole rdma or none of it */ - if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) { + if (rm->rdma.op_active && !cp->cp_xmit_rdma_sent) { rm->m_final_op = &rm->rdma; /* The transport owns the mapped memory for now. * You can't unmap it while it's on the send queue @@ -300,11 +310,11 @@ restart: wake_up_interruptible(&rm->m_flush_wait); break; } - conn->c_xmit_rdma_sent = 1; + cp->cp_xmit_rdma_sent = 1; } - if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) { + if (rm->atomic.op_active && !cp->cp_xmit_atomic_sent) { rm->m_final_op = &rm->atomic; /* The transport owns the mapped memory for now. * You can't unmap it while it's on the send queue @@ -316,7 +326,7 @@ restart: wake_up_interruptible(&rm->m_flush_wait); break; } - conn->c_xmit_atomic_sent = 1; + cp->cp_xmit_atomic_sent = 1; } @@ -342,41 +352,42 @@ restart: rm->data.op_active = 0; } - if (rm->data.op_active && !conn->c_xmit_data_sent) { + if (rm->data.op_active && !cp->cp_xmit_data_sent) { rm->m_final_op = &rm->data; + ret = conn->c_trans->xmit(conn, rm, - conn->c_xmit_hdr_off, - conn->c_xmit_sg, - conn->c_xmit_data_off); + cp->cp_xmit_hdr_off, + cp->cp_xmit_sg, + cp->cp_xmit_data_off); if (ret <= 0) break; - if (conn->c_xmit_hdr_off < sizeof(struct rds_header)) { + if (cp->cp_xmit_hdr_off < sizeof(struct rds_header)) { tmp = min_t(int, ret, sizeof(struct rds_header) - - conn->c_xmit_hdr_off); - conn->c_xmit_hdr_off += tmp; + cp->cp_xmit_hdr_off); + cp->cp_xmit_hdr_off += tmp; ret -= tmp; } - sg = &rm->data.op_sg[conn->c_xmit_sg]; + sg = &rm->data.op_sg[cp->cp_xmit_sg]; while (ret) { tmp = min_t(int, ret, sg->length - - conn->c_xmit_data_off); - conn->c_xmit_data_off += tmp; + cp->cp_xmit_data_off); + cp->cp_xmit_data_off += tmp; ret -= tmp; - if (conn->c_xmit_data_off == sg->length) { - conn->c_xmit_data_off = 0; + if (cp->cp_xmit_data_off == sg->length) { + cp->cp_xmit_data_off = 0; sg++; - conn->c_xmit_sg++; - BUG_ON(ret != 0 && - conn->c_xmit_sg == rm->data.op_nents); + cp->cp_xmit_sg++; + BUG_ON(ret != 0 && cp->cp_xmit_sg == + rm->data.op_nents); } } - if (conn->c_xmit_hdr_off == sizeof(struct rds_header) && - (conn->c_xmit_sg == rm->data.op_nents)) - conn->c_xmit_data_sent = 1; + if (cp->cp_xmit_hdr_off == sizeof(struct rds_header) && + (cp->cp_xmit_sg == rm->data.op_nents)) + cp->cp_xmit_data_sent = 1; } /* @@ -384,23 +395,27 @@ restart: * if there is a data op. Thus, if the data is sent (or there was * none), then we're done with the rm. */ - if (!rm->data.op_active || conn->c_xmit_data_sent) { - conn->c_xmit_rm = NULL; - conn->c_xmit_sg = 0; - conn->c_xmit_hdr_off = 0; - conn->c_xmit_data_off = 0; - conn->c_xmit_rdma_sent = 0; - conn->c_xmit_atomic_sent = 0; - conn->c_xmit_data_sent = 0; + if (!rm->data.op_active || cp->cp_xmit_data_sent) { + cp->cp_xmit_rm = NULL; + cp->cp_xmit_sg = 0; + cp->cp_xmit_hdr_off = 0; + cp->cp_xmit_data_off = 0; + cp->cp_xmit_rdma_sent = 0; + cp->cp_xmit_atomic_sent = 0; + cp->cp_xmit_data_sent = 0; rds_message_put(rm); } } over_batch: - if (conn->c_trans->xmit_complete) + if (conn->c_trans->t_mp_capable) { + if (conn->c_trans->xmit_path_complete) + conn->c_trans->xmit_path_complete(cp); + } else if (conn->c_trans->xmit_complete) { conn->c_trans->xmit_complete(conn); - release_in_xmit(conn); + } + release_in_xmit(cp); /* Nuke any messages we decided not to retransmit. */ if (!list_empty(&to_be_dropped)) { @@ -428,12 +443,12 @@ over_batch: if (ret == 0) { smp_mb(); if ((test_bit(0, &conn->c_map_queued) || - !list_empty(&conn->c_send_queue)) && - send_gen == conn->c_send_gen) { + !list_empty(&cp->cp_send_queue)) && + send_gen == cp->cp_send_gen) { rds_stats_inc(s_send_lock_queue_raced); if (batch_count < send_batch_count) goto restart; - queue_delayed_work(rds_wq, &conn->c_send_w, 1); + queue_delayed_work(rds_wq, &cp->cp_send_w, 1); } } out: @@ -1110,9 +1125,9 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) */ rds_stats_inc(s_send_queued); - ret = rds_send_xmit(conn); + ret = rds_send_xmit(cpath); if (ret == -ENOMEM || ret == -EAGAIN) - queue_delayed_work(rds_wq, &conn->c_send_w, 1); + queue_delayed_work(rds_wq, &cpath->cp_send_w, 1); rds_message_put(rm); return payload_len; diff --git a/net/rds/threads.c b/net/rds/threads.c index 6d0979b8dc63..50d26576dee7 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -177,7 +177,7 @@ void rds_send_worker(struct work_struct *work) if (rds_conn_path_state(cp) == RDS_CONN_UP) { clear_bit(RDS_LL_SEND_FULL, &cp->cp_flags); - ret = rds_send_xmit(cp->cp_conn); + ret = rds_send_xmit(cp); cond_resched(); rdsdebug("conn %p ret %d\n", cp->cp_conn, ret); switch (ret) { -- cgit v1.2.3 From 01ff34ed44a48ed0ae875291b4b6b7dc9ebeea69 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 13 Jun 2016 09:44:35 -0700 Subject: RDS: Extract rds_conn_path from i_conn_path in rds_send_drop_to() for MP-capable transports Explicitly set up rds_conn_path, either from i_conn_path (for MP capable transpots) or as c_path[0], and use this in rds_send_drop_to() Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/send.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index 966311d135af..9c34fd204639 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -710,6 +710,7 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) { struct rds_message *rm, *tmp; struct rds_connection *conn; + struct rds_conn_path *cp; unsigned long flags; LIST_HEAD(list); @@ -738,22 +739,26 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) list_for_each_entry(rm, &list, m_sock_item) { conn = rm->m_inc.i_conn; + if (conn->c_trans->t_mp_capable) + cp = rm->m_inc.i_conn_path; + else + cp = &conn->c_path[0]; - spin_lock_irqsave(&conn->c_lock, flags); + spin_lock_irqsave(&cp->cp_lock, flags); /* * Maybe someone else beat us to removing rm from the conn. * If we race with their flag update we'll get the lock and * then really see that the flag has been cleared. */ if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { - spin_unlock_irqrestore(&conn->c_lock, flags); + spin_unlock_irqrestore(&cp->cp_lock, flags); spin_lock_irqsave(&rm->m_rs_lock, flags); rm->m_rs = NULL; spin_unlock_irqrestore(&rm->m_rs_lock, flags); continue; } list_del_init(&rm->m_conn_item); - spin_unlock_irqrestore(&conn->c_lock, flags); + spin_unlock_irqrestore(&cp->cp_lock, flags); /* * Couldn't grab m_rs_lock in top loop (lock ordering), -- cgit v1.2.3 From 45997e9e2e01d76607d70461414f66f51487bfe5 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 13 Jun 2016 09:44:36 -0700 Subject: RDS: Make rds_send_pong() take a rds_conn_path argument This commit allows rds_send_pong() callers to send back the rds pong message on some path other than c_path[0] by passing in a struct rds_conn_path * argument. It also removes the last dependency on the #defines in rds_single.h from send.c Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/rds.h | 2 +- net/rds/recv.c | 2 +- net/rds/send.c | 24 ++++++++++++------------ 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/rds/rds.h b/net/rds/rds.h index b6072eb05fb6..e31515195526 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -790,7 +790,7 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack, is_acked_func is_acked); void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack, is_acked_func is_acked); -int rds_send_pong(struct rds_connection *conn, __be16 dport); +int rds_send_pong(struct rds_conn_path *cp, __be16 dport); /* rdma.c */ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); diff --git a/net/rds/recv.c b/net/rds/recv.c index 6d7bd63121fc..b58f50571782 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -227,7 +227,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) { rds_stats_inc(s_recv_ping); - rds_send_pong(conn, inc->i_hdr.h_sport); + rds_send_pong(cp, inc->i_hdr.h_sport); goto out; } diff --git a/net/rds/send.c b/net/rds/send.c index 9c34fd204639..e614513150fe 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -40,7 +40,6 @@ #include #include -#include "rds_single_path.h" #include "rds.h" /* When transmitting messages in rds_send_xmit, we need to emerge from @@ -1153,7 +1152,7 @@ out: * Reply to a ping packet. */ int -rds_send_pong(struct rds_connection *conn, __be16 dport) +rds_send_pong(struct rds_conn_path *cp, __be16 dport) { struct rds_message *rm; unsigned long flags; @@ -1165,31 +1164,32 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) goto out; } - rm->m_daddr = conn->c_faddr; + rm->m_daddr = cp->cp_conn->c_faddr; rm->data.op_active = 1; - rds_conn_connect_if_down(conn); + rds_conn_connect_if_down(cp->cp_conn); - ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL); + ret = rds_cong_wait(cp->cp_conn->c_fcong, dport, 1, NULL); if (ret) goto out; - spin_lock_irqsave(&conn->c_lock, flags); - list_add_tail(&rm->m_conn_item, &conn->c_send_queue); + spin_lock_irqsave(&cp->cp_lock, flags); + list_add_tail(&rm->m_conn_item, &cp->cp_send_queue); set_bit(RDS_MSG_ON_CONN, &rm->m_flags); rds_message_addref(rm); - rm->m_inc.i_conn = conn; + rm->m_inc.i_conn = cp->cp_conn; + rm->m_inc.i_conn_path = cp; rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport, - conn->c_next_tx_seq); - conn->c_next_tx_seq++; - spin_unlock_irqrestore(&conn->c_lock, flags); + cp->cp_next_tx_seq); + cp->cp_next_tx_seq++; + spin_unlock_irqrestore(&cp->cp_lock, flags); rds_stats_inc(s_send_queued); rds_stats_inc(s_send_pong); /* schedule the send work on rds_wq */ - queue_delayed_work(rds_wq, &conn->c_send_w, 1); + queue_delayed_work(rds_wq, &cp->cp_send_w, 1); rds_message_put(rm); return 0; -- cgit v1.2.3 From 3c0a59001a416ec2a1c46576917732fe5b99336b Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 13 Jun 2016 09:44:37 -0700 Subject: RDS: Add rds_conn_path_connect_if_down() for MP-aware callers rds_conn_path_connect_if_down() works on the rds_conn_path that it is passed. Callers who are not t_m_capable may continue calling rds_conn_connect_if_down, which will invoke rds_conn_path_connect_if_down() with the default c_path[0]. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/connection.c | 12 +++++++++--- net/rds/rds.h | 1 + net/rds/send.c | 9 ++++----- 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 6fa2074044b9..953a426b25ab 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -572,11 +572,17 @@ EXPORT_SYMBOL_GPL(rds_conn_drop); * If the connection is down, trigger a connect. We may have scheduled a * delayed reconnect however - in this case we should not interfere. */ +void rds_conn_path_connect_if_down(struct rds_conn_path *cp) +{ + if (rds_conn_path_state(cp) == RDS_CONN_DOWN && + !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags)) + queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); +} + void rds_conn_connect_if_down(struct rds_connection *conn) { - if (rds_conn_state(conn) == RDS_CONN_DOWN && - !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) - queue_delayed_work(rds_wq, &conn->c_conn_w, 0); + WARN_ON(conn->c_trans->t_mp_capable); + rds_conn_path_connect_if_down(&conn->c_path[0]); } EXPORT_SYMBOL_GPL(rds_conn_connect_if_down); diff --git a/net/rds/rds.h b/net/rds/rds.h index e31515195526..74fcf5a28723 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -658,6 +658,7 @@ void rds_conn_destroy(struct rds_connection *conn); void rds_conn_drop(struct rds_connection *conn); void rds_conn_path_drop(struct rds_conn_path *cpath); void rds_conn_connect_if_down(struct rds_connection *conn); +void rds_conn_path_connect_if_down(struct rds_conn_path *cp); void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, diff --git a/net/rds/send.c b/net/rds/send.c index e614513150fe..369bd6690218 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1088,16 +1088,15 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) goto out; } - rds_conn_connect_if_down(conn); + cpath = &conn->c_path[0]; + + rds_conn_path_connect_if_down(cpath); ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs); if (ret) { rs->rs_seen_congestion = 1; goto out; } - - cpath = &conn->c_path[0]; - while (!rds_send_queue_rm(rs, conn, cpath, rm, rs->rs_bound_port, dport, &queued)) { rds_stats_inc(s_send_queue_full); @@ -1167,7 +1166,7 @@ rds_send_pong(struct rds_conn_path *cp, __be16 dport) rm->m_daddr = cp->cp_conn->c_faddr; rm->data.op_active = 1; - rds_conn_connect_if_down(cp->cp_conn); + rds_conn_path_connect_if_down(cp); ret = rds_cong_wait(cp->cp_conn->c_fcong, dport, 1, NULL); if (ret) -- cgit v1.2.3 From 992c9ec5fe1d0c7859e158ee22f293cbee95c6a3 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 13 Jun 2016 09:44:38 -0700 Subject: RDS: update rds-info related functions to traverse multiple conn_paths This commit updates the callbacks related to the rds-info command so that they walk through all the rds_conn_path structures and report the requested info. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/connection.c | 109 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 82 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 953a426b25ab..9e0b489aea41 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -400,6 +400,7 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, unsigned int total = 0; unsigned long flags; size_t i; + int j; len /= sizeof(struct rds_info_message); @@ -408,23 +409,32 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { hlist_for_each_entry_rcu(conn, head, c_hash_node) { - if (want_send) - list = &conn->c_send_queue; - else - list = &conn->c_retrans; - - spin_lock_irqsave(&conn->c_lock, flags); - - /* XXX too lazy to maintain counts.. */ - list_for_each_entry(rm, list, m_conn_item) { - total++; - if (total <= len) - rds_inc_info_copy(&rm->m_inc, iter, - conn->c_laddr, - conn->c_faddr, 0); + struct rds_conn_path *cp; + + for (j = 0; j < RDS_MPATH_WORKERS; j++) { + cp = &conn->c_path[j]; + if (want_send) + list = &cp->cp_send_queue; + else + list = &cp->cp_retrans; + + spin_lock_irqsave(&cp->cp_lock, flags); + + /* XXX too lazy to maintain counts.. */ + list_for_each_entry(rm, list, m_conn_item) { + total++; + if (total <= len) + rds_inc_info_copy(&rm->m_inc, + iter, + conn->c_laddr, + conn->c_faddr, + 0); + } + + spin_unlock_irqrestore(&cp->cp_lock, flags); + if (!conn->c_trans->t_mp_capable) + break; } - - spin_unlock_irqrestore(&conn->c_lock, flags); } } rcu_read_unlock(); @@ -486,27 +496,72 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, } EXPORT_SYMBOL_GPL(rds_for_each_conn_info); -static int rds_conn_info_visitor(struct rds_connection *conn, - void *buffer) +void rds_walk_conn_path_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int (*visitor)(struct rds_conn_path *, void *), + size_t item_len) +{ + u64 buffer[(item_len + 7) / 8]; + struct hlist_head *head; + struct rds_connection *conn; + size_t i; + int j; + + rcu_read_lock(); + + lens->nr = 0; + lens->each = item_len; + + for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); + i++, head++) { + hlist_for_each_entry_rcu(conn, head, c_hash_node) { + struct rds_conn_path *cp; + + for (j = 0; j < RDS_MPATH_WORKERS; j++) { + cp = &conn->c_path[j]; + + /* XXX no cp_lock usage.. */ + if (!visitor(cp, buffer)) + continue; + if (!conn->c_trans->t_mp_capable) + break; + } + + /* We copy as much as we can fit in the buffer, + * but we count all items so that the caller + * can resize the buffer. + */ + if (len >= item_len) { + rds_info_copy(iter, buffer, item_len); + len -= item_len; + } + lens->nr++; + } + } + rcu_read_unlock(); +} + +static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) { struct rds_info_connection *cinfo = buffer; - cinfo->next_tx_seq = conn->c_next_tx_seq; - cinfo->next_rx_seq = conn->c_next_rx_seq; - cinfo->laddr = conn->c_laddr; - cinfo->faddr = conn->c_faddr; - strncpy(cinfo->transport, conn->c_trans->t_name, + cinfo->next_tx_seq = cp->cp_next_tx_seq; + cinfo->next_rx_seq = cp->cp_next_rx_seq; + cinfo->laddr = cp->cp_conn->c_laddr; + cinfo->faddr = cp->cp_conn->c_faddr; + strncpy(cinfo->transport, cp->cp_conn->c_trans->t_name, sizeof(cinfo->transport)); cinfo->flags = 0; - rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags), + rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags), SENDING); /* XXX Future: return the state rather than these funky bits */ rds_conn_info_set(cinfo->flags, - atomic_read(&conn->c_state) == RDS_CONN_CONNECTING, + atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING, CONNECTING); rds_conn_info_set(cinfo->flags, - atomic_read(&conn->c_state) == RDS_CONN_UP, + atomic_read(&cp->cp_state) == RDS_CONN_UP, CONNECTED); return 1; } @@ -515,7 +570,7 @@ static void rds_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { - rds_for_each_conn_info(sock, len, iter, lens, + rds_walk_conn_path_info(sock, len, iter, lens, rds_conn_info_visitor, sizeof(struct rds_info_connection)); } -- cgit v1.2.3 From fb1b3dc43dabd4bf7b57b3d63fd2875a499c81f0 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 13 Jun 2016 09:44:39 -0700 Subject: RDS: Add rds_conn_path_error() rds_conn_path_error() is the MP-aware analog of rds_conn_error, to be used by multipath-capable callers. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/connection.c | 12 ++++++++++++ net/rds/rds.h | 4 ++++ net/rds/threads.c | 3 ++- 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 9e0b489aea41..57556e615ce2 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -655,3 +655,15 @@ __rds_conn_error(struct rds_connection *conn, const char *fmt, ...) rds_conn_drop(conn); } + +void +__rds_conn_path_error(struct rds_conn_path *cp, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vprintk(fmt, ap); + va_end(ap); + + rds_conn_path_drop(cp); +} diff --git a/net/rds/rds.h b/net/rds/rds.h index 74fcf5a28723..85f98bd88c1c 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -669,6 +669,10 @@ void __rds_conn_error(struct rds_connection *conn, const char *, ...); #define rds_conn_error(conn, fmt...) \ __rds_conn_error(conn, KERN_WARNING "RDS: " fmt) +void __rds_conn_path_error(struct rds_conn_path *cp, const char *, ...); +#define rds_conn_path_error(cp, fmt...) \ + __rds_conn_path_error(cp, KERN_WARNING "RDS: " fmt) + static inline int rds_conn_path_transition(struct rds_conn_path *cp, int old, int new) { diff --git a/net/rds/threads.c b/net/rds/threads.c index 50d26576dee7..94cca66ba5d6 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -163,7 +163,8 @@ void rds_connect_worker(struct work_struct *work) RDS_CONN_DOWN)) rds_queue_reconnect(cp); else - rds_conn_error(conn, "RDS: connect failed\n"); + rds_conn_path_error(cp, + "RDS: connect failed\n"); } } } -- cgit v1.2.3 From 1c5113cf796bb730abc1798a3649b61e9e022be6 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 13 Jun 2016 09:44:40 -0700 Subject: RDS: Initialize all RDS_MPATH_WORKERS in __rds_conn_create Add a for() loop in __rds_conn_create to initialize all the conn_paths, in preparate for MP capable transports. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/connection.c | 65 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 57556e615ce2..a99ac69f77ac 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -111,6 +111,32 @@ static void rds_conn_reset(struct rds_connection *conn) * reliability guarantees of RDS. */ } +static void __rds_conn_path_init(struct rds_connection *conn, + struct rds_conn_path *cp, bool is_outgoing) +{ + spin_lock_init(&cp->cp_lock); + cp->cp_next_tx_seq = 1; + init_waitqueue_head(&cp->cp_waitq); + INIT_LIST_HEAD(&cp->cp_send_queue); + INIT_LIST_HEAD(&cp->cp_retrans); + + cp->cp_conn = conn; + atomic_set(&cp->cp_state, RDS_CONN_DOWN); + cp->cp_send_gen = 0; + /* cp_outgoing is per-path. So we can only set it here + * for the single-path transports. + */ + if (!conn->c_trans->t_mp_capable) + cp->cp_outgoing = (is_outgoing ? 1 : 0); + cp->cp_reconnect_jiffies = 0; + INIT_DELAYED_WORK(&cp->cp_send_w, rds_send_worker); + INIT_DELAYED_WORK(&cp->cp_recv_w, rds_recv_worker); + INIT_DELAYED_WORK(&cp->cp_conn_w, rds_connect_worker); + INIT_WORK(&cp->cp_down_w, rds_shutdown_worker); + mutex_init(&cp->cp_cm_lock); + cp->cp_flags = 0; +} + /* * There is only every one 'conn' for a given pair of addresses in the * system at a time. They contain messages to be retransmitted and so @@ -154,14 +180,8 @@ static struct rds_connection *__rds_conn_create(struct net *net, INIT_HLIST_NODE(&conn->c_hash_node); conn->c_laddr = laddr; conn->c_faddr = faddr; - spin_lock_init(&conn->c_lock); - conn->c_next_tx_seq = 1; - conn->c_path[0].cp_conn = conn; - rds_conn_net_set(conn, net); - init_waitqueue_head(&conn->c_waitq); - INIT_LIST_HEAD(&conn->c_send_queue); - INIT_LIST_HEAD(&conn->c_retrans); + rds_conn_net_set(conn, net); ret = rds_cong_get_maps(conn); if (ret) { @@ -197,17 +217,6 @@ static struct rds_connection *__rds_conn_create(struct net *net, goto out; } - atomic_set(&conn->c_state, RDS_CONN_DOWN); - conn->c_send_gen = 0; - conn->c_path[0].cp_outgoing = (is_outgoing ? 1 : 0); - conn->c_reconnect_jiffies = 0; - INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker); - INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker); - INIT_DELAYED_WORK(&conn->c_conn_w, rds_connect_worker); - INIT_WORK(&conn->c_down_w, rds_shutdown_worker); - mutex_init(&conn->c_cm_lock); - conn->c_flags = 0; - rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n", conn, &laddr, &faddr, trans->t_name ? trans->t_name : "[unknown]", @@ -224,7 +233,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, if (parent) { /* Creating passive conn */ if (parent->c_passive) { - trans->conn_free(conn->c_transport_data); + trans->conn_free(conn->c_path[0].cp_transport_data); kmem_cache_free(rds_conn_slab, conn); conn = parent->c_passive; } else { @@ -238,10 +247,26 @@ static struct rds_connection *__rds_conn_create(struct net *net, found = rds_conn_lookup(net, head, laddr, faddr, trans); if (found) { - trans->conn_free(conn->c_transport_data); + struct rds_conn_path *cp; + int i; + + for (i = 0; i < RDS_MPATH_WORKERS; i++) { + cp = &conn->c_path[i]; + trans->conn_free(cp->cp_transport_data); + if (!trans->t_mp_capable) + break; + } kmem_cache_free(rds_conn_slab, conn); conn = found; } else { + int i; + + for (i = 0; i < RDS_MPATH_WORKERS; i++) { + __rds_conn_path_init(conn, &conn->c_path[i], + is_outgoing); + conn->c_path[i].cp_index = i; + } + hlist_add_head_rcu(&conn->c_hash_node, head); rds_cong_add_conn(conn); rds_conn_count++; -- cgit v1.2.3 From d769ef81d5b5932520fbefb02614a4380c132495 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 13 Jun 2016 09:44:41 -0700 Subject: RDS: Update rds_conn_shutdown to work with rds_conn_path This commit changes rds_conn_shutdown to take a rds_conn_path * argument, allowing it to shutdown paths other than c_path[0] for MP-capable transports. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/connection.c | 64 ++++++++++++++++++++++++++++++---------------------- net/rds/rds.h | 5 ++-- net/rds/send.c | 9 ++------ net/rds/tcp.c | 2 +- net/rds/threads.c | 2 +- 5 files changed, 44 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index a99ac69f77ac..a88d26fd8223 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -96,14 +96,16 @@ static struct rds_connection *rds_conn_lookup(struct net *net, * and receiving over this connection again in the future. It is up to * the transport to have serialized this call with its send and recv. */ -static void rds_conn_reset(struct rds_connection *conn) +static void rds_conn_path_reset(struct rds_conn_path *cp) { + struct rds_connection *conn = cp->cp_conn; + rdsdebug("connection %pI4 to %pI4 reset\n", &conn->c_laddr, &conn->c_faddr); rds_stats_inc(s_conn_reset); - rds_send_reset(conn); - conn->c_flags = 0; + rds_send_path_reset(cp); + cp->cp_flags = 0; /* Do not clear next_rx_seq here, else we cannot distinguish * retransmitted packets from new packets, and will hand all @@ -294,10 +296,12 @@ struct rds_connection *rds_conn_create_outgoing(struct net *net, } EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); -void rds_conn_shutdown(struct rds_connection *conn) +void rds_conn_shutdown(struct rds_conn_path *cp) { + struct rds_connection *conn = cp->cp_conn; + /* shut it down unless it's down already */ - if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) { + if (!rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_DOWN)) { /* * Quiesce the connection mgmt handlers before we start tearing * things down. We don't hold the mutex for the entire @@ -305,35 +309,41 @@ void rds_conn_shutdown(struct rds_connection *conn) * deadlocking with the CM handler. Instead, the CM event * handler is supposed to check for state DISCONNECTING */ - mutex_lock(&conn->c_cm_lock); - if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) - && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) { - rds_conn_error(conn, "shutdown called in state %d\n", - atomic_read(&conn->c_state)); - mutex_unlock(&conn->c_cm_lock); + mutex_lock(&cp->cp_cm_lock); + if (!rds_conn_path_transition(cp, RDS_CONN_UP, + RDS_CONN_DISCONNECTING) && + !rds_conn_path_transition(cp, RDS_CONN_ERROR, + RDS_CONN_DISCONNECTING)) { + rds_conn_path_error(cp, + "shutdown called in state %d\n", + atomic_read(&cp->cp_state)); + mutex_unlock(&cp->cp_cm_lock); return; } - mutex_unlock(&conn->c_cm_lock); + mutex_unlock(&cp->cp_cm_lock); - wait_event(conn->c_waitq, - !test_bit(RDS_IN_XMIT, &conn->c_flags)); - wait_event(conn->c_waitq, - !test_bit(RDS_RECV_REFILL, &conn->c_flags)); + wait_event(cp->cp_waitq, + !test_bit(RDS_IN_XMIT, &cp->cp_flags)); + wait_event(cp->cp_waitq, + !test_bit(RDS_RECV_REFILL, &cp->cp_flags)); - conn->c_trans->conn_shutdown(conn); - rds_conn_reset(conn); + if (!conn->c_trans->t_mp_capable) + conn->c_trans->conn_shutdown(conn); + else + conn->c_trans->conn_path_shutdown(cp); + rds_conn_path_reset(cp); - if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) { + if (!rds_conn_path_transition(cp, RDS_CONN_DISCONNECTING, + RDS_CONN_DOWN)) { /* This can happen - eg when we're in the middle of tearing * down the connection, and someone unloads the rds module. * Quite reproduceable with loopback connections. * Mostly harmless. */ - rds_conn_error(conn, - "%s: failed to transition to state DOWN, " - "current state is %d\n", - __func__, - atomic_read(&conn->c_state)); + rds_conn_path_error(cp, "%s: failed to transition " + "to state DOWN, current state " + "is %d\n", __func__, + atomic_read(&cp->cp_state)); return; } } @@ -342,13 +352,13 @@ void rds_conn_shutdown(struct rds_connection *conn) * The passive side of an IB loopback connection is never added * to the conn hash, so we never trigger a reconnect on this * conn - the reconnect is always triggered by the active peer. */ - cancel_delayed_work_sync(&conn->c_conn_w); + cancel_delayed_work_sync(&cp->cp_conn_w); rcu_read_lock(); if (!hlist_unhashed(&conn->c_hash_node)) { rcu_read_unlock(); if (conn->c_trans->t_type != RDS_TRANS_TCP || - conn->c_path[0].cp_outgoing == 1) - rds_queue_reconnect(&conn->c_path[0]); + cp->cp_outgoing == 1) + rds_queue_reconnect(cp); } else { rcu_read_unlock(); } diff --git a/net/rds/rds.h b/net/rds/rds.h index 85f98bd88c1c..2e35b738176f 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -456,6 +456,7 @@ struct rds_transport { void (*conn_free)(void *data); int (*conn_connect)(struct rds_connection *conn); void (*conn_shutdown)(struct rds_connection *conn); + void (*conn_path_shutdown)(struct rds_conn_path *conn); void (*xmit_prepare)(struct rds_connection *conn); void (*xmit_path_prepare)(struct rds_conn_path *cp); void (*xmit_complete)(struct rds_connection *conn); @@ -653,7 +654,7 @@ struct rds_connection *rds_conn_create(struct net *net, struct rds_connection *rds_conn_create_outgoing(struct net *net, __be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp); -void rds_conn_shutdown(struct rds_connection *conn); +void rds_conn_shutdown(struct rds_conn_path *cpath); void rds_conn_destroy(struct rds_connection *conn); void rds_conn_drop(struct rds_connection *conn); void rds_conn_path_drop(struct rds_conn_path *cpath); @@ -786,7 +787,7 @@ void rds_inc_info_copy(struct rds_incoming *inc, /* send.c */ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len); -void rds_send_reset(struct rds_connection *conn); +void rds_send_path_reset(struct rds_conn_path *conn); int rds_send_xmit(struct rds_conn_path *cp); struct sockaddr_in; void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); diff --git a/net/rds/send.c b/net/rds/send.c index 369bd6690218..ee43d6b2ea8f 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -62,7 +62,7 @@ static void rds_send_remove_from_sock(struct list_head *messages, int status); * Reset the send state. Callers must ensure that this doesn't race with * rds_send_xmit(). */ -static void rds_send_path_reset(struct rds_conn_path *cp) +void rds_send_path_reset(struct rds_conn_path *cp) { struct rds_message *rm, *tmp; unsigned long flags; @@ -99,12 +99,7 @@ static void rds_send_path_reset(struct rds_conn_path *cp) list_splice_init(&cp->cp_retrans, &cp->cp_send_queue); spin_unlock_irqrestore(&cp->cp_lock, flags); } - -void rds_send_reset(struct rds_connection *conn) -{ - rds_send_path_reset(&conn->c_path[0]); -} -EXPORT_SYMBOL_GPL(rds_send_reset); +EXPORT_SYMBOL_GPL(rds_send_path_reset); static int acquire_in_xmit(struct rds_conn_path *cp) { diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 4bc1c153e93a..0e757a0d7421 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -186,7 +186,7 @@ void rds_tcp_reset_callbacks(struct socket *sock, release_sock(osock->sk); sock_release(osock); newsock: - rds_send_reset(conn); + rds_send_path_reset(&conn->c_path[0]); lock_sock(sock->sk); write_lock_bh(&sock->sk->sk_callback_lock); tc->t_sock = sock; diff --git a/net/rds/threads.c b/net/rds/threads.c index 94cca66ba5d6..9fbe95bb14a9 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -225,7 +225,7 @@ void rds_shutdown_worker(struct work_struct *work) struct rds_conn_path, cp_down_w); - rds_conn_shutdown(cp->cp_conn); + rds_conn_shutdown(cp); } void rds_threads_exit(void) -- cgit v1.2.3 From 3ecc5693c02bb154bc8609c640eb862804c4aabb Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 13 Jun 2016 09:44:42 -0700 Subject: RDS: Update rds_conn_destroy to be MP capable Refactor rds_conn_destroy() so that the per-path dismantling is done in rds_conn_path_destroy, and then iterate as needed over rds_conn_path_destroy(). Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/connection.c | 59 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index a88d26fd8223..a4b07c899d89 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -36,7 +36,6 @@ #include #include -#include "rds_single_path.h" #include "rds.h" #include "loop.h" @@ -364,6 +363,34 @@ void rds_conn_shutdown(struct rds_conn_path *cp) } } +/* destroy a single rds_conn_path. rds_conn_destroy() iterates over + * all paths using rds_conn_path_destroy() + */ +static void rds_conn_path_destroy(struct rds_conn_path *cp) +{ + struct rds_message *rm, *rtmp; + + rds_conn_path_drop(cp); + flush_work(&cp->cp_down_w); + + /* make sure lingering queued work won't try to ref the conn */ + cancel_delayed_work_sync(&cp->cp_send_w); + cancel_delayed_work_sync(&cp->cp_recv_w); + + /* tear down queued messages */ + list_for_each_entry_safe(rm, rtmp, + &cp->cp_send_queue, + m_conn_item) { + list_del_init(&rm->m_conn_item); + BUG_ON(!list_empty(&rm->m_sock_item)); + rds_message_put(rm); + } + if (cp->cp_xmit_rm) + rds_message_put(cp->cp_xmit_rm); + + cp->cp_conn->c_trans->conn_free(cp->cp_transport_data); +} + /* * Stop and free a connection. * @@ -373,7 +400,6 @@ void rds_conn_shutdown(struct rds_conn_path *cp) */ void rds_conn_destroy(struct rds_connection *conn) { - struct rds_message *rm, *rtmp; unsigned long flags; rdsdebug("freeing conn %p for %pI4 -> " @@ -387,25 +413,19 @@ void rds_conn_destroy(struct rds_connection *conn) synchronize_rcu(); /* shut the connection down */ - rds_conn_drop(conn); - flush_work(&conn->c_down_w); - - /* make sure lingering queued work won't try to ref the conn */ - cancel_delayed_work_sync(&conn->c_send_w); - cancel_delayed_work_sync(&conn->c_recv_w); + if (!conn->c_trans->t_mp_capable) { + rds_conn_path_destroy(&conn->c_path[0]); + BUG_ON(!list_empty(&conn->c_path[0].cp_retrans)); + } else { + int i; + struct rds_conn_path *cp; - /* tear down queued messages */ - list_for_each_entry_safe(rm, rtmp, - &conn->c_send_queue, - m_conn_item) { - list_del_init(&rm->m_conn_item); - BUG_ON(!list_empty(&rm->m_sock_item)); - rds_message_put(rm); + for (i = 0; i < RDS_MPATH_WORKERS; i++) { + cp = &conn->c_path[i]; + rds_conn_path_destroy(cp); + BUG_ON(!list_empty(&cp->cp_retrans)); + } } - if (conn->c_xmit_rm) - rds_message_put(conn->c_xmit_rm); - - conn->c_trans->conn_free(conn->c_transport_data); /* * The congestion maps aren't freed up here. They're @@ -414,7 +434,6 @@ void rds_conn_destroy(struct rds_connection *conn) */ rds_cong_remove_conn(conn); - BUG_ON(!list_empty(&conn->c_retrans)); kmem_cache_free(rds_conn_slab, conn); spin_lock_irqsave(&rds_conn_lock, flags); -- cgit v1.2.3 From 95df1b16074ce1e5dc4129fa206afbac32663e06 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 13 Jun 2016 10:47:43 -0700 Subject: net_sched: remove internal use of TC_POLICE_* These should be gone when we removed CONFIG_NET_CLS_POLICE. We can not totally remove them since they are exposed to userspace. Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_ipt.c | 2 +- net/sched/sch_atm.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index e7c0f4d944a2..8998a3594e86 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -243,7 +243,7 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, default: net_notice_ratelimited("tc filter: Bogus netfilter code %d assume ACCEPT\n", ret); - result = TC_POLICE_OK; + result = TC_ACT_OK; break; } spin_unlock(&ipt->tcf_lock); diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 0785b239ddf9..e04ea6994d1c 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -366,7 +366,7 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch) int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; pr_debug("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); - result = TC_POLICE_OK; /* be nice to gcc */ + result = TC_ACT_OK; /* be nice to gcc */ flow = NULL; if (TC_H_MAJ(skb->priority) != sch->handle || !(flow = (struct atm_flow_data *)atm_tc_get(sch, skb->priority))) { @@ -403,7 +403,7 @@ done: case TC_ACT_SHOT: kfree_skb(skb); goto drop; - case TC_POLICE_RECLASSIFY: + case TC_ACT_RECLASSIFY: if (flow->excess) flow = flow->excess; else -- cgit v1.2.3 From d9fa17ef9f084c755332898c8243a396ea02d73e Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 13 Jun 2016 10:47:44 -0700 Subject: act_police: rename tcf_act_police_locate() to tcf_act_police_init() This function is just ->init(), rename it to make it obvious. Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_police.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/act_police.c b/net/sched/act_police.c index ff34dd3966eb..1e8ede3955f4 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -115,9 +115,9 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { [TCA_POLICE_RESULT] = { .type = NLA_U32 }, }; -static int tcf_act_police_locate(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, - int ovr, int bind) +static int tcf_act_police_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) { int ret = 0, err; struct nlattr *tb[TCA_POLICE_MAX + 1]; @@ -366,7 +366,7 @@ static struct tc_action_ops act_police_ops = { .owner = THIS_MODULE, .act = tcf_act_police, .dump = tcf_act_police_dump, - .init = tcf_act_police_locate, + .init = tcf_act_police_init, .walk = tcf_act_police_walker, .lookup = tcf_police_search, }; -- cgit v1.2.3 From be6e6707f6eec2048d9be608bc0ceecde5bd4cef Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 4 Apr 2016 14:00:32 +0100 Subject: rxrpc: Rework peer object handling to use hash table and RCU Rework peer object handling to use a hash table instead of a flat list and to use RCU. Peer objects are no longer destroyed by passing them to a workqueue to process, but rather are just passed to the RCU garbage collector as kfree'able objects. The hash function uses the local endpoint plus all the components of the remote address, except for the RxRPC service ID. Peers thus represent a UDP port on the remote machine as contacted by a UDP port on this machine. The RCU read lock is used to handle non-creating lookups so that they can be called from bottom half context in the sk_error_report handler without having to lock the hash table against modification. rxrpc_lookup_peer_rcu() *does* take a reference on the peer object as in the future, this will be passed to a work item for error distribution in the error_report path and this function will cease being used in the data_ready path. Creating lookups are done under spinlock rather than mutex as they might be set up due to an external stimulus if the local endpoint is a server. Captured network error messages (ICMP) are handled with respect to this struct and MTU size and RTT are cached here. Signed-off-by: David Howells --- net/rxrpc/Makefile | 3 +- net/rxrpc/af_rxrpc.c | 3 +- net/rxrpc/ar-internal.h | 46 ++++-- net/rxrpc/call_accept.c | 2 +- net/rxrpc/input.c | 13 +- net/rxrpc/peer_event.c | 59 +++++++- net/rxrpc/peer_object.c | 369 ++++++++++++++++++++++++------------------------ net/rxrpc/transport.c | 2 +- net/rxrpc/utils.c | 41 ++++++ 9 files changed, 335 insertions(+), 203 deletions(-) create mode 100644 net/rxrpc/utils.c (limited to 'net') diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index 7e1006a3bfa5..a6f6f21d8a59 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -20,7 +20,8 @@ af-rxrpc-y := \ recvmsg.o \ security.o \ skbuff.o \ - transport.o + transport.o \ + utils.o af-rxrpc-$(CONFIG_PROC_FS) += proc.o af-rxrpc-$(CONFIG_RXKAD) += rxkad.o diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index a1bcb0e17250..ba373caddbeb 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -244,7 +244,7 @@ struct rxrpc_transport *rxrpc_name_to_transport(struct rxrpc_sock *rx, return ERR_PTR(-EAFNOSUPPORT); /* find a remote transport endpoint from the local one */ - peer = rxrpc_get_peer(srx, gfp); + peer = rxrpc_lookup_peer(rx->local, srx, gfp); if (IS_ERR(peer)) return ERR_CAST(peer); @@ -835,7 +835,6 @@ static void __exit af_rxrpc_exit(void) rxrpc_destroy_all_calls(); rxrpc_destroy_all_connections(); rxrpc_destroy_all_transports(); - rxrpc_destroy_all_peers(); rxrpc_destroy_all_locals(); ASSERTCMP(atomic_read(&rxrpc_n_skbs), ==, 0); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 03919b9a8a31..7dba6677b9d5 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -9,7 +9,9 @@ * 2 of the License, or (at your option) any later version. */ +#include #include +#include #include #if 0 @@ -193,15 +195,16 @@ struct rxrpc_local { /* * RxRPC remote transport endpoint definition - * - matched by remote port, address and protocol type - * - holds the connection ID counter for connections between the two endpoints + * - matched by local endpoint, remote port, address and protocol type */ struct rxrpc_peer { - struct work_struct destroyer; /* peer destroyer */ - struct list_head link; /* link in master peer list */ + struct rcu_head rcu; /* This must be first */ + atomic_t usage; + unsigned long hash_key; + struct hlist_node hash_link; + struct rxrpc_local *local; struct list_head error_targets; /* targets for net error distribution */ spinlock_t lock; /* access lock */ - atomic_t usage; unsigned int if_mtu; /* interface MTU for this peer */ unsigned int mtu; /* network MTU for this peer */ unsigned int maxdata; /* data size (MTU - hdrsize) */ @@ -611,10 +614,29 @@ void rxrpc_UDP_error_handler(struct work_struct *); /* * peer_object.c */ -struct rxrpc_peer *rxrpc_get_peer(struct sockaddr_rxrpc *, gfp_t); -void rxrpc_put_peer(struct rxrpc_peer *); -struct rxrpc_peer *rxrpc_find_peer(struct rxrpc_local *, __be32, __be16); -void __exit rxrpc_destroy_all_peers(void); +struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *, + const struct sockaddr_rxrpc *); +struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *, + struct sockaddr_rxrpc *, gfp_t); +struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t); + +static inline void rxrpc_get_peer(struct rxrpc_peer *peer) +{ + atomic_inc(&peer->usage); +} + +static inline +struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer) +{ + return atomic_inc_not_zero(&peer->usage) ? peer : NULL; +} + +extern void __rxrpc_put_peer(struct rxrpc_peer *peer); +static inline void rxrpc_put_peer(struct rxrpc_peer *peer) +{ + if (atomic_dec_and_test(&peer->usage)) + __rxrpc_put_peer(peer); +} /* * proc.c @@ -672,6 +694,12 @@ void __exit rxrpc_destroy_all_transports(void); struct rxrpc_transport *rxrpc_find_transport(struct rxrpc_local *, struct rxrpc_peer *); +/* + * utils.c + */ +void rxrpc_get_addr_from_skb(struct rxrpc_local *, const struct sk_buff *, + struct sockaddr_rxrpc *); + /* * debug tracing */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index eea5f4a5d8b1..e5723f4dce89 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -95,7 +95,7 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local, rxrpc_new_skb(notification); notification->mark = RXRPC_SKB_MARK_NEW_CALL; - peer = rxrpc_get_peer(srx, GFP_NOIO); + peer = rxrpc_lookup_peer(local, srx, GFP_NOIO); if (IS_ERR(peer)) { _debug("no peer"); ret = -EBUSY; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index e0815a033999..3b405dbf3a05 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -635,14 +635,16 @@ static struct rxrpc_connection *rxrpc_conn_from_local(struct rxrpc_local *local, struct rxrpc_peer *peer; struct rxrpc_transport *trans; struct rxrpc_connection *conn; + struct sockaddr_rxrpc srx; - peer = rxrpc_find_peer(local, ip_hdr(skb)->saddr, - udp_hdr(skb)->source); + rxrpc_get_addr_from_skb(local, skb, &srx); + rcu_read_lock(); + peer = rxrpc_lookup_peer_rcu(local, &srx); if (IS_ERR(peer)) - goto cant_find_conn; + goto cant_find_peer; trans = rxrpc_find_transport(local, peer); - rxrpc_put_peer(peer); + rcu_read_unlock(); if (!trans) goto cant_find_conn; @@ -652,6 +654,9 @@ static struct rxrpc_connection *rxrpc_conn_from_local(struct rxrpc_local *local, goto cant_find_conn; return conn; + +cant_find_peer: + rcu_read_unlock(); cant_find_conn: return NULL; } diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 3e82d6f0313c..24f5ec0fcd20 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -22,6 +22,55 @@ #include #include "ar-internal.h" +/* + * Find the peer associated with an ICMP packet. + */ +static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, + const struct sk_buff *skb) +{ + struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + struct sockaddr_rxrpc srx; + + _enter(""); + + memset(&srx, 0, sizeof(srx)); + srx.transport_type = local->srx.transport_type; + srx.transport.family = local->srx.transport.family; + + /* Can we see an ICMP4 packet on an ICMP6 listening socket? and vice + * versa? + */ + switch (srx.transport.family) { + case AF_INET: + srx.transport.sin.sin_port = serr->port; + srx.transport_len = sizeof(struct sockaddr_in); + switch (serr->ee.ee_origin) { + case SO_EE_ORIGIN_ICMP: + _net("Rx ICMP"); + memcpy(&srx.transport.sin.sin_addr, + skb_network_header(skb) + serr->addr_offset, + sizeof(struct in_addr)); + break; + case SO_EE_ORIGIN_ICMP6: + _net("Rx ICMP6 on v4 sock"); + memcpy(&srx.transport.sin.sin_addr, + skb_network_header(skb) + serr->addr_offset + 12, + sizeof(struct in_addr)); + break; + default: + memcpy(&srx.transport.sin.sin_addr, &ip_hdr(skb)->saddr, + sizeof(struct in_addr)); + break; + } + break; + + default: + BUG(); + } + + return rxrpc_lookup_peer_rcu(local, &srx); +} + /* * handle an error received on the local endpoint */ @@ -57,8 +106,12 @@ void rxrpc_UDP_error_report(struct sock *sk) _net("Rx UDP Error from %pI4:%hu", &addr, ntohs(port)); _debug("Msg l:%d d:%d", skb->len, skb->data_len); - peer = rxrpc_find_peer(local, addr, port); - if (IS_ERR(peer)) { + rcu_read_lock(); + peer = rxrpc_lookup_peer_icmp_rcu(local, skb); + if (peer && !rxrpc_get_peer_maybe(peer)) + peer = NULL; + if (!peer) { + rcu_read_unlock(); rxrpc_free_skb(skb); _leave(" [no peer]"); return; @@ -66,6 +119,7 @@ void rxrpc_UDP_error_report(struct sock *sk) trans = rxrpc_find_transport(local, peer); if (!trans) { + rcu_read_unlock(); rxrpc_put_peer(peer); rxrpc_free_skb(skb); _leave(" [no trans]"); @@ -110,6 +164,7 @@ void rxrpc_UDP_error_report(struct sock *sk) } } + rcu_read_unlock(); rxrpc_put_peer(peer); /* pass the transport ref to error_handler to release */ diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 0b54cda3d8e5..7fc50dc7d333 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -1,6 +1,6 @@ -/* RxRPC remote transport endpoint management +/* RxRPC remote transport endpoint record management * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -16,20 +16,132 @@ #include #include #include -#include -#include #include +#include #include #include #include #include #include "ar-internal.h" -static LIST_HEAD(rxrpc_peers); -static DEFINE_RWLOCK(rxrpc_peer_lock); -static DECLARE_WAIT_QUEUE_HEAD(rxrpc_peer_wq); +static DEFINE_HASHTABLE(rxrpc_peer_hash, 10); +static DEFINE_SPINLOCK(rxrpc_peer_hash_lock); -static void rxrpc_destroy_peer(struct work_struct *work); +/* + * Hash a peer key. + */ +static unsigned long rxrpc_peer_hash_key(struct rxrpc_local *local, + const struct sockaddr_rxrpc *srx) +{ + const u16 *p; + unsigned int i, size; + unsigned long hash_key; + + _enter(""); + + hash_key = (unsigned long)local / __alignof__(*local); + hash_key += srx->transport_type; + hash_key += srx->transport_len; + hash_key += srx->transport.family; + + switch (srx->transport.family) { + case AF_INET: + hash_key += (u16 __force)srx->transport.sin.sin_port; + size = sizeof(srx->transport.sin.sin_addr); + p = (u16 *)&srx->transport.sin.sin_addr; + break; + } + + /* Step through the peer address in 16-bit portions for speed */ + for (i = 0; i < size; i += sizeof(*p), p++) + hash_key += *p; + + _leave(" 0x%lx", hash_key); + return hash_key; +} + +/* + * Compare a peer to a key. Return -ve, 0 or +ve to indicate less than, same + * or greater than. + * + * Unfortunately, the primitives in linux/hashtable.h don't allow for sorted + * buckets and mid-bucket insertion, so we don't make full use of this + * information at this point. + */ +static long rxrpc_peer_cmp_key(const struct rxrpc_peer *peer, + struct rxrpc_local *local, + const struct sockaddr_rxrpc *srx, + unsigned long hash_key) +{ + long diff; + + diff = ((peer->hash_key - hash_key) ?: + ((unsigned long)peer->local - (unsigned long)local) ?: + (peer->srx.transport_type - srx->transport_type) ?: + (peer->srx.transport_len - srx->transport_len) ?: + (peer->srx.transport.family - srx->transport.family)); + if (diff != 0) + return diff; + + switch (srx->transport.family) { + case AF_INET: + return ((u16 __force)peer->srx.transport.sin.sin_port - + (u16 __force)srx->transport.sin.sin_port) ?: + memcmp(&peer->srx.transport.sin.sin_addr, + &srx->transport.sin.sin_addr, + sizeof(struct in_addr)); + default: + BUG(); + } +} + +/* + * Look up a remote transport endpoint for the specified address using RCU. + */ +static struct rxrpc_peer *__rxrpc_lookup_peer_rcu( + struct rxrpc_local *local, + const struct sockaddr_rxrpc *srx, + unsigned long hash_key) +{ + struct rxrpc_peer *peer; + + hash_for_each_possible_rcu(rxrpc_peer_hash, peer, hash_link, hash_key) { + if (rxrpc_peer_cmp_key(peer, local, srx, hash_key) == 0) { + if (atomic_read(&peer->usage) == 0) + return NULL; + return peer; + } + } + + return NULL; +} + +/* + * Look up a remote transport endpoint for the specified address using RCU. + */ +struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *local, + const struct sockaddr_rxrpc *srx) +{ + struct rxrpc_peer *peer; + unsigned long hash_key = rxrpc_peer_hash_key(local, srx); + + peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); + if (peer) { + switch (srx->transport.family) { + case AF_INET: + _net("PEER %d {%d,%u,%pI4+%hu}", + peer->debug_id, + peer->srx.transport_type, + peer->srx.transport.family, + &peer->srx.transport.sin.sin_addr, + ntohs(peer->srx.transport.sin.sin_port)); + break; + } + + _leave(" = %p {u=%d}", peer, atomic_read(&peer->usage)); + } + return peer; +} /* * assess the MTU size for the network interface through which this peer is @@ -58,10 +170,9 @@ static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer) } /* - * allocate a new peer + * Allocate a peer. */ -static struct rxrpc_peer *rxrpc_alloc_peer(struct sockaddr_rxrpc *srx, - gfp_t gfp) +struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) { struct rxrpc_peer *peer; @@ -69,12 +180,32 @@ static struct rxrpc_peer *rxrpc_alloc_peer(struct sockaddr_rxrpc *srx, peer = kzalloc(sizeof(struct rxrpc_peer), gfp); if (peer) { - INIT_WORK(&peer->destroyer, &rxrpc_destroy_peer); - INIT_LIST_HEAD(&peer->link); + atomic_set(&peer->usage, 1); + peer->local = local; INIT_LIST_HEAD(&peer->error_targets); spin_lock_init(&peer->lock); - atomic_set(&peer->usage, 1); peer->debug_id = atomic_inc_return(&rxrpc_debug_id); + } + + _leave(" = %p", peer); + return peer; +} + +/* + * Set up a new peer. + */ +static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local, + struct sockaddr_rxrpc *srx, + unsigned long hash_key, + gfp_t gfp) +{ + struct rxrpc_peer *peer; + + _enter(""); + + peer = rxrpc_alloc_peer(local, gfp); + if (peer) { + peer->hash_key = hash_key; memcpy(&peer->srx, srx, sizeof(*srx)); rxrpc_assess_MTU_size(peer); @@ -105,11 +236,11 @@ static struct rxrpc_peer *rxrpc_alloc_peer(struct sockaddr_rxrpc *srx, /* * obtain a remote transport endpoint for the specified address */ -struct rxrpc_peer *rxrpc_get_peer(struct sockaddr_rxrpc *srx, gfp_t gfp) +struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, + struct sockaddr_rxrpc *srx, gfp_t gfp) { struct rxrpc_peer *peer, *candidate; - const char *new = "old"; - int usage; + unsigned long hash_key = rxrpc_peer_hash_key(local, srx); _enter("{%d,%d,%pI4+%hu}", srx->transport_type, @@ -118,188 +249,60 @@ struct rxrpc_peer *rxrpc_get_peer(struct sockaddr_rxrpc *srx, gfp_t gfp) ntohs(srx->transport.sin.sin_port)); /* search the peer list first */ - read_lock_bh(&rxrpc_peer_lock); - list_for_each_entry(peer, &rxrpc_peers, link) { - _debug("check PEER %d { u=%d t=%d l=%d }", - peer->debug_id, - atomic_read(&peer->usage), - peer->srx.transport_type, - peer->srx.transport_len); - - if (atomic_read(&peer->usage) > 0 && - peer->srx.transport_type == srx->transport_type && - peer->srx.transport_len == srx->transport_len && - memcmp(&peer->srx.transport, - &srx->transport, - srx->transport_len) == 0) - goto found_extant_peer; - } - read_unlock_bh(&rxrpc_peer_lock); - - /* not yet present - create a candidate for a new record and then - * redo the search */ - candidate = rxrpc_alloc_peer(srx, gfp); - if (!candidate) { - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); - } + rcu_read_lock(); + peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); + if (peer && !rxrpc_get_peer_maybe(peer)) + peer = NULL; + rcu_read_unlock(); + + if (!peer) { + /* The peer is not yet present in hash - create a candidate + * for a new record and then redo the search. + */ + candidate = rxrpc_create_peer(local, srx, hash_key, gfp); + if (!candidate) { + _leave(" = NULL [nomem]"); + return NULL; + } - write_lock_bh(&rxrpc_peer_lock); + spin_lock(&rxrpc_peer_hash_lock); - list_for_each_entry(peer, &rxrpc_peers, link) { - if (atomic_read(&peer->usage) > 0 && - peer->srx.transport_type == srx->transport_type && - peer->srx.transport_len == srx->transport_len && - memcmp(&peer->srx.transport, - &srx->transport, - srx->transport_len) == 0) - goto found_extant_second; - } + /* Need to check that we aren't racing with someone else */ + peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); + if (peer && !rxrpc_get_peer_maybe(peer)) + peer = NULL; + if (!peer) + hash_add_rcu(rxrpc_peer_hash, + &candidate->hash_link, hash_key); - /* we can now add the new candidate to the list */ - peer = candidate; - candidate = NULL; - usage = atomic_read(&peer->usage); + spin_unlock(&rxrpc_peer_hash_lock); - list_add_tail(&peer->link, &rxrpc_peers); - write_unlock_bh(&rxrpc_peer_lock); - new = "new"; + if (peer) + kfree(candidate); + else + peer = candidate; + } -success: - _net("PEER %s %d {%d,%u,%pI4+%hu}", - new, + _net("PEER %d {%d,%pI4+%hu}", peer->debug_id, peer->srx.transport_type, - peer->srx.transport.family, &peer->srx.transport.sin.sin_addr, ntohs(peer->srx.transport.sin.sin_port)); - _leave(" = %p {u=%d}", peer, usage); - return peer; - - /* we found the peer in the list immediately */ -found_extant_peer: - usage = atomic_inc_return(&peer->usage); - read_unlock_bh(&rxrpc_peer_lock); - goto success; - - /* we found the peer on the second time through the list */ -found_extant_second: - usage = atomic_inc_return(&peer->usage); - write_unlock_bh(&rxrpc_peer_lock); - kfree(candidate); - goto success; -} - -/* - * find the peer associated with a packet - */ -struct rxrpc_peer *rxrpc_find_peer(struct rxrpc_local *local, - __be32 addr, __be16 port) -{ - struct rxrpc_peer *peer; - - _enter(""); - - /* search the peer list */ - read_lock_bh(&rxrpc_peer_lock); - - if (local->srx.transport.family == AF_INET && - local->srx.transport_type == SOCK_DGRAM - ) { - list_for_each_entry(peer, &rxrpc_peers, link) { - if (atomic_read(&peer->usage) > 0 && - peer->srx.transport_type == SOCK_DGRAM && - peer->srx.transport.family == AF_INET && - peer->srx.transport.sin.sin_port == port && - peer->srx.transport.sin.sin_addr.s_addr == addr) - goto found_UDP_peer; - } - - goto new_UDP_peer; - } - - read_unlock_bh(&rxrpc_peer_lock); - _leave(" = -EAFNOSUPPORT"); - return ERR_PTR(-EAFNOSUPPORT); - -found_UDP_peer: - _net("Rx UDP DGRAM from peer %d", peer->debug_id); - atomic_inc(&peer->usage); - read_unlock_bh(&rxrpc_peer_lock); - _leave(" = %p", peer); + _leave(" = %p {u=%d}", peer, atomic_read(&peer->usage)); return peer; - -new_UDP_peer: - _net("Rx UDP DGRAM from NEW peer"); - read_unlock_bh(&rxrpc_peer_lock); - _leave(" = -EBUSY [new]"); - return ERR_PTR(-EBUSY); } /* - * release a remote transport endpoint + * Discard a ref on a remote peer record. */ -void rxrpc_put_peer(struct rxrpc_peer *peer) +void __rxrpc_put_peer(struct rxrpc_peer *peer) { - _enter("%p{u=%d}", peer, atomic_read(&peer->usage)); + ASSERT(list_empty(&peer->error_targets)); - ASSERTCMP(atomic_read(&peer->usage), >, 0); - - if (likely(!atomic_dec_and_test(&peer->usage))) { - _leave(" [in use]"); - return; - } - - rxrpc_queue_work(&peer->destroyer); - _leave(""); -} - -/* - * destroy a remote transport endpoint - */ -static void rxrpc_destroy_peer(struct work_struct *work) -{ - struct rxrpc_peer *peer = - container_of(work, struct rxrpc_peer, destroyer); - - _enter("%p{%d}", peer, atomic_read(&peer->usage)); - - write_lock_bh(&rxrpc_peer_lock); - list_del(&peer->link); - write_unlock_bh(&rxrpc_peer_lock); - - _net("DESTROY PEER %d", peer->debug_id); - kfree(peer); - - if (list_empty(&rxrpc_peers)) - wake_up_all(&rxrpc_peer_wq); - _leave(""); -} - -/* - * preemptively destroy all the peer records from a transport endpoint rather - * than waiting for them to time out - */ -void __exit rxrpc_destroy_all_peers(void) -{ - DECLARE_WAITQUEUE(myself,current); - - _enter(""); - - /* we simply have to wait for them to go away */ - if (!list_empty(&rxrpc_peers)) { - set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&rxrpc_peer_wq, &myself); - - while (!list_empty(&rxrpc_peers)) { - schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); - } - - remove_wait_queue(&rxrpc_peer_wq, &myself); - set_current_state(TASK_RUNNING); - } + spin_lock(&rxrpc_peer_hash_lock); + hash_del_rcu(&peer->hash_link); + spin_unlock(&rxrpc_peer_hash_lock); - _leave(""); + kfree_rcu(peer, rcu); } diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c index a1b65183b07d..d33387dec0ce 100644 --- a/net/rxrpc/transport.c +++ b/net/rxrpc/transport.c @@ -121,7 +121,7 @@ struct rxrpc_transport *rxrpc_get_transport(struct rxrpc_local *local, usage = atomic_read(&trans->usage); rxrpc_get_local(trans->local); - atomic_inc(&trans->peer->usage); + rxrpc_get_peer(trans->peer); list_add_tail(&trans->link, &rxrpc_transports); write_unlock_bh(&rxrpc_transport_lock); new = "new"; diff --git a/net/rxrpc/utils.c b/net/rxrpc/utils.c new file mode 100644 index 000000000000..f28122a15a24 --- /dev/null +++ b/net/rxrpc/utils.c @@ -0,0 +1,41 @@ +/* Utility routines + * + * Copyright (C) 2015 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include "ar-internal.h" + +/* + * Set up an RxRPC address from a socket buffer. + */ +void rxrpc_get_addr_from_skb(struct rxrpc_local *local, + const struct sk_buff *skb, + struct sockaddr_rxrpc *srx) +{ + memset(srx, 0, sizeof(*srx)); + srx->transport_type = local->srx.transport_type; + srx->transport.family = local->srx.transport.family; + + /* Can we see an ipv4 UDP packet on an ipv6 UDP socket? and vice + * versa? + */ + switch (srx->transport.family) { + case AF_INET: + srx->transport.sin.sin_port = udp_hdr(skb)->source; + srx->transport_len = sizeof(struct sockaddr_in); + memcpy(&srx->transport.sin.sin_addr, &ip_hdr(skb)->saddr, + sizeof(struct in_addr)); + break; + + default: + BUG(); + } +} -- cgit v1.2.3 From abe89ef0ed1a50ef6186d9aee433b995641a1293 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 4 Apr 2016 14:00:32 +0100 Subject: rxrpc: Rename rxrpc_UDP_error_report() to rxrpc_error_report() Rename rxrpc_UDP_error_report() to rxrpc_error_report() as it might get called for something other than UDP. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 4 ++-- net/rxrpc/local_object.c | 2 +- net/rxrpc/peer_event.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 7dba6677b9d5..1e5c15632f49 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -606,9 +606,9 @@ int rxrpc_send_packet(struct rxrpc_transport *, struct sk_buff *); int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t); /* - * peer_error.c + * peer_event.c */ -void rxrpc_UDP_error_report(struct sock *); +void rxrpc_error_report(struct sock *); void rxrpc_UDP_error_handler(struct work_struct *); /* diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 111f250b045f..28f9efb3118f 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -120,7 +120,7 @@ static int rxrpc_create_local(struct rxrpc_local *local) sock = local->socket->sk; sock->sk_user_data = local; sock->sk_data_ready = rxrpc_data_ready; - sock->sk_error_report = rxrpc_UDP_error_report; + sock->sk_error_report = rxrpc_error_report; _leave(" = 0"); return 0; diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 24f5ec0fcd20..2c2df3a5d1b9 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -74,7 +74,7 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, /* * handle an error received on the local endpoint */ -void rxrpc_UDP_error_report(struct sock *sk) +void rxrpc_error_report(struct sock *sk) { struct sock_exterr_skb *serr; struct rxrpc_transport *trans; -- cgit v1.2.3 From 1a70c05bad1383fdda95e713baee5f76c4726d24 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 4 Apr 2016 14:00:33 +0100 Subject: rxrpc: Break MTU determination from ICMP into its own function Break MTU determination from ICMP out into its own function to reduce the complexity of the error report handler. Signed-off-by: David Howells --- net/rxrpc/peer_event.c | 93 +++++++++++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 2c2df3a5d1b9..80de84257227 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -71,6 +71,45 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, return rxrpc_lookup_peer_rcu(local, &srx); } +/* + * Handle an MTU/fragmentation problem. + */ +static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, struct sock_exterr_skb *serr) +{ + u32 mtu = serr->ee.ee_info; + + _net("Rx ICMP Fragmentation Needed (%d)", mtu); + + /* wind down the local interface MTU */ + if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu) { + peer->if_mtu = mtu; + _net("I/F MTU %u", mtu); + } + + if (mtu == 0) { + /* they didn't give us a size, estimate one */ + mtu = peer->if_mtu; + if (mtu > 1500) { + mtu >>= 1; + if (mtu < 1500) + mtu = 1500; + } else { + mtu -= 100; + if (mtu < peer->hdrsize) + mtu = peer->hdrsize + 4; + } + } + + if (mtu < peer->mtu) { + spin_lock_bh(&peer->lock); + peer->mtu = mtu; + peer->maxdata = peer->mtu - peer->hdrsize; + spin_unlock_bh(&peer->lock); + _net("Net MTU %u (maxdata %u)", + peer->mtu, peer->maxdata); + } +} + /* * handle an error received on the local endpoint */ @@ -126,50 +165,26 @@ void rxrpc_error_report(struct sock *sk) return; } - if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP && - serr->ee.ee_type == ICMP_DEST_UNREACH && - serr->ee.ee_code == ICMP_FRAG_NEEDED - ) { - u32 mtu = serr->ee.ee_info; - - _net("Rx Received ICMP Fragmentation Needed (%d)", mtu); - - /* wind down the local interface MTU */ - if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu) { - peer->if_mtu = mtu; - _net("I/F MTU %u", mtu); - } - - if (mtu == 0) { - /* they didn't give us a size, estimate one */ - mtu = peer->if_mtu; - if (mtu > 1500) { - mtu >>= 1; - if (mtu < 1500) - mtu = 1500; - } else { - mtu -= 100; - if (mtu < peer->hdrsize) - mtu = peer->hdrsize + 4; - } - } - - if (mtu < peer->mtu) { - spin_lock_bh(&peer->lock); - peer->mtu = mtu; - peer->maxdata = peer->mtu - peer->hdrsize; - spin_unlock_bh(&peer->lock); - _net("Net MTU %u (maxdata %u)", - peer->mtu, peer->maxdata); - } + if ((serr->ee.ee_origin == SO_EE_ORIGIN_ICMP && + serr->ee.ee_type == ICMP_DEST_UNREACH && + serr->ee.ee_code == ICMP_FRAG_NEEDED)) { + rxrpc_adjust_mtu(peer, serr); + rxrpc_free_skb(skb); + skb = NULL; + goto out; } +out: rcu_read_unlock(); rxrpc_put_peer(peer); - /* pass the transport ref to error_handler to release */ - skb_queue_tail(&trans->error_queue, skb); - rxrpc_queue_work(&trans->error_handler); + if (skb) { + /* pass the transport ref to error_handler to release */ + skb_queue_tail(&trans->error_queue, skb); + rxrpc_queue_work(&trans->error_handler); + } else { + rxrpc_put_transport(trans); + } _leave(""); } -- cgit v1.2.3 From 1c1df86fad68dd7188ea498e796c9d2ede679421 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 4 Apr 2016 14:00:33 +0100 Subject: rxrpc: Don't assume anything about the address in an ICMP packet Don't assume anything about the address in an ICMP packet in rxrpc_error_report() as the address may not be IPv4 in future, especially since we're just printing these details. Signed-off-by: David Howells --- net/rxrpc/peer_event.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'net') diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 80de84257227..6ba798d6659e 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -120,8 +120,6 @@ void rxrpc_error_report(struct sock *sk) struct rxrpc_local *local = sk->sk_user_data; struct rxrpc_peer *peer; struct sk_buff *skb; - __be32 addr; - __be16 port; _enter("%p{%d}", sk, local->debug_id); @@ -139,12 +137,6 @@ void rxrpc_error_report(struct sock *sk) rxrpc_new_skb(skb); - addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset); - port = serr->port; - - _net("Rx UDP Error from %pI4:%hu", &addr, ntohs(port)); - _debug("Msg l:%d d:%d", skb->len, skb->data_len); - rcu_read_lock(); peer = rxrpc_lookup_peer_icmp_rcu(local, skb); if (peer && !rxrpc_get_peer_maybe(peer)) -- cgit v1.2.3 From fe77d5fc5ab33bb088cf8448767a77fdc32e08d1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 4 Apr 2016 14:00:34 +0100 Subject: rxrpc: Do a little bit of tidying in the ICMP processing Do a little bit of tidying in the ICMP processing code. Signed-off-by: David Howells --- net/rxrpc/peer_event.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 6ba798d6659e..31c440acd8c9 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -245,15 +245,13 @@ void rxrpc_UDP_error_handler(struct work_struct *work) break; case SO_EE_ORIGIN_LOCAL: - _proto("Rx Received local error { error=%d }", - ee->ee_errno); + _proto("Rx Received local error { error=%d }", err); break; case SO_EE_ORIGIN_NONE: case SO_EE_ORIGIN_ICMP6: default: - _proto("Rx Received error report { orig=%u }", - ee->ee_origin); + _proto("Rx Received error report { orig=%u }", ee->ee_origin); break; } -- cgit v1.2.3 From f66d7490196055cb9fb058f8936d19111a6231b9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 4 Apr 2016 14:00:34 +0100 Subject: rxrpc: Use the peer record to distribute network errors Use the peer record to distribute network errors rather than the transport object (which I want to get rid of). An error from a particular peer terminates all calls on that peer. For future consideration: (1) For ICMP-induced errors it might be worth trying to extract the RxRPC header from the offending packet, if one is returned attached to the ICMP packet, to better direct the error. This may be overkill, though, since an ICMP packet would be expected to be relating to the destination port, machine or network. RxRPC ABORT and BUSY packets give notice at RxRPC level. (2) To also abort connection-level communications (such as CHALLENGE packets) where indicted by an error - but that requires some revamping of the connection event handling first. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 16 +++---- net/rxrpc/call_event.c | 15 +++++-- net/rxrpc/call_object.c | 6 +-- net/rxrpc/output.c | 4 +- net/rxrpc/peer_event.c | 109 ++++++++++++++++++++++-------------------------- net/rxrpc/peer_object.c | 6 ++- net/rxrpc/transport.c | 17 -------- 7 files changed, 79 insertions(+), 94 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 1e5c15632f49..a63bb7518fb5 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -189,7 +189,6 @@ struct rxrpc_local { rwlock_t services_lock; /* lock for services list */ atomic_t usage; int debug_id; /* debug ID for printks */ - volatile char error_rcvd; /* T if received ICMP error outstanding */ struct sockaddr_rxrpc srx; /* local address */ }; @@ -203,14 +202,16 @@ struct rxrpc_peer { unsigned long hash_key; struct hlist_node hash_link; struct rxrpc_local *local; - struct list_head error_targets; /* targets for net error distribution */ + struct hlist_head error_targets; /* targets for net error distribution */ + struct work_struct error_distributor; spinlock_t lock; /* access lock */ unsigned int if_mtu; /* interface MTU for this peer */ unsigned int mtu; /* network MTU for this peer */ unsigned int maxdata; /* data size (MTU - hdrsize) */ unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ int debug_id; /* debug ID for printks */ - int net_error; /* network error distributed */ + int error_report; /* Net (+0) or local (+1000000) to distribute */ +#define RXRPC_LOCAL_ERROR_OFFSET 1000000 struct sockaddr_rxrpc srx; /* remote address */ /* calculated RTT cache */ @@ -229,12 +230,10 @@ struct rxrpc_peer { struct rxrpc_transport { struct rxrpc_local *local; /* local transport endpoint */ struct rxrpc_peer *peer; /* remote transport endpoint */ - struct work_struct error_handler; /* network error distributor */ struct rb_root bundles; /* client connection bundles on this transport */ struct rb_root client_conns; /* client connections on this transport */ struct rb_root server_conns; /* server connections on this transport */ struct list_head link; /* link in master session list */ - struct sk_buff_head error_queue; /* error packets awaiting processing */ unsigned long put_time; /* time at which to reap */ spinlock_t client_lock; /* client connection allocation lock */ rwlock_t conn_lock; /* lock for active/dead connections */ @@ -393,7 +392,7 @@ struct rxrpc_call { struct work_struct destroyer; /* call destroyer */ struct work_struct processor; /* packet processor and ACK generator */ struct list_head link; /* link in master call list */ - struct list_head error_link; /* link in error distribution list */ + struct hlist_node error_link; /* link in error distribution list */ struct list_head accept_link; /* calls awaiting acceptance */ struct rb_node sock_node; /* node in socket call tree */ struct rb_node conn_node; /* node in connection call tree */ @@ -411,7 +410,8 @@ struct rxrpc_call { atomic_t sequence; /* Tx data packet sequence counter */ u32 local_abort; /* local abort code */ u32 remote_abort; /* remote abort code */ - int error; /* local error incurred */ + int error_report; /* Network error (ICMP/local transport) */ + int error; /* Local error incurred */ enum rxrpc_call_state state : 8; /* current state of call */ int debug_id; /* debug ID for printks */ u8 channel; /* connection channel occupied by this call */ @@ -609,7 +609,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t); * peer_event.c */ void rxrpc_error_report(struct sock *); -void rxrpc_UDP_error_handler(struct work_struct *); +void rxrpc_peer_error_distributor(struct work_struct *); /* * peer_object.c diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 18381783c2b1..e610b106c913 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -864,17 +864,24 @@ void rxrpc_process_call(struct work_struct *work) } if (test_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events)) { + enum rxrpc_skb_mark mark; int error; clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events); clear_bit(RXRPC_CALL_EV_ABORT, &call->events); - error = call->conn->trans->peer->net_error; - _debug("post net error %d", error); + error = call->error_report; + if (error < RXRPC_LOCAL_ERROR_OFFSET) { + mark = RXRPC_SKB_MARK_NET_ERROR; + _debug("post net error %d", error); + } else { + mark = RXRPC_SKB_MARK_LOCAL_ERROR; + error -= RXRPC_LOCAL_ERROR_OFFSET; + _debug("post net local error %d", error); + } - if (rxrpc_post_message(call, RXRPC_SKB_MARK_NET_ERROR, - error, true) < 0) + if (rxrpc_post_message(call, mark, error, true) < 0) goto no_mem; clear_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events); goto kill_ACKs; diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 68125dc4cb7c..8b4d47b3ccac 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -334,7 +334,7 @@ static struct rxrpc_call *rxrpc_alloc_client_call( rxrpc_call_hash_add(call); spin_lock(&call->conn->trans->peer->lock); - list_add(&call->error_link, &call->conn->trans->peer->error_targets); + hlist_add_head(&call->error_link, &call->conn->trans->peer->error_targets); spin_unlock(&call->conn->trans->peer->lock); call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime; @@ -516,7 +516,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, write_unlock_bh(&conn->lock); spin_lock(&conn->trans->peer->lock); - list_add(&call->error_link, &conn->trans->peer->error_targets); + hlist_add_head(&call->error_link, &conn->trans->peer->error_targets); spin_unlock(&conn->trans->peer->lock); write_lock_bh(&rxrpc_call_lock); @@ -812,7 +812,7 @@ static void rxrpc_cleanup_call(struct rxrpc_call *call) if (call->conn) { spin_lock(&call->conn->trans->peer->lock); - list_del(&call->error_link); + hlist_del_init(&call->error_link); spin_unlock(&call->conn->trans->peer->lock); write_lock_bh(&call->conn->lock); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 2e3c4064e29c..e6fb3863b0bc 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -707,7 +707,9 @@ out: call_aborted: rxrpc_free_skb(skb); if (call->state == RXRPC_CALL_NETWORK_ERROR) - ret = call->conn->trans->peer->net_error; + ret = call->error_report < RXRPC_LOCAL_ERROR_OFFSET ? + call->error_report : + call->error_report - RXRPC_LOCAL_ERROR_OFFSET; else ret = -ECONNABORTED; _leave(" = %d", ret); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 31c440acd8c9..8940674b5e08 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -1,4 +1,4 @@ -/* Error message handling (ICMP) +/* Peer event handling, typically ICMP messages. * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) @@ -22,6 +22,8 @@ #include #include "ar-internal.h" +static void rxrpc_store_error(struct rxrpc_peer *, struct sock_exterr_skb *); + /* * Find the peer associated with an ICMP packet. */ @@ -111,12 +113,11 @@ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, struct sock_exterr_skb *se } /* - * handle an error received on the local endpoint + * Handle an error received on the local endpoint. */ void rxrpc_error_report(struct sock *sk) { struct sock_exterr_skb *serr; - struct rxrpc_transport *trans; struct rxrpc_local *local = sk->sk_user_data; struct rxrpc_peer *peer; struct sk_buff *skb; @@ -148,57 +149,37 @@ void rxrpc_error_report(struct sock *sk) return; } - trans = rxrpc_find_transport(local, peer); - if (!trans) { - rcu_read_unlock(); - rxrpc_put_peer(peer); - rxrpc_free_skb(skb); - _leave(" [no trans]"); - return; - } - if ((serr->ee.ee_origin == SO_EE_ORIGIN_ICMP && serr->ee.ee_type == ICMP_DEST_UNREACH && serr->ee.ee_code == ICMP_FRAG_NEEDED)) { rxrpc_adjust_mtu(peer, serr); + rcu_read_unlock(); rxrpc_free_skb(skb); - skb = NULL; - goto out; + rxrpc_put_peer(peer); + _leave(" [MTU update]"); + return; } -out: + rxrpc_store_error(peer, serr); rcu_read_unlock(); - rxrpc_put_peer(peer); + rxrpc_free_skb(skb); - if (skb) { - /* pass the transport ref to error_handler to release */ - skb_queue_tail(&trans->error_queue, skb); - rxrpc_queue_work(&trans->error_handler); - } else { - rxrpc_put_transport(trans); - } + /* The ref we obtained is passed off to the work item */ + rxrpc_queue_work(&peer->error_distributor); _leave(""); } /* - * deal with UDP error messages + * Map an error report to error codes on the peer record. */ -void rxrpc_UDP_error_handler(struct work_struct *work) +static void rxrpc_store_error(struct rxrpc_peer *peer, + struct sock_exterr_skb *serr) { struct sock_extended_err *ee; - struct sock_exterr_skb *serr; - struct rxrpc_transport *trans = - container_of(work, struct rxrpc_transport, error_handler); - struct sk_buff *skb; int err; _enter(""); - skb = skb_dequeue(&trans->error_queue); - if (!skb) - return; - - serr = SKB_EXT_ERR(skb); ee = &serr->ee; _net("Rx Error o=%d t=%d c=%d e=%d", @@ -244,47 +225,57 @@ void rxrpc_UDP_error_handler(struct work_struct *work) } break; + case SO_EE_ORIGIN_NONE: case SO_EE_ORIGIN_LOCAL: _proto("Rx Received local error { error=%d }", err); + err += RXRPC_LOCAL_ERROR_OFFSET; break; - case SO_EE_ORIGIN_NONE: case SO_EE_ORIGIN_ICMP6: default: _proto("Rx Received error report { orig=%u }", ee->ee_origin); break; } - /* terminate all the affected calls if there's an unrecoverable - * error */ - if (err) { - struct rxrpc_call *call, *_n; + peer->error_report = err; +} + +/* + * Distribute an error that occurred on a peer + */ +void rxrpc_peer_error_distributor(struct work_struct *work) +{ + struct rxrpc_peer *peer = + container_of(work, struct rxrpc_peer, error_distributor); + struct rxrpc_call *call; + int error_report; + + _enter(""); - _debug("ISSUE ERROR %d", err); + error_report = READ_ONCE(peer->error_report); - spin_lock_bh(&trans->peer->lock); - trans->peer->net_error = err; + _debug("ISSUE ERROR %d", error_report); - list_for_each_entry_safe(call, _n, &trans->peer->error_targets, - error_link) { - write_lock(&call->state_lock); - if (call->state != RXRPC_CALL_COMPLETE && - call->state < RXRPC_CALL_NETWORK_ERROR) { - call->state = RXRPC_CALL_NETWORK_ERROR; - set_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events); - rxrpc_queue_call(call); - } - write_unlock(&call->state_lock); - list_del_init(&call->error_link); - } + spin_lock_bh(&peer->lock); - spin_unlock_bh(&trans->peer->lock); + while (!hlist_empty(&peer->error_targets)) { + call = hlist_entry(peer->error_targets.first, + struct rxrpc_call, error_link); + hlist_del_init(&call->error_link); + + write_lock(&call->state_lock); + if (call->state != RXRPC_CALL_COMPLETE && + call->state < RXRPC_CALL_NETWORK_ERROR) { + call->error_report = error_report; + call->state = RXRPC_CALL_NETWORK_ERROR; + set_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events); + rxrpc_queue_call(call); + } + write_unlock(&call->state_lock); } - if (!skb_queue_empty(&trans->error_queue)) - rxrpc_queue_work(&trans->error_handler); + spin_unlock_bh(&peer->lock); - rxrpc_free_skb(skb); - rxrpc_put_transport(trans); + rxrpc_put_peer(peer); _leave(""); } diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 7fc50dc7d333..faf222c21698 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -182,7 +182,9 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) if (peer) { atomic_set(&peer->usage, 1); peer->local = local; - INIT_LIST_HEAD(&peer->error_targets); + INIT_HLIST_HEAD(&peer->error_targets); + INIT_WORK(&peer->error_distributor, + &rxrpc_peer_error_distributor); spin_lock_init(&peer->lock); peer->debug_id = atomic_inc_return(&rxrpc_debug_id); } @@ -298,7 +300,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, */ void __rxrpc_put_peer(struct rxrpc_peer *peer) { - ASSERT(list_empty(&peer->error_targets)); + ASSERT(hlist_empty(&peer->error_targets)); spin_lock(&rxrpc_peer_hash_lock); hash_del_rcu(&peer->hash_link); diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c index d33387dec0ce..24c71218a6f8 100644 --- a/net/rxrpc/transport.c +++ b/net/rxrpc/transport.c @@ -49,26 +49,11 @@ static struct rxrpc_transport *rxrpc_alloc_transport(struct rxrpc_local *local, trans->bundles = RB_ROOT; trans->client_conns = RB_ROOT; trans->server_conns = RB_ROOT; - skb_queue_head_init(&trans->error_queue); spin_lock_init(&trans->client_lock); rwlock_init(&trans->conn_lock); atomic_set(&trans->usage, 1); trans->conn_idcounter = peer->srx.srx_service << 16; trans->debug_id = atomic_inc_return(&rxrpc_debug_id); - - if (peer->srx.transport.family == AF_INET) { - switch (peer->srx.transport_type) { - case SOCK_DGRAM: - INIT_WORK(&trans->error_handler, - rxrpc_UDP_error_handler); - break; - default: - BUG(); - break; - } - } else { - BUG(); - } } _leave(" = %p", trans); @@ -210,8 +195,6 @@ static void rxrpc_cleanup_transport(struct rxrpc_transport *trans) { _net("DESTROY TRANS %d", trans->debug_id); - rxrpc_purge_queue(&trans->error_queue); - rxrpc_put_local(trans->local); rxrpc_put_peer(trans->peer); kfree(trans); -- cgit v1.2.3 From 875636163b4e694c092625ed98b17e10d582b3ca Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 4 Apr 2016 14:00:34 +0100 Subject: rxrpc: Separate local endpoint event handling out into its own file Separate local endpoint event handling out into its own file preparatory to overhauling the object management aspect (which remains in the original file). Signed-off-by: David Howells --- net/rxrpc/Makefile | 1 + net/rxrpc/ar-internal.h | 5 ++ net/rxrpc/local_event.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++ net/rxrpc/local_object.c | 105 ++--------------------------------------- 4 files changed, 129 insertions(+), 102 deletions(-) create mode 100644 net/rxrpc/local_event.c (limited to 'net') diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index a6f6f21d8a59..b005027f80cf 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -12,6 +12,7 @@ af-rxrpc-y := \ input.o \ insecure.o \ key.o \ + local_event.o \ local_object.o \ misc.o \ output.o \ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index a63bb7518fb5..fa50b09eaa63 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -572,6 +572,11 @@ int rxrpc_server_keyring(struct rxrpc_sock *, char __user *, int); int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time_t, u32); +/* + * local_event.c + */ +extern void rxrpc_process_local_events(struct work_struct *); + /* * local_object.c */ diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c new file mode 100644 index 000000000000..194db2e6d548 --- /dev/null +++ b/net/rxrpc/local_event.c @@ -0,0 +1,120 @@ +/* AF_RXRPC local endpoint management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +static const char rxrpc_version_string[65] = "linux-" UTS_RELEASE " AF_RXRPC"; + +/* + * Reply to a version request + */ +static void rxrpc_send_version_request(struct rxrpc_local *local, + struct rxrpc_host_header *hdr, + struct sk_buff *skb) +{ + struct rxrpc_wire_header whdr; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct sockaddr_in sin; + struct msghdr msg; + struct kvec iov[2]; + size_t len; + int ret; + + _enter(""); + + sin.sin_family = AF_INET; + sin.sin_port = udp_hdr(skb)->source; + sin.sin_addr.s_addr = ip_hdr(skb)->saddr; + + msg.msg_name = &sin; + msg.msg_namelen = sizeof(sin); + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + whdr.epoch = htonl(sp->hdr.epoch); + whdr.cid = htonl(sp->hdr.cid); + whdr.callNumber = htonl(sp->hdr.callNumber); + whdr.seq = 0; + whdr.serial = 0; + whdr.type = RXRPC_PACKET_TYPE_VERSION; + whdr.flags = RXRPC_LAST_PACKET | (~hdr->flags & RXRPC_CLIENT_INITIATED); + whdr.userStatus = 0; + whdr.securityIndex = 0; + whdr._rsvd = 0; + whdr.serviceId = htons(sp->hdr.serviceId); + + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); + iov[1].iov_base = (char *)rxrpc_version_string; + iov[1].iov_len = sizeof(rxrpc_version_string); + + len = iov[0].iov_len + iov[1].iov_len; + + _proto("Tx VERSION (reply)"); + + ret = kernel_sendmsg(local->socket, &msg, iov, 2, len); + if (ret < 0) + _debug("sendmsg failed: %d", ret); + + _leave(""); +} + +/* + * Process event packets targetted at a local endpoint. + */ +void rxrpc_process_local_events(struct work_struct *work) +{ + struct rxrpc_local *local = container_of(work, struct rxrpc_local, event_processor); + struct sk_buff *skb; + char v; + + _enter(""); + + atomic_inc(&local->usage); + + while ((skb = skb_dequeue(&local->event_queue))) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + _debug("{%d},{%u}", local->debug_id, sp->hdr.type); + + switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_VERSION: + if (skb_copy_bits(skb, 0, &v, 1) < 0) + return; + _proto("Rx VERSION { %02x }", v); + if (v == 0) + rxrpc_send_version_request(local, &sp->hdr, skb); + break; + + default: + /* Just ignore anything we don't understand */ + break; + } + + rxrpc_put_local(local); + rxrpc_free_skb(skb); + } + + rxrpc_put_local(local); + _leave(""); +} diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 28f9efb3118f..c1b8d745bf5e 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -1,12 +1,12 @@ -/* AF_RXRPC local endpoint management +/* Local endpoint object management * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License + * modify it under the terms of the GNU General Public Licence * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. + * 2 of the Licence, or (at your option) any later version. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -19,18 +19,14 @@ #include #include #include -#include #include "ar-internal.h" -static const char rxrpc_version_string[65] = "linux-" UTS_RELEASE " AF_RXRPC"; - static LIST_HEAD(rxrpc_locals); DEFINE_RWLOCK(rxrpc_local_lock); static DECLARE_RWSEM(rxrpc_local_sem); static DECLARE_WAIT_QUEUE_HEAD(rxrpc_local_wq); static void rxrpc_destroy_local(struct work_struct *work); -static void rxrpc_process_local_events(struct work_struct *work); /* * allocate a new local @@ -320,98 +316,3 @@ void __exit rxrpc_destroy_all_locals(void) _leave(""); } - -/* - * Reply to a version request - */ -static void rxrpc_send_version_request(struct rxrpc_local *local, - struct rxrpc_host_header *hdr, - struct sk_buff *skb) -{ - struct rxrpc_wire_header whdr; - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct sockaddr_in sin; - struct msghdr msg; - struct kvec iov[2]; - size_t len; - int ret; - - _enter(""); - - sin.sin_family = AF_INET; - sin.sin_port = udp_hdr(skb)->source; - sin.sin_addr.s_addr = ip_hdr(skb)->saddr; - - msg.msg_name = &sin; - msg.msg_namelen = sizeof(sin); - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; - - whdr.epoch = htonl(sp->hdr.epoch); - whdr.cid = htonl(sp->hdr.cid); - whdr.callNumber = htonl(sp->hdr.callNumber); - whdr.seq = 0; - whdr.serial = 0; - whdr.type = RXRPC_PACKET_TYPE_VERSION; - whdr.flags = RXRPC_LAST_PACKET | (~hdr->flags & RXRPC_CLIENT_INITIATED); - whdr.userStatus = 0; - whdr.securityIndex = 0; - whdr._rsvd = 0; - whdr.serviceId = htons(sp->hdr.serviceId); - - iov[0].iov_base = &whdr; - iov[0].iov_len = sizeof(whdr); - iov[1].iov_base = (char *)rxrpc_version_string; - iov[1].iov_len = sizeof(rxrpc_version_string); - - len = iov[0].iov_len + iov[1].iov_len; - - _proto("Tx VERSION (reply)"); - - ret = kernel_sendmsg(local->socket, &msg, iov, 2, len); - if (ret < 0) - _debug("sendmsg failed: %d", ret); - - _leave(""); -} - -/* - * Process event packets targetted at a local endpoint. - */ -static void rxrpc_process_local_events(struct work_struct *work) -{ - struct rxrpc_local *local = container_of(work, struct rxrpc_local, event_processor); - struct sk_buff *skb; - char v; - - _enter(""); - - atomic_inc(&local->usage); - - while ((skb = skb_dequeue(&local->event_queue))) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - _debug("{%d},{%u}", local->debug_id, sp->hdr.type); - - switch (sp->hdr.type) { - case RXRPC_PACKET_TYPE_VERSION: - if (skb_copy_bits(skb, 0, &v, 1) < 0) - return; - _proto("Rx VERSION { %02x }", v); - if (v == 0) - rxrpc_send_version_request(local, &sp->hdr, skb); - break; - - default: - /* Just ignore anything we don't understand */ - break; - } - - rxrpc_put_local(local); - rxrpc_free_skb(skb); - } - - rxrpc_put_local(local); - _leave(""); -} -- cgit v1.2.3 From 4f95dd78a77edc42454de55bb32332be293fb461 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 4 Apr 2016 14:00:35 +0100 Subject: rxrpc: Rework local endpoint management Rework the local RxRPC endpoint management. Local endpoint objects are maintained in a flat list as before. This should be okay as there shouldn't be more than one per open AF_RXRPC socket (there can be fewer as local endpoints can be shared if their local service ID is 0 and they share the same local transport parameters). Changes: (1) Local endpoints may now only be shared if they have local service ID 0 (ie. they're not being used for listening). This prevents a scenario where process A is listening of the Cache Manager port and process B contacts a fileserver - which may then attempt to send CM requests back to B. But if A and B are sharing a local endpoint, A will get the CM requests meant for B. (2) We use a mutex to handle lookups and don't provide RCU-only lookups since we only expect to access the list when opening a socket or destroying an endpoint. The local endpoint object is pointed to by the transport socket's sk_user_data for the life of the transport socket - allowing us to refer to it directly from the sk_data_ready and sk_error_report callbacks. (3) atomic_inc_not_zero() now exists and can be used to only share a local endpoint if the last reference hasn't yet gone. (4) We can remove rxrpc_local_lock - a spinlock that had to be taken with BH processing disabled given that we assume sk_user_data won't change under us. (5) The transport socket is shut down before we clear the sk_user_data pointer so that we can be sure that the transport socket's callbacks won't be invoked once the RCU destruction is scheduled. (6) Local endpoints have a work item that handles both destruction and event processing. The means that destruction doesn't then need to wait for event processing. The event queues can then be cleared after the transport socket is shut down. (7) Local endpoints are no longer available for resurrection beyond the life of the sockets that had them open. As soon as their last ref goes, they are scheduled for destruction and may not have their usage count moved from 0. Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 19 ++- net/rxrpc/ar-internal.h | 55 ++++---- net/rxrpc/call_accept.c | 25 +--- net/rxrpc/conn_event.c | 15 +- net/rxrpc/input.c | 29 +--- net/rxrpc/local_event.c | 10 +- net/rxrpc/local_object.c | 353 ++++++++++++++++++++++++++++------------------- 7 files changed, 276 insertions(+), 230 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index ba373caddbeb..c83c3c75d665 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -102,6 +102,8 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx, switch (srx->transport.family) { case AF_INET: + if (srx->transport_len < sizeof(struct sockaddr_in)) + return -EINVAL; _debug("INET: %x @ %pI4", ntohs(srx->transport.sin.sin_port), &srx->transport.sin.sin_addr); @@ -835,12 +837,27 @@ static void __exit af_rxrpc_exit(void) rxrpc_destroy_all_calls(); rxrpc_destroy_all_connections(); rxrpc_destroy_all_transports(); - rxrpc_destroy_all_locals(); ASSERTCMP(atomic_read(&rxrpc_n_skbs), ==, 0); + /* We need to flush the scheduled work twice because the local endpoint + * records involve a work item in their destruction as they can only be + * destroyed from process context. However, a connection may have a + * work item outstanding - and this will pin the local endpoint record + * until the connection goes away. + * + * Peers don't pin locals and calls pin sockets - which prevents the + * module from being unloaded - so we should only need two flushes. + */ _debug("flush scheduled work"); flush_workqueue(rxrpc_workqueue); + _debug("flush scheduled work 2"); + flush_workqueue(rxrpc_workqueue); + _debug("synchronise RCU"); + rcu_barrier(); + _debug("destroy locals"); + rxrpc_destroy_all_locals(); + remove_proc_entry("rxrpc_conns", init_net.proc_net); remove_proc_entry("rxrpc_calls", init_net.proc_net); destroy_workqueue(rxrpc_workqueue); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index fa50b09eaa63..c168268467cd 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -170,25 +170,26 @@ struct rxrpc_security { }; /* - * RxRPC local transport endpoint definition - * - matched by local port, address and protocol type + * RxRPC local transport endpoint description + * - owned by a single AF_RXRPC socket + * - pointed to by transport socket struct sk_user_data */ struct rxrpc_local { + struct rcu_head rcu; + atomic_t usage; + struct list_head link; struct socket *socket; /* my UDP socket */ - struct work_struct destroyer; /* endpoint destroyer */ - struct work_struct acceptor; /* incoming call processor */ - struct work_struct rejecter; /* packet reject writer */ - struct work_struct event_processor; /* endpoint event processor */ + struct work_struct processor; struct list_head services; /* services listening on this endpoint */ - struct list_head link; /* link in endpoint list */ struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ struct sk_buff_head accept_queue; /* incoming calls awaiting acceptance */ struct sk_buff_head reject_queue; /* packets awaiting rejection */ struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */ + struct mutex conn_lock; /* Client connection creation lock */ spinlock_t lock; /* access lock */ rwlock_t services_lock; /* lock for services list */ - atomic_t usage; int debug_id; /* debug ID for printks */ + bool dead; struct sockaddr_rxrpc srx; /* local address */ }; @@ -487,7 +488,7 @@ extern struct rxrpc_transport *rxrpc_name_to_transport(struct rxrpc_sock *, /* * call_accept.c */ -void rxrpc_accept_incoming_calls(struct work_struct *); +void rxrpc_accept_incoming_calls(struct rxrpc_local *); struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, unsigned long); int rxrpc_reject_call(struct rxrpc_sock *); @@ -527,7 +528,7 @@ void __exit rxrpc_destroy_all_calls(void); */ void rxrpc_process_connection(struct work_struct *); void rxrpc_reject_packet(struct rxrpc_local *, struct sk_buff *); -void rxrpc_reject_packets(struct work_struct *); +void rxrpc_reject_packets(struct rxrpc_local *); /* * conn_object.c @@ -575,17 +576,32 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time_t, /* * local_event.c */ -extern void rxrpc_process_local_events(struct work_struct *); +extern void rxrpc_process_local_events(struct rxrpc_local *); /* * local_object.c */ -extern rwlock_t rxrpc_local_lock; - -struct rxrpc_local *rxrpc_lookup_local(struct sockaddr_rxrpc *); -void rxrpc_put_local(struct rxrpc_local *); +struct rxrpc_local *rxrpc_lookup_local(const struct sockaddr_rxrpc *); +void __rxrpc_put_local(struct rxrpc_local *); void __exit rxrpc_destroy_all_locals(void); +static inline void rxrpc_get_local(struct rxrpc_local *local) +{ + atomic_inc(&local->usage); +} + +static inline +struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local) +{ + return atomic_inc_not_zero(&local->usage) ? local : NULL; +} + +static inline void rxrpc_put_local(struct rxrpc_local *local) +{ + if (atomic_dec_and_test(&local->usage)) + __rxrpc_put_local(local); +} + /* * misc.c */ @@ -874,15 +890,6 @@ static inline void rxrpc_purge_queue(struct sk_buff_head *list) rxrpc_free_skb(skb); } -static inline void __rxrpc_get_local(struct rxrpc_local *local, const char *f) -{ - CHECK_SLAB_OKAY(&local->usage); - if (atomic_inc_return(&local->usage) == 1) - printk("resurrected (%s)\n", f); -} - -#define rxrpc_get_local(LOCAL) __rxrpc_get_local((LOCAL), __func__) - #define rxrpc_get_call(CALL) \ do { \ CHECK_SLAB_OKAY(&(CALL)->usage); \ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index e5723f4dce89..50136c76ebd1 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -202,10 +202,8 @@ error_nofree: * accept incoming calls that need peer, transport and/or connection setting up * - the packets we get are all incoming client DATA packets that have seq == 1 */ -void rxrpc_accept_incoming_calls(struct work_struct *work) +void rxrpc_accept_incoming_calls(struct rxrpc_local *local) { - struct rxrpc_local *local = - container_of(work, struct rxrpc_local, acceptor); struct rxrpc_skb_priv *sp; struct sockaddr_rxrpc srx; struct rxrpc_sock *rx; @@ -215,21 +213,8 @@ void rxrpc_accept_incoming_calls(struct work_struct *work) _enter("%d", local->debug_id); - read_lock_bh(&rxrpc_local_lock); - if (atomic_read(&local->usage) > 0) - rxrpc_get_local(local); - else - local = NULL; - read_unlock_bh(&rxrpc_local_lock); - if (!local) { - _leave(" [local dead]"); - return; - } - -process_next_packet: skb = skb_dequeue(&local->accept_queue); if (!skb) { - rxrpc_put_local(local); _leave("\n"); return; } @@ -292,7 +277,7 @@ found_service: case -ECONNRESET: /* old calls are ignored */ case -ECONNABORTED: /* aborted calls are reaborted or ignored */ case 0: - goto process_next_packet; + return; case -ECONNREFUSED: goto invalid_service; case -EBUSY: @@ -308,18 +293,18 @@ backlog_full: busy: rxrpc_busy(local, &srx, &whdr); rxrpc_free_skb(skb); - goto process_next_packet; + return; invalid_service: skb->priority = RX_INVALID_OPERATION; rxrpc_reject_packet(local, skb); - goto process_next_packet; + return; /* can't change connection security type mid-flow */ security_mismatch: skb->priority = RX_PROTOCOL_ERROR; rxrpc_reject_packet(local, skb); - goto process_next_packet; + return; } /* diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 8bdd692d4862..00c92b614485 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -314,19 +314,14 @@ void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) { CHECK_SLAB_OKAY(&local->usage); - if (!atomic_inc_not_zero(&local->usage)) { - printk("resurrected on reject\n"); - BUG(); - } - skb_queue_tail(&local->reject_queue, skb); - rxrpc_queue_work(&local->rejecter); + rxrpc_queue_work(&local->processor); } /* * reject packets through the local endpoint */ -void rxrpc_reject_packets(struct work_struct *work) +void rxrpc_reject_packets(struct rxrpc_local *local) { union { struct sockaddr sa; @@ -334,16 +329,12 @@ void rxrpc_reject_packets(struct work_struct *work) } sa; struct rxrpc_skb_priv *sp; struct rxrpc_wire_header whdr; - struct rxrpc_local *local; struct sk_buff *skb; struct msghdr msg; struct kvec iov[2]; size_t size; __be32 code; - local = container_of(work, struct rxrpc_local, rejecter); - rxrpc_get_local(local); - _enter("%d", local->debug_id); iov[0].iov_base = &whdr; @@ -395,9 +386,7 @@ void rxrpc_reject_packets(struct work_struct *work) } rxrpc_free_skb(skb); - rxrpc_put_local(local); } - rxrpc_put_local(local); _leave(""); } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 3b405dbf3a05..47fb167af3e4 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -594,9 +594,8 @@ static void rxrpc_post_packet_to_local(struct rxrpc_local *local, { _enter("%p,%p", local, skb); - atomic_inc(&local->usage); skb_queue_tail(&local->event_queue, skb); - rxrpc_queue_work(&local->event_processor); + rxrpc_queue_work(&local->processor); } /* @@ -664,11 +663,15 @@ cant_find_conn: /* * handle data received on the local endpoint * - may be called in interrupt context + * + * The socket is locked by the caller and this prevents the socket from being + * shut down and the local endpoint from going away, thus sk_user_data will not + * be cleared until this function returns. */ void rxrpc_data_ready(struct sock *sk) { struct rxrpc_skb_priv *sp; - struct rxrpc_local *local; + struct rxrpc_local *local = sk->sk_user_data; struct sk_buff *skb; int ret; @@ -676,21 +679,8 @@ void rxrpc_data_ready(struct sock *sk) ASSERT(!irqs_disabled()); - read_lock_bh(&rxrpc_local_lock); - local = sk->sk_user_data; - if (local && atomic_read(&local->usage) > 0) - rxrpc_get_local(local); - else - local = NULL; - read_unlock_bh(&rxrpc_local_lock); - if (!local) { - _leave(" [local dead]"); - return; - } - skb = skb_recv_datagram(sk, 0, 1, &ret); if (!skb) { - rxrpc_put_local(local); if (ret == -EAGAIN) return; _debug("UDP socket error %d", ret); @@ -704,7 +694,6 @@ void rxrpc_data_ready(struct sock *sk) /* we'll probably need to checksum it (didn't call sock_recvmsg) */ if (skb_checksum_complete(skb)) { rxrpc_free_skb(skb); - rxrpc_put_local(local); __UDP_INC_STATS(&init_net, UDP_MIB_INERRORS, 0); _leave(" [CSUM failed]"); return; @@ -769,7 +758,6 @@ void rxrpc_data_ready(struct sock *sk) } out: - rxrpc_put_local(local); return; cant_route_call: @@ -779,8 +767,7 @@ cant_route_call: if (sp->hdr.seq == 1) { _debug("first packet"); skb_queue_tail(&local->accept_queue, skb); - rxrpc_queue_work(&local->acceptor); - rxrpc_put_local(local); + rxrpc_queue_work(&local->processor); _leave(" [incoming]"); return; } @@ -793,13 +780,11 @@ cant_route_call: _debug("reject type %d",sp->hdr.type); rxrpc_reject_packet(local, skb); } - rxrpc_put_local(local); _leave(" [no call]"); return; bad_message: skb->priority = RX_PROTOCOL_ERROR; rxrpc_reject_packet(local, skb); - rxrpc_put_local(local); _leave(" [badmsg]"); } diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c index 194db2e6d548..31a3f86ef2f6 100644 --- a/net/rxrpc/local_event.c +++ b/net/rxrpc/local_event.c @@ -82,17 +82,15 @@ static void rxrpc_send_version_request(struct rxrpc_local *local, /* * Process event packets targetted at a local endpoint. */ -void rxrpc_process_local_events(struct work_struct *work) +void rxrpc_process_local_events(struct rxrpc_local *local) { - struct rxrpc_local *local = container_of(work, struct rxrpc_local, event_processor); struct sk_buff *skb; char v; _enter(""); - atomic_inc(&local->usage); - - while ((skb = skb_dequeue(&local->event_queue))) { + skb = skb_dequeue(&local->event_queue); + if (skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); _debug("{%d},{%u}", local->debug_id, sp->hdr.type); @@ -111,10 +109,8 @@ void rxrpc_process_local_events(struct work_struct *work) break; } - rxrpc_put_local(local); rxrpc_free_skb(skb); } - rxrpc_put_local(local); _leave(""); } diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index c1b8d745bf5e..009b321712bc 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -1,6 +1,6 @@ /* Local endpoint object management * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -17,40 +17,72 @@ #include #include #include +#include #include #include #include "ar-internal.h" -static LIST_HEAD(rxrpc_locals); -DEFINE_RWLOCK(rxrpc_local_lock); -static DECLARE_RWSEM(rxrpc_local_sem); -static DECLARE_WAIT_QUEUE_HEAD(rxrpc_local_wq); +static void rxrpc_local_processor(struct work_struct *); +static void rxrpc_local_rcu(struct rcu_head *); -static void rxrpc_destroy_local(struct work_struct *work); +static DEFINE_MUTEX(rxrpc_local_mutex); +static LIST_HEAD(rxrpc_local_endpoints); /* - * allocate a new local + * Compare a local to an address. Return -ve, 0 or +ve to indicate less than, + * same or greater than. + * + * We explicitly don't compare the RxRPC service ID as we want to reject + * conflicting uses by differing services. Further, we don't want to share + * addresses with different options (IPv6), so we don't compare those bits + * either. */ -static -struct rxrpc_local *rxrpc_alloc_local(struct sockaddr_rxrpc *srx) +static long rxrpc_local_cmp_key(const struct rxrpc_local *local, + const struct sockaddr_rxrpc *srx) +{ + long diff; + + diff = ((local->srx.transport_type - srx->transport_type) ?: + (local->srx.transport_len - srx->transport_len) ?: + (local->srx.transport.family - srx->transport.family)); + if (diff != 0) + return diff; + + switch (srx->transport.family) { + case AF_INET: + /* If the choice of UDP port is left up to the transport, then + * the endpoint record doesn't match. + */ + return ((u16 __force)local->srx.transport.sin.sin_port - + (u16 __force)srx->transport.sin.sin_port) ?: + memcmp(&local->srx.transport.sin.sin_addr, + &srx->transport.sin.sin_addr, + sizeof(struct in_addr)); + default: + BUG(); + } +} + +/* + * Allocate a new local endpoint. + */ +static struct rxrpc_local *rxrpc_alloc_local(const struct sockaddr_rxrpc *srx) { struct rxrpc_local *local; local = kzalloc(sizeof(struct rxrpc_local), GFP_KERNEL); if (local) { - INIT_WORK(&local->destroyer, &rxrpc_destroy_local); - INIT_WORK(&local->acceptor, &rxrpc_accept_incoming_calls); - INIT_WORK(&local->rejecter, &rxrpc_reject_packets); - INIT_WORK(&local->event_processor, &rxrpc_process_local_events); - INIT_LIST_HEAD(&local->services); + atomic_set(&local->usage, 1); INIT_LIST_HEAD(&local->link); + INIT_WORK(&local->processor, rxrpc_local_processor); + INIT_LIST_HEAD(&local->services); init_rwsem(&local->defrag_sem); skb_queue_head_init(&local->accept_queue); skb_queue_head_init(&local->reject_queue); skb_queue_head_init(&local->event_queue); + mutex_init(&local->conn_lock); spin_lock_init(&local->lock); rwlock_init(&local->services_lock); - atomic_set(&local->usage, 1); local->debug_id = atomic_inc_return(&rxrpc_debug_id); memcpy(&local->srx, srx, sizeof(*srx)); } @@ -61,9 +93,9 @@ struct rxrpc_local *rxrpc_alloc_local(struct sockaddr_rxrpc *srx) /* * create the local socket - * - must be called with rxrpc_local_sem writelocked + * - must be called with rxrpc_local_mutex locked */ -static int rxrpc_create_local(struct rxrpc_local *local) +static int rxrpc_open_socket(struct rxrpc_local *local) { struct sock *sock; int ret, opt; @@ -82,10 +114,10 @@ static int rxrpc_create_local(struct rxrpc_local *local) if (local->srx.transport_len > sizeof(sa_family_t)) { _debug("bind"); ret = kernel_bind(local->socket, - (struct sockaddr *) &local->srx.transport, + (struct sockaddr *)&local->srx.transport, local->srx.transport_len); if (ret < 0) { - _debug("bind failed"); + _debug("bind failed %d", ret); goto error; } } @@ -108,10 +140,6 @@ static int rxrpc_create_local(struct rxrpc_local *local) goto error; } - write_lock_bh(&rxrpc_local_lock); - list_add(&local->link, &rxrpc_locals); - write_unlock_bh(&rxrpc_local_lock); - /* set the socket up */ sock = local->socket->sk; sock->sk_user_data = local; @@ -131,188 +159,227 @@ error: } /* - * create a new local endpoint using the specified UDP address + * Look up or create a new local endpoint using the specified local address. */ -struct rxrpc_local *rxrpc_lookup_local(struct sockaddr_rxrpc *srx) +struct rxrpc_local *rxrpc_lookup_local(const struct sockaddr_rxrpc *srx) { struct rxrpc_local *local; + struct list_head *cursor; + const char *age; + long diff; int ret; - _enter("{%d,%u,%pI4+%hu}", - srx->transport_type, - srx->transport.family, - &srx->transport.sin.sin_addr, - ntohs(srx->transport.sin.sin_port)); - - down_write(&rxrpc_local_sem); + if (srx->transport.family == AF_INET) { + _enter("{%d,%u,%pI4+%hu}", + srx->transport_type, + srx->transport.family, + &srx->transport.sin.sin_addr, + ntohs(srx->transport.sin.sin_port)); + } else { + _enter("{%d,%u}", + srx->transport_type, + srx->transport.family); + return ERR_PTR(-EAFNOSUPPORT); + } - /* see if we have a suitable local local endpoint already */ - read_lock_bh(&rxrpc_local_lock); + mutex_lock(&rxrpc_local_mutex); - list_for_each_entry(local, &rxrpc_locals, link) { - _debug("CMP {%d,%u,%pI4+%hu}", - local->srx.transport_type, - local->srx.transport.family, - &local->srx.transport.sin.sin_addr, - ntohs(local->srx.transport.sin.sin_port)); + for (cursor = rxrpc_local_endpoints.next; + cursor != &rxrpc_local_endpoints; + cursor = cursor->next) { + local = list_entry(cursor, struct rxrpc_local, link); - if (local->srx.transport_type != srx->transport_type || - local->srx.transport.family != srx->transport.family) + diff = rxrpc_local_cmp_key(local, srx); + if (diff < 0) continue; + if (diff > 0) + break; + + /* Services aren't allowed to share transport sockets, so + * reject that here. It is possible that the object is dying - + * but it may also still have the local transport address that + * we want bound. + */ + if (srx->srx_service) { + local = NULL; + goto addr_in_use; + } - switch (srx->transport.family) { - case AF_INET: - if (local->srx.transport.sin.sin_port != - srx->transport.sin.sin_port) - continue; - if (memcmp(&local->srx.transport.sin.sin_addr, - &srx->transport.sin.sin_addr, - sizeof(struct in_addr)) != 0) - continue; - goto found_local; - - default: - BUG(); + /* Found a match. We replace a dying object. Attempting to + * bind the transport socket may still fail if we're attempting + * to use a local address that the dying object is still using. + */ + if (!atomic_inc_not_zero(&local->usage)) { + cursor = cursor->next; + list_del_init(&local->link); + break; } - } - read_unlock_bh(&rxrpc_local_lock); + age = "old"; + goto found; + } - /* we didn't find one, so we need to create one */ local = rxrpc_alloc_local(srx); - if (!local) { - up_write(&rxrpc_local_sem); - return ERR_PTR(-ENOMEM); - } + if (!local) + goto nomem; - ret = rxrpc_create_local(local); - if (ret < 0) { - up_write(&rxrpc_local_sem); - kfree(local); - _leave(" = %d", ret); - return ERR_PTR(ret); - } + ret = rxrpc_open_socket(local); + if (ret < 0) + goto sock_error; + + list_add_tail(&local->link, cursor); + age = "new"; - up_write(&rxrpc_local_sem); +found: + mutex_unlock(&rxrpc_local_mutex); - _net("LOCAL new %d {%d,%u,%pI4+%hu}", + _net("LOCAL %s %d {%d,%u,%pI4+%hu}", + age, local->debug_id, local->srx.transport_type, local->srx.transport.family, &local->srx.transport.sin.sin_addr, ntohs(local->srx.transport.sin.sin_port)); - _leave(" = %p [new]", local); + _leave(" = %p", local); return local; -found_local: - rxrpc_get_local(local); - read_unlock_bh(&rxrpc_local_lock); - up_write(&rxrpc_local_sem); +nomem: + ret = -ENOMEM; +sock_error: + mutex_unlock(&rxrpc_local_mutex); + kfree(local); + _leave(" = %d", ret); + return ERR_PTR(ret); - _net("LOCAL old %d {%d,%u,%pI4+%hu}", - local->debug_id, - local->srx.transport_type, - local->srx.transport.family, - &local->srx.transport.sin.sin_addr, - ntohs(local->srx.transport.sin.sin_port)); +addr_in_use: + mutex_unlock(&rxrpc_local_mutex); + _leave(" = -EADDRINUSE"); + return ERR_PTR(-EADDRINUSE); +} - _leave(" = %p [reuse]", local); - return local; +/* + * A local endpoint reached its end of life. + */ +void __rxrpc_put_local(struct rxrpc_local *local) +{ + _enter("%d", local->debug_id); + rxrpc_queue_work(&local->processor); } /* - * release a local endpoint + * Destroy a local endpoint's socket and then hand the record to RCU to dispose + * of. + * + * Closing the socket cannot be done from bottom half context or RCU callback + * context because it might sleep. */ -void rxrpc_put_local(struct rxrpc_local *local) +static void rxrpc_local_destroyer(struct rxrpc_local *local) { - _enter("%p{u=%d}", local, atomic_read(&local->usage)); + struct socket *socket = local->socket; - ASSERTCMP(atomic_read(&local->usage), >, 0); + _enter("%d", local->debug_id); - /* to prevent a race, the decrement and the dequeue must be effectively - * atomic */ - write_lock_bh(&rxrpc_local_lock); - if (unlikely(atomic_dec_and_test(&local->usage))) { - _debug("destroy local"); - rxrpc_queue_work(&local->destroyer); + /* We can get a race between an incoming call packet queueing the + * processor again and the work processor starting the destruction + * process which will shut down the UDP socket. + */ + if (local->dead) { + _leave(" [already dead]"); + return; } - write_unlock_bh(&rxrpc_local_lock); - _leave(""); + local->dead = true; + + mutex_lock(&rxrpc_local_mutex); + list_del_init(&local->link); + mutex_unlock(&rxrpc_local_mutex); + + ASSERT(list_empty(&local->services)); + + if (socket) { + local->socket = NULL; + kernel_sock_shutdown(socket, SHUT_RDWR); + socket->sk->sk_user_data = NULL; + sock_release(socket); + } + + /* At this point, there should be no more packets coming in to the + * local endpoint. + */ + rxrpc_purge_queue(&local->accept_queue); + rxrpc_purge_queue(&local->reject_queue); + rxrpc_purge_queue(&local->event_queue); + + _debug("rcu local %d", local->debug_id); + call_rcu(&local->rcu, rxrpc_local_rcu); } /* - * destroy a local endpoint + * Process events on an endpoint */ -static void rxrpc_destroy_local(struct work_struct *work) +static void rxrpc_local_processor(struct work_struct *work) { struct rxrpc_local *local = - container_of(work, struct rxrpc_local, destroyer); + container_of(work, struct rxrpc_local, processor); + bool again; - _enter("%p{%d}", local, atomic_read(&local->usage)); + _enter("%d", local->debug_id); - down_write(&rxrpc_local_sem); + do { + again = false; + if (atomic_read(&local->usage) == 0) + return rxrpc_local_destroyer(local); - write_lock_bh(&rxrpc_local_lock); - if (atomic_read(&local->usage) > 0) { - write_unlock_bh(&rxrpc_local_lock); - up_read(&rxrpc_local_sem); - _leave(" [resurrected]"); - return; - } + if (!skb_queue_empty(&local->accept_queue)) { + rxrpc_accept_incoming_calls(local); + again = true; + } - list_del(&local->link); - local->socket->sk->sk_user_data = NULL; - write_unlock_bh(&rxrpc_local_lock); + if (!skb_queue_empty(&local->reject_queue)) { + rxrpc_reject_packets(local); + again = true; + } - downgrade_write(&rxrpc_local_sem); + if (!skb_queue_empty(&local->event_queue)) { + rxrpc_process_local_events(local); + again = true; + } + } while (again); +} - ASSERT(list_empty(&local->services)); - ASSERT(!work_pending(&local->acceptor)); - ASSERT(!work_pending(&local->rejecter)); - ASSERT(!work_pending(&local->event_processor)); +/* + * Destroy a local endpoint after the RCU grace period expires. + */ +static void rxrpc_local_rcu(struct rcu_head *rcu) +{ + struct rxrpc_local *local = container_of(rcu, struct rxrpc_local, rcu); - /* finish cleaning up the local descriptor */ - rxrpc_purge_queue(&local->accept_queue); - rxrpc_purge_queue(&local->reject_queue); - rxrpc_purge_queue(&local->event_queue); - kernel_sock_shutdown(local->socket, SHUT_RDWR); - sock_release(local->socket); + _enter("%d", local->debug_id); - up_read(&rxrpc_local_sem); + ASSERT(!work_pending(&local->processor)); _net("DESTROY LOCAL %d", local->debug_id); kfree(local); - - if (list_empty(&rxrpc_locals)) - wake_up_all(&rxrpc_local_wq); - _leave(""); } /* - * preemptively destroy all local local endpoint rather than waiting for - * them to be destroyed + * Verify the local endpoint list is empty by this point. */ void __exit rxrpc_destroy_all_locals(void) { - DECLARE_WAITQUEUE(myself,current); + struct rxrpc_local *local; _enter(""); - /* we simply have to wait for them to go away */ - if (!list_empty(&rxrpc_locals)) { - set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&rxrpc_local_wq, &myself); - - while (!list_empty(&rxrpc_locals)) { - schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); - } + if (list_empty(&rxrpc_local_endpoints)) + return; - remove_wait_queue(&rxrpc_local_wq, &myself); - set_current_state(TASK_RUNNING); + mutex_lock(&rxrpc_local_mutex); + list_for_each_entry(local, &rxrpc_local_endpoints, link) { + pr_err("AF_RXRPC: Leaked local %p {%d}\n", + local, atomic_read(&local->usage)); } - - _leave(""); + mutex_unlock(&rxrpc_local_mutex); + BUG(); } -- cgit v1.2.3 From e53743994e21d2458f0129d07b253d66f96f5742 Mon Sep 17 00:00:00 2001 From: Eugene Crosser Date: Mon, 13 Jun 2016 18:46:14 +0200 Subject: af_iucv: use paged SKBs for big outbound messages When an outbound message is bigger than a page, allocate and fill a paged SKB, and subsequently use IUCV send primitive with IPBUFLST flag. This relaxes the pressure to allocate big contiguous kernel buffers. Signed-off-by: Eugene Crosser Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/iucv/af_iucv.c | 124 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 77 insertions(+), 47 deletions(-) (limited to 'net') diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index fc3598a922b0..38448d17c006 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1033,6 +1033,7 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, { struct sock *sk = sock->sk; struct iucv_sock *iucv = iucv_sk(sk); + size_t headroom, linear; struct sk_buff *skb; struct iucv_message txmsg = {0}; struct cmsghdr *cmsg; @@ -1110,20 +1111,31 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, * this is fine for SOCK_SEQPACKET (unless we want to support * segmented records using the MSG_EOR flag), but * for SOCK_STREAM we might want to improve it in future */ - if (iucv->transport == AF_IUCV_TRANS_HIPER) - skb = sock_alloc_send_skb(sk, - len + sizeof(struct af_iucv_trans_hdr) + ETH_HLEN, - noblock, &err); - else - skb = sock_alloc_send_skb(sk, len, noblock, &err); + headroom = (iucv->transport == AF_IUCV_TRANS_HIPER) + ? sizeof(struct af_iucv_trans_hdr) + ETH_HLEN : 0; + if (headroom + len < PAGE_SIZE) { + linear = len; + } else { + /* In nonlinear "classic" iucv skb, + * reserve space for iucv_array + */ + if (iucv->transport != AF_IUCV_TRANS_HIPER) + headroom += sizeof(struct iucv_array) * + (MAX_SKB_FRAGS + 1); + linear = PAGE_SIZE - headroom; + } + skb = sock_alloc_send_pskb(sk, headroom + linear, len - linear, + noblock, &err, 0); if (!skb) goto out; - if (iucv->transport == AF_IUCV_TRANS_HIPER) - skb_reserve(skb, sizeof(struct af_iucv_trans_hdr) + ETH_HLEN); - if (memcpy_from_msg(skb_put(skb, len), msg, len)) { - err = -EFAULT; + if (headroom) + skb_reserve(skb, headroom); + skb_put(skb, linear); + skb->len = len; + skb->data_len = len - linear; + err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len); + if (err) goto fail; - } /* wait if outstanding messages for iucv path has reached */ timeo = sock_sndtimeo(sk, noblock); @@ -1148,49 +1160,67 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, atomic_dec(&iucv->msg_sent); goto fail; } - goto release; - } - skb_queue_tail(&iucv->send_skb_q, skb); - - if (((iucv->path->flags & IUCV_IPRMDATA) & iucv->flags) - && skb->len <= 7) { - err = iucv_send_iprm(iucv->path, &txmsg, skb); + } else { /* Classic VM IUCV transport */ + skb_queue_tail(&iucv->send_skb_q, skb); + + if (((iucv->path->flags & IUCV_IPRMDATA) & iucv->flags) && + skb->len <= 7) { + err = iucv_send_iprm(iucv->path, &txmsg, skb); + + /* on success: there is no message_complete callback */ + /* for an IPRMDATA msg; remove skb from send queue */ + if (err == 0) { + skb_unlink(skb, &iucv->send_skb_q); + kfree_skb(skb); + } - /* on success: there is no message_complete callback - * for an IPRMDATA msg; remove skb from send queue */ - if (err == 0) { - skb_unlink(skb, &iucv->send_skb_q); - kfree_skb(skb); + /* this error should never happen since the */ + /* IUCV_IPRMDATA path flag is set... sever path */ + if (err == 0x15) { + pr_iucv->path_sever(iucv->path, NULL); + skb_unlink(skb, &iucv->send_skb_q); + err = -EPIPE; + goto fail; + } + } else if (skb_is_nonlinear(skb)) { + struct iucv_array *iba = (struct iucv_array *)skb->head; + int i; + + /* skip iucv_array lying in the headroom */ + iba[0].address = (u32)(addr_t)skb->data; + iba[0].length = (u32)skb_headlen(skb); + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + iba[i + 1].address = + (u32)(addr_t)skb_frag_address(frag); + iba[i + 1].length = (u32)skb_frag_size(frag); + } + err = pr_iucv->message_send(iucv->path, &txmsg, + IUCV_IPBUFLST, 0, + (void *)iba, skb->len); + } else { /* non-IPRM Linear skb */ + err = pr_iucv->message_send(iucv->path, &txmsg, + 0, 0, (void *)skb->data, skb->len); } - - /* this error should never happen since the - * IUCV_IPRMDATA path flag is set... sever path */ - if (err == 0x15) { - pr_iucv->path_sever(iucv->path, NULL); + if (err) { + if (err == 3) { + user_id[8] = 0; + memcpy(user_id, iucv->dst_user_id, 8); + appl_id[8] = 0; + memcpy(appl_id, iucv->dst_name, 8); + pr_err( + "Application %s on z/VM guest %s exceeds message limit\n", + appl_id, user_id); + err = -EAGAIN; + } else { + err = -EPIPE; + } skb_unlink(skb, &iucv->send_skb_q); - err = -EPIPE; goto fail; } - } else - err = pr_iucv->message_send(iucv->path, &txmsg, 0, 0, - (void *) skb->data, skb->len); - if (err) { - if (err == 3) { - user_id[8] = 0; - memcpy(user_id, iucv->dst_user_id, 8); - appl_id[8] = 0; - memcpy(appl_id, iucv->dst_name, 8); - pr_err("Application %s on z/VM guest %s" - " exceeds message limit\n", - appl_id, user_id); - err = -EAGAIN; - } else - err = -EPIPE; - skb_unlink(skb, &iucv->send_skb_q); - goto fail; } -release: release_sock(sk); return len; -- cgit v1.2.3 From 291759a57532b7940b6e52c54ceebd6b8d9e113e Mon Sep 17 00:00:00 2001 From: Eugene Crosser Date: Mon, 13 Jun 2016 18:46:15 +0200 Subject: af_iucv: remove fragment_skb() to use paged SKBs Before introducing paged skbs in the receive path, get rid of the function `iucv_fragment_skb()` that replaces one large linear skb with several smaller linear skbs. Signed-off-by: Eugene Crosser Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/iucv/af_iucv.c | 59 +++--------------------------------------------------- 1 file changed, 3 insertions(+), 56 deletions(-) (limited to 'net') diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 38448d17c006..9ed2adf9e057 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1231,44 +1231,6 @@ out: return err; } -/* iucv_fragment_skb() - Fragment a single IUCV message into multiple skb's - * - * Locking: must be called with message_q.lock held - */ -static int iucv_fragment_skb(struct sock *sk, struct sk_buff *skb, int len) -{ - int dataleft, size, copied = 0; - struct sk_buff *nskb; - - dataleft = len; - while (dataleft) { - if (dataleft >= sk->sk_rcvbuf / 4) - size = sk->sk_rcvbuf / 4; - else - size = dataleft; - - nskb = alloc_skb(size, GFP_ATOMIC | GFP_DMA); - if (!nskb) - return -ENOMEM; - - /* copy target class to control buffer of new skb */ - IUCV_SKB_CB(nskb)->class = IUCV_SKB_CB(skb)->class; - - /* copy data fragment */ - memcpy(nskb->data, skb->data + copied, size); - copied += size; - dataleft -= size; - - skb_reset_transport_header(nskb); - skb_reset_network_header(nskb); - nskb->len = size; - - skb_queue_tail(&iucv_sk(sk)->backlog_skb_q, nskb); - } - - return 0; -} - /* iucv_process_message() - Receive a single outstanding IUCV message * * Locking: must be called with message_q.lock held @@ -1300,24 +1262,9 @@ static void iucv_process_message(struct sock *sk, struct sk_buff *skb, kfree_skb(skb); return; } - /* we need to fragment iucv messages for SOCK_STREAM only; - * for SOCK_SEQPACKET, it is only relevant if we support - * record segmentation using MSG_EOR (see also recvmsg()) */ - if (sk->sk_type == SOCK_STREAM && - skb->truesize >= sk->sk_rcvbuf / 4) { - rc = iucv_fragment_skb(sk, skb, len); - kfree_skb(skb); - skb = NULL; - if (rc) { - pr_iucv->path_sever(path, NULL); - return; - } - skb = skb_dequeue(&iucv_sk(sk)->backlog_skb_q); - } else { - skb_reset_transport_header(skb); - skb_reset_network_header(skb); - skb->len = len; - } + skb_reset_transport_header(skb); + skb_reset_network_header(skb); + skb->len = len; } IUCV_SKB_CB(skb)->offset = 0; -- cgit v1.2.3 From a006353a9a8d9e28d35f94bfc97e9573d6ee28aa Mon Sep 17 00:00:00 2001 From: Eugene Crosser Date: Mon, 13 Jun 2016 18:46:16 +0200 Subject: af_iucv: use paged SKBs for big inbound messages When an inbound message is bigger than a page, allocate a paged SKB, and subsequently use IUCV receive primitive with IPBUFLST flag. This relaxes the pressure to allocate big contiguous kernel buffers. Signed-off-by: Eugene Crosser Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/iucv/af_iucv.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 9ed2adf9e057..37d674e6f8a9 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1231,6 +1231,34 @@ out: return err; } +static struct sk_buff *alloc_iucv_recv_skb(unsigned long len) +{ + size_t headroom, linear; + struct sk_buff *skb; + int err; + + if (len < PAGE_SIZE) { + headroom = 0; + linear = len; + } else { + headroom = sizeof(struct iucv_array) * (MAX_SKB_FRAGS + 1); + linear = PAGE_SIZE - headroom; + } + skb = alloc_skb_with_frags(headroom + linear, len - linear, + 0, &err, GFP_ATOMIC | GFP_DMA); + WARN_ONCE(!skb, + "alloc of recv iucv skb len=%lu failed with errcode=%d\n", + len, err); + if (skb) { + if (headroom) + skb_reserve(skb, headroom); + skb_put(skb, linear); + skb->len = len; + skb->data_len = len - linear; + } + return skb; +} + /* iucv_process_message() - Receive a single outstanding IUCV message * * Locking: must be called with message_q.lock held @@ -1255,16 +1283,32 @@ static void iucv_process_message(struct sock *sk, struct sk_buff *skb, skb->len = 0; } } else { - rc = pr_iucv->message_receive(path, msg, + if (skb_is_nonlinear(skb)) { + struct iucv_array *iba = (struct iucv_array *)skb->head; + int i; + + iba[0].address = (u32)(addr_t)skb->data; + iba[0].length = (u32)skb_headlen(skb); + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + iba[i + 1].address = + (u32)(addr_t)skb_frag_address(frag); + iba[i + 1].length = (u32)skb_frag_size(frag); + } + rc = pr_iucv->message_receive(path, msg, + IUCV_IPBUFLST, + (void *)iba, len, NULL); + } else { + rc = pr_iucv->message_receive(path, msg, msg->flags & IUCV_IPRMDATA, skb->data, len, NULL); + } if (rc) { kfree_skb(skb); return; } - skb_reset_transport_header(skb); - skb_reset_network_header(skb); - skb->len = len; + WARN_ON_ONCE(skb->len != len); } IUCV_SKB_CB(skb)->offset = 0; @@ -1283,7 +1327,7 @@ static void iucv_process_message_q(struct sock *sk) struct sock_msg_q *p, *n; list_for_each_entry_safe(p, n, &iucv->message_q.list, list) { - skb = alloc_skb(iucv_msg_length(&p->msg), GFP_ATOMIC | GFP_DMA); + skb = alloc_iucv_recv_skb(iucv_msg_length(&p->msg)); if (!skb) break; iucv_process_message(sk, skb, p->path, &p->msg); @@ -1778,7 +1822,7 @@ static void iucv_callback_rx(struct iucv_path *path, struct iucv_message *msg) if (len > sk->sk_rcvbuf) goto save_message; - skb = alloc_skb(iucv_msg_length(msg), GFP_ATOMIC | GFP_DMA); + skb = alloc_iucv_recv_skb(iucv_msg_length(msg)); if (!skb) goto save_message; -- cgit v1.2.3 From cd2a9e62c8a3c5cae7691982667d79a0edc65283 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 13 Jun 2016 13:44:17 -0700 Subject: net: l3mdev: Remove const from flowi6 arg to get_rt6_dst Allow drivers to pass flow arg to functions where the arg is not const and allow the driver to make updates as needed (eg., setting oif). Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/l3mdev/l3mdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index 7da97809a7e8..d90e4ef09e85 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -108,7 +108,7 @@ EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index); */ struct dst_entry *l3mdev_get_rt6_dst(struct net *net, - const struct flowi6 *fl6) + struct flowi6 *fl6) { struct dst_entry *dst = NULL; struct net_device *dev; -- cgit v1.2.3 From ba46ee4c0ed122fa14aa2f5d6994c166a01ae2c0 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 13 Jun 2016 13:44:18 -0700 Subject: net: ipv6: Do not add multicast route for l3 master devices L3 master devices are virtual devices similar to the loopback device. Link local and multicast routes for these devices do not make sense. The ipv6 addrconf code already skips adding a linklocal address; do the same for the mcast route. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 47f837a58e0a..b12553905e42 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2254,7 +2254,7 @@ static struct inet6_dev *addrconf_add_dev(struct net_device *dev) return ERR_PTR(-EACCES); /* Add default multicast route */ - if (!(dev->flags & IFF_LOOPBACK)) + if (!(dev->flags & IFF_LOOPBACK) && !netif_is_l3_master(dev)) addrconf_add_mroute(dev); return idev; -- cgit v1.2.3 From 9ff74384600aeecba34ebdacbbde0627489ff601 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 13 Jun 2016 13:44:19 -0700 Subject: net: vrf: Handle ipv6 multicast and link-local addresses IPv6 multicast and link-local addresses require special handling by the VRF driver: 1. Rather than using the VRF device index and full FIB lookups, packets to/from these addresses should use direct FIB lookups based on the VRF device table. 2. fail sends/receives on a VRF device to/from a multicast address (e.g, make ping6 ff02::1% fail) 3. move the setting of the flow oif to the first dst lookup and revert the change in icmpv6_echo_reply made in ca254490c8dfd ("net: Add VRF support to IPv6 stack"). Linklocal/mcast addresses require use of the skb->dev. With this change connections into and out of a VRF enslaved device work for multicast and link-local addresses work (icmp, tcp, and udp) e.g., 1. packets into VM with VRF config: ping6 -c3 fe80::e0:f9ff:fe1c:b974%br1 ping6 -c3 ff02::1%br1 ssh -6 fe80::e0:f9ff:fe1c:b974%br1 2. packets going out a VRF enslaved device: ping6 -c3 fe80::18f8:83ff:fe4b:7a2e%eth1 ping6 -c3 ff02::1%eth1 ssh -6 root@fe80::18f8:83ff:fe4b:7a2e%eth1 Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/icmp.c | 2 +- net/ipv6/route.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 40454bfb534e..e32a72fb9982 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -587,7 +587,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) fl6.daddr = ipv6_hdr(skb)->saddr; if (saddr) fl6.saddr = *saddr; - fl6.flowi6_oif = l3mdev_fib_oif(skb->dev); + fl6.flowi6_oif = skb->dev->ifindex; fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY; fl6.flowi6_mark = mark; security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c6ae6f9b5fe3..d51a1a48b839 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1042,8 +1042,8 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) return pcpu_rt; } -static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, - struct flowi6 *fl6, int flags) +struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, + int oif, struct flowi6 *fl6, int flags) { struct fib6_node *fn, *saved_fn; struct rt6_info *rt; @@ -1139,6 +1139,7 @@ redo_rt6_select: } } +EXPORT_SYMBOL_GPL(ip6_pol_route); static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, struct flowi6 *fl6, int flags) -- cgit v1.2.3 From b2313077ed0db35ee186905d8076a737248edd24 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 13 Jun 2016 13:46:28 -0700 Subject: net_sched: make tcf_hash_check() boolean Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_api.c | 8 ++++---- net/sched/act_ife.c | 3 ++- net/sched/act_ipt.c | 3 ++- net/sched/act_mirred.c | 3 ++- net/sched/act_simple.c | 3 ++- net/sched/act_skbedit.c | 3 ++- net/sched/act_vlan.c | 4 ++-- 7 files changed, 16 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index b6db56ec8117..f8c61d2a7963 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -224,8 +224,8 @@ int tcf_hash_search(struct tc_action_net *tn, struct tc_action *a, u32 index) } EXPORT_SYMBOL(tcf_hash_search); -int tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a, - int bind) +bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a, + int bind) { struct tcf_hashinfo *hinfo = tn->hinfo; struct tcf_common *p = NULL; @@ -235,9 +235,9 @@ int tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a, p->tcfc_refcnt++; a->priv = p; a->hinfo = hinfo; - return 1; + return true; } - return 0; + return false; } EXPORT_SYMBOL(tcf_hash_check); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 02f5a8ba95d7..b7fa96926c90 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -423,7 +423,8 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, u16 ife_type = 0; u8 *daddr = NULL; u8 *saddr = NULL; - int ret = 0, exists = 0; + bool exists = false; + int ret = 0; int err; err = nla_parse_nested(tb, TCA_IFE_MAX, nla, ife_policy); diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 8998a3594e86..6148e323ed93 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -97,7 +97,8 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, struct tcf_ipt *ipt; struct xt_entry_target *td, *t; char *tname; - int ret = 0, err, exists = 0; + bool exists = false; + int ret = 0, err; u32 hook = 0; u32 index = 0; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 787751a7981a..5b135d357e1e 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -62,7 +62,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct tc_mirred *parm; struct tcf_mirred *m; struct net_device *dev; - int ret, ok_push = 0, exists = 0; + int ret, ok_push = 0; + bool exists = false; if (nla == NULL) return -EINVAL; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index be5fbb51cfed..318328d34d12 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -86,8 +86,9 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *tb[TCA_DEF_MAX + 1]; struct tc_defact *parm; struct tcf_defact *d; + bool exists = false; + int ret = 0, err; char *defdata; - int ret = 0, err, exists = 0; if (nla == NULL) return -EINVAL; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 7e2bc3c2b6da..53d1486cddf7 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -69,7 +69,8 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct tcf_skbedit *d; u32 flags = 0, *priority = NULL, *mark = NULL; u16 *queue_mapping = NULL; - int ret = 0, err, exists = 0; + bool exists = false; + int ret = 0, err; if (nla == NULL) return -EINVAL; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index b075d50e0fc3..db9b7ed570ba 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -77,8 +77,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, int action; __be16 push_vid = 0; __be16 push_proto = 0; - int ret = 0, exists = 0; - int err; + bool exists = false; + int ret = 0, err; if (!nla) return -EINVAL; -- cgit v1.2.3 From 35c55c9877f8de0ab129fa1a309271d0ecc868b9 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Mon, 13 Jun 2016 20:46:22 -0400 Subject: tipc: add neighbor monitoring framework TIPC based clusters are by default set up with full-mesh link connectivity between all nodes. Those links are expected to provide a short failure detection time, by default set to 1500 ms. Because of this, the background load for neighbor monitoring in an N-node cluster increases with a factor N on each node, while the overall monitoring traffic through the network infrastructure increases at a ~(N * (N - 1)) rate. Experience has shown that such clusters don't scale well beyond ~100 nodes unless we significantly increase failure discovery tolerance. This commit introduces a framework and an algorithm that drastically reduces this background load, while basically maintaining the original failure detection times across the whole cluster. Using this algorithm, background load will now grow at a rate of ~(2 * sqrt(N)) per node, and at ~(2 * N * sqrt(N)) in traffic overhead. As an example, each node will now have to actively monitor 38 neighbors in a 400-node cluster, instead of as before 399. This "Overlapping Ring Supervision Algorithm" is completely distributed and employs no centralized or coordinated state. It goes as follows: - Each node makes up a linearly ascending, circular list of all its N known neighbors, based on their TIPC node identity. This algorithm must be the same on all nodes. - The node then selects the next M = sqrt(N) - 1 nodes downstream from itself in the list, and chooses to actively monitor those. This is called its "local monitoring domain". - It creates a domain record describing the monitoring domain, and piggy-backs this in the data area of all neighbor monitoring messages (LINK_PROTOCOL/STATE) leaving that node. This means that all nodes in the cluster eventually (default within 400 ms) will learn about its monitoring domain. - Whenever a node discovers a change in its local domain, e.g., a node has been added or has gone down, it creates and sends out a new version of its node record to inform all neighbors about the change. - A node receiving a domain record from anybody outside its local domain matches this against its own list (which may not look the same), and chooses to not actively monitor those members of the received domain record that are also present in its own list. Instead, it relies on indications from the direct monitoring nodes if an indirectly monitored node has gone up or down. If a node is indicated lost, the receiving node temporarily activates its own direct monitoring towards that node in order to confirm, or not, that it is actually gone. - Since each node is actively monitoring sqrt(N) downstream neighbors, each node is also actively monitored by the same number of upstream neighbors. This means that all non-direct monitoring nodes normally will receive sqrt(N) indications that a node is gone. - A major drawback with ring monitoring is how it handles failures that cause massive network partitionings. If both a lost node and all its direct monitoring neighbors are inside the lost partition, the nodes in the remaining partition will never receive indications about the loss. To overcome this, each node also chooses to actively monitor some nodes outside its local domain. Those nodes are called remote domain "heads", and are selected in such a way that no node in the cluster will be more than two direct monitoring hops away. Because of this, each node, apart from monitoring the member of its local domain, will also typically monitor sqrt(N) remote head nodes. - As an optimization, local list status, domain status and domain records are marked with a generation number. This saves senders from unnecessarily conveying unaltered domain records, and receivers from performing unneeded re-adaptations of their node monitoring list, such as re-assigning domain heads. - As a measure of caution we have added the possibility to disable the new algorithm through configuration. We do this by keeping a threshold value for the cluster size; a cluster that grows beyond this value will switch from full-mesh to ring monitoring, and vice versa when it shrinks below the value. This means that if the threshold is set to a value larger than any anticipated cluster size (default size is 32) the new algorithm is effectively disabled. A patch set for altering the threshold value and for listing the table contents will follow shortly. - This change is fully backwards compatible. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/Makefile | 2 +- net/tipc/addr.h | 1 + net/tipc/bearer.c | 8 +- net/tipc/bearer.h | 2 +- net/tipc/core.c | 1 + net/tipc/core.h | 15 +- net/tipc/link.c | 49 +++- net/tipc/monitor.c | 651 +++++++++++++++++++++++++++++++++++++++++++++++++++++ net/tipc/monitor.h | 73 ++++++ net/tipc/node.c | 26 +-- 10 files changed, 797 insertions(+), 31 deletions(-) create mode 100644 net/tipc/monitor.c create mode 100644 net/tipc/monitor.h (limited to 'net') diff --git a/net/tipc/Makefile b/net/tipc/Makefile index 57e460be4692..31b9f9c52974 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_TIPC) := tipc.o tipc-y += addr.o bcast.o bearer.o \ core.o link.o discover.o msg.o \ - name_distr.o subscr.o name_table.o net.o \ + name_distr.o subscr.o monitor.o name_table.o net.o \ netlink.o netlink_compat.o node.o socket.o eth_media.o \ server.o socket.o diff --git a/net/tipc/addr.h b/net/tipc/addr.h index 93f7c983be33..64f4004a6fac 100644 --- a/net/tipc/addr.h +++ b/net/tipc/addr.h @@ -73,4 +73,5 @@ int tipc_addr_node_valid(u32 addr); int tipc_in_scope(u32 domain, u32 addr); int tipc_addr_scope(u32 domain); char *tipc_addr_string_fill(char *string, u32 addr); + #endif diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 6f11c62bc8f9..9a70e1d744d2 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -1,7 +1,7 @@ /* * net/tipc/bearer.c: TIPC bearer code * - * Copyright (c) 1996-2006, 2013-2014, Ericsson AB + * Copyright (c) 1996-2006, 2013-2016, Ericsson AB * Copyright (c) 2004-2006, 2010-2013, Wind River Systems * All rights reserved. * @@ -39,6 +39,7 @@ #include "bearer.h" #include "link.h" #include "discover.h" +#include "monitor.h" #include "bcast.h" #include "netlink.h" @@ -313,6 +314,10 @@ restart: rcu_assign_pointer(tn->bearer_list[bearer_id], b); if (skb) tipc_bearer_xmit_skb(net, bearer_id, skb, &b->bcast_addr); + + if (tipc_mon_create(net, bearer_id)) + return -ENOMEM; + pr_info("Enabled bearer <%s>, discovery domain %s, priority %u\n", name, tipc_addr_string_fill(addr_string, disc_domain), priority); @@ -348,6 +353,7 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b) tipc_disc_delete(b->link_req); RCU_INIT_POINTER(tn->bearer_list[bearer_id], NULL); kfree_rcu(b, rcu); + tipc_mon_delete(net, bearer_id); } int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index f686e41b5abb..0d337c7b6fad 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -1,7 +1,7 @@ /* * net/tipc/bearer.h: Include file for TIPC bearer code * - * Copyright (c) 1996-2006, 2013-2014, Ericsson AB + * Copyright (c) 1996-2006, 2013-2016, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * diff --git a/net/tipc/core.c b/net/tipc/core.c index fe1b062c4f18..236b043a4156 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -57,6 +57,7 @@ static int __net_init tipc_init_net(struct net *net) tn->net_id = 4711; tn->own_addr = 0; + tn->mon_threshold = TIPC_DEF_MON_THRESHOLD; get_random_bytes(&tn->random, sizeof(int)); INIT_LIST_HEAD(&tn->node_list); spin_lock_init(&tn->node_list_lock); diff --git a/net/tipc/core.h b/net/tipc/core.h index eff58dc53aa1..a1845fb27d80 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -66,11 +66,13 @@ struct tipc_bc_base; struct tipc_link; struct tipc_name_table; struct tipc_server; +struct tipc_monitor; #define TIPC_MOD_VER "2.0.0" -#define NODE_HTABLE_SIZE 512 -#define MAX_BEARERS 3 +#define NODE_HTABLE_SIZE 512 +#define MAX_BEARERS 3 +#define TIPC_DEF_MON_THRESHOLD 32 extern int tipc_net_id __read_mostly; extern int sysctl_tipc_rmem[3] __read_mostly; @@ -88,6 +90,10 @@ struct tipc_net { u32 num_nodes; u32 num_links; + /* Neighbor monitoring list */ + struct tipc_monitor *monitors[MAX_BEARERS]; + int mon_threshold; + /* Bearer list */ struct tipc_bearer __rcu *bearer_list[MAX_BEARERS + 1]; @@ -126,6 +132,11 @@ static inline struct list_head *tipc_nodes(struct net *net) return &tipc_net(net)->node_list; } +static inline unsigned int tipc_hashfn(u32 addr) +{ + return addr & (NODE_HTABLE_SIZE - 1); +} + static inline u16 mod(u16 x) { return x & 0xffffu; diff --git a/net/tipc/link.c b/net/tipc/link.c index a904ccd5a93a..03f8bdf70d8f 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -42,6 +42,7 @@ #include "name_distr.h" #include "discover.h" #include "netlink.h" +#include "monitor.h" #include @@ -95,6 +96,7 @@ struct tipc_stats { * @pmsg: convenience pointer to "proto_msg" field * @priority: current link priority * @net_plane: current link network plane ('A' through 'H') + * @mon_state: cookie with information needed by link monitor * @backlog_limit: backlog queue congestion thresholds (indexed by importance) * @exp_msg_count: # of tunnelled messages expected during link changeover * @reset_rcv_checkpt: seq # of last acknowledged message at time of link reset @@ -138,6 +140,7 @@ struct tipc_link { char if_name[TIPC_MAX_IF_NAME]; u32 priority; char net_plane; + struct tipc_mon_state mon_state; u16 rst_cnt; /* Failover/synch */ @@ -708,18 +711,25 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) bool setup = false; u16 bc_snt = l->bc_sndlink->snd_nxt - 1; u16 bc_acked = l->bc_rcvlink->acked; - - link_profile_stats(l); + struct tipc_mon_state *mstate = &l->mon_state; switch (l->state) { case LINK_ESTABLISHED: case LINK_SYNCHING: - if (l->silent_intv_cnt > l->abort_limit) - return tipc_link_fsm_evt(l, LINK_FAILURE_EVT); mtyp = STATE_MSG; + link_profile_stats(l); + tipc_mon_get_state(l->net, l->addr, mstate, l->bearer_id); + if (mstate->reset || (l->silent_intv_cnt > l->abort_limit)) + return tipc_link_fsm_evt(l, LINK_FAILURE_EVT); state = bc_acked != bc_snt; - probe = l->silent_intv_cnt; - l->silent_intv_cnt++; + state |= l->bc_rcvlink->rcv_unacked; + state |= l->rcv_unacked; + state |= !skb_queue_empty(&l->transmq); + state |= !skb_queue_empty(&l->deferdq); + probe = mstate->probing; + probe |= l->silent_intv_cnt; + if (probe || mstate->monitoring) + l->silent_intv_cnt++; break; case LINK_RESET: setup = l->rst_cnt++ <= 4; @@ -830,6 +840,7 @@ void tipc_link_reset(struct tipc_link *l) l->stats.recv_info = 0; l->stale_count = 0; l->bc_peer_is_up = false; + memset(&l->mon_state, 0, sizeof(l->mon_state)); tipc_link_reset_stats(l); } @@ -1238,6 +1249,9 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, struct tipc_msg *hdr; struct sk_buff_head *dfq = &l->deferdq; bool node_up = link_is_up(l->bc_rcvlink); + struct tipc_mon_state *mstate = &l->mon_state; + int dlen = 0; + void *data; /* Don't send protocol message during reset or link failover */ if (tipc_link_is_blocked(l)) @@ -1250,12 +1264,13 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt; skb = tipc_msg_create(LINK_PROTOCOL, mtyp, INT_H_SIZE, - TIPC_MAX_IF_NAME, l->addr, + tipc_max_domain_size, l->addr, tipc_own_addr(l->net), 0, 0, 0); if (!skb) return; hdr = buf_msg(skb); + data = msg_data(hdr); msg_set_session(hdr, l->session); msg_set_bearer_id(hdr, l->bearer_id); msg_set_net_plane(hdr, l->net_plane); @@ -1271,14 +1286,18 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, if (mtyp == STATE_MSG) { msg_set_seq_gap(hdr, rcvgap); - msg_set_size(hdr, INT_H_SIZE); msg_set_probe(hdr, probe); + tipc_mon_prep(l->net, data, &dlen, mstate, l->bearer_id); + msg_set_size(hdr, INT_H_SIZE + dlen); + skb_trim(skb, INT_H_SIZE + dlen); l->stats.sent_states++; l->rcv_unacked = 0; } else { /* RESET_MSG or ACTIVATE_MSG */ msg_set_max_pkt(hdr, l->advertised_mtu); - strcpy(msg_data(hdr), l->if_name); + strcpy(data, l->if_name); + msg_set_size(hdr, INT_H_SIZE + TIPC_MAX_IF_NAME); + skb_trim(skb, INT_H_SIZE + TIPC_MAX_IF_NAME); } if (probe) l->stats.sent_probes++; @@ -1371,7 +1390,9 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, u16 peers_tol = msg_link_tolerance(hdr); u16 peers_prio = msg_linkprio(hdr); u16 rcv_nxt = l->rcv_nxt; + u16 dlen = msg_data_sz(hdr); int mtyp = msg_type(hdr); + void *data; char *if_name; int rc = 0; @@ -1381,6 +1402,10 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, if (tipc_own_addr(l->net) > msg_prevnode(hdr)) l->net_plane = msg_net_plane(hdr); + skb_linearize(skb); + hdr = buf_msg(skb); + data = msg_data(hdr); + switch (mtyp) { case RESET_MSG: @@ -1391,8 +1416,6 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, /* fall thru' */ case ACTIVATE_MSG: - skb_linearize(skb); - hdr = buf_msg(skb); /* Complete own link name with peer's interface name */ if_name = strrchr(l->name, ':') + 1; @@ -1400,7 +1423,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, break; if (msg_data_sz(hdr) < TIPC_MAX_IF_NAME) break; - strncpy(if_name, msg_data(hdr), TIPC_MAX_IF_NAME); + strncpy(if_name, data, TIPC_MAX_IF_NAME); /* Update own tolerance if peer indicates a non-zero value */ if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) @@ -1448,6 +1471,8 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, rc = TIPC_LINK_UP_EVT; break; } + tipc_mon_rcv(l->net, data, dlen, l->addr, + &l->mon_state, l->bearer_id); /* Send NACK if peer has sent pkts we haven't received yet */ if (more(peers_snd_nxt, rcv_nxt) && !tipc_link_is_synching(l)) diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c new file mode 100644 index 000000000000..87d4efedd09f --- /dev/null +++ b/net/tipc/monitor.c @@ -0,0 +1,651 @@ +/* + * net/tipc/monitor.c + * + * Copyright (c) 2016, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "addr.h" +#include "monitor.h" + +#define MAX_MON_DOMAIN 64 +#define MON_TIMEOUT 120000 +#define MAX_PEER_DOWN_EVENTS 4 + +/* struct tipc_mon_domain: domain record to be transferred between peers + * @len: actual size of domain record + * @gen: current generation of sender's domain + * @ack_gen: most recent generation of self's domain acked by peer + * @member_cnt: number of domain member nodes described in this record + * @up_map: bit map indicating which of the members the sender considers up + * @members: identity of the domain members + */ +struct tipc_mon_domain { + u16 len; + u16 gen; + u16 ack_gen; + u16 member_cnt; + u64 up_map; + u32 members[MAX_MON_DOMAIN]; +}; + +/* struct tipc_peer: state of a peer node and its domain + * @addr: tipc node identity of peer + * @head_map: shows which other nodes currently consider peer 'up' + * @domain: most recent domain record from peer + * @hash: position in hashed lookup list + * @list: position in linked list, in circular ascending order by 'addr' + * @applied: number of reported domain members applied on this monitor list + * @is_up: peer is up as seen from this node + * @is_head: peer is assigned domain head as seen from this node + * @is_local: peer is in local domain and should be continuously monitored + * @down_cnt: - numbers of other peers which have reported this on lost + */ +struct tipc_peer { + u32 addr; + struct tipc_mon_domain *domain; + struct hlist_node hash; + struct list_head list; + u8 applied; + u8 down_cnt; + bool is_up; + bool is_head; + bool is_local; +}; + +struct tipc_monitor { + struct hlist_head peers[NODE_HTABLE_SIZE]; + int peer_cnt; + struct tipc_peer *self; + rwlock_t lock; + struct tipc_mon_domain cache; + u16 list_gen; + u16 dom_gen; + struct net *net; + struct timer_list timer; + unsigned long timer_intv; +}; + +static struct tipc_monitor *tipc_monitor(struct net *net, int bearer_id) +{ + return tipc_net(net)->monitors[bearer_id]; +} + +const int tipc_max_domain_size = sizeof(struct tipc_mon_domain); + +/* dom_rec_len(): actual length of domain record for transport + */ +static int dom_rec_len(struct tipc_mon_domain *dom, u16 mcnt) +{ + return ((void *)&dom->members - (void *)dom) + (mcnt * sizeof(u32)); +} + +/* dom_size() : calculate size of own domain based on number of peers + */ +static int dom_size(int peers) +{ + int i = 0; + + while ((i * i) < peers) + i++; + return i < MAX_MON_DOMAIN ? i : MAX_MON_DOMAIN; +} + +static void map_set(u64 *up_map, int i, unsigned int v) +{ + *up_map &= ~(1 << i); + *up_map |= (v << i); +} + +static int map_get(u64 up_map, int i) +{ + return (up_map & (1 << i)) >> i; +} + +static struct tipc_peer *peer_prev(struct tipc_peer *peer) +{ + return list_last_entry(&peer->list, struct tipc_peer, list); +} + +static struct tipc_peer *peer_nxt(struct tipc_peer *peer) +{ + return list_first_entry(&peer->list, struct tipc_peer, list); +} + +static struct tipc_peer *peer_head(struct tipc_peer *peer) +{ + while (!peer->is_head) + peer = peer_prev(peer); + return peer; +} + +static struct tipc_peer *get_peer(struct tipc_monitor *mon, u32 addr) +{ + struct tipc_peer *peer; + unsigned int thash = tipc_hashfn(addr); + + hlist_for_each_entry(peer, &mon->peers[thash], hash) { + if (peer->addr == addr) + return peer; + } + return NULL; +} + +static struct tipc_peer *get_self(struct net *net, int bearer_id) +{ + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + + return mon->self; +} + +static inline bool tipc_mon_is_active(struct net *net, struct tipc_monitor *mon) +{ + struct tipc_net *tn = tipc_net(net); + + return mon->peer_cnt > tn->mon_threshold; +} + +/* mon_identify_lost_members() : - identify amd mark potentially lost members + */ +static void mon_identify_lost_members(struct tipc_peer *peer, + struct tipc_mon_domain *dom_bef, + int applied_bef) +{ + struct tipc_peer *member = peer; + struct tipc_mon_domain *dom_aft = peer->domain; + int applied_aft = peer->applied; + int i; + + for (i = 0; i < applied_bef; i++) { + member = peer_nxt(member); + + /* Do nothing if self or peer already see member as down */ + if (!member->is_up || !map_get(dom_bef->up_map, i)) + continue; + + /* Loss of local node must be detected by active probing */ + if (member->is_local) + continue; + + /* Start probing if member was removed from applied domain */ + if (!applied_aft || (applied_aft < i)) { + member->down_cnt = 1; + continue; + } + + /* Member loss is confirmed if it is still in applied domain */ + if (!map_get(dom_aft->up_map, i)) + member->down_cnt++; + } +} + +/* mon_apply_domain() : match a peer's domain record against monitor list + */ +static void mon_apply_domain(struct tipc_monitor *mon, + struct tipc_peer *peer) +{ + struct tipc_mon_domain *dom = peer->domain; + struct tipc_peer *member; + u32 addr; + int i; + + if (!dom || !peer->is_up) + return; + + /* Scan across domain members and match against monitor list */ + peer->applied = 0; + member = peer_nxt(peer); + for (i = 0; i < dom->member_cnt; i++) { + addr = dom->members[i]; + if (addr != member->addr) + return; + peer->applied++; + member = peer_nxt(member); + } +} + +/* mon_update_local_domain() : update after peer addition/removal/up/down + */ +static void mon_update_local_domain(struct tipc_monitor *mon) +{ + struct tipc_peer *self = mon->self; + struct tipc_mon_domain *cache = &mon->cache; + struct tipc_mon_domain *dom = self->domain; + struct tipc_peer *peer = self; + u64 prev_up_map = dom->up_map; + u16 member_cnt, i; + bool diff; + + /* Update local domain size based on current size of cluster */ + member_cnt = dom_size(mon->peer_cnt) - 1; + self->applied = member_cnt; + + /* Update native and cached outgoing local domain records */ + dom->len = dom_rec_len(dom, member_cnt); + diff = dom->member_cnt != member_cnt; + dom->member_cnt = member_cnt; + for (i = 0; i < member_cnt; i++) { + peer = peer_nxt(peer); + diff |= dom->members[i] != peer->addr; + dom->members[i] = peer->addr; + map_set(&dom->up_map, i, peer->is_up); + cache->members[i] = htonl(peer->addr); + } + diff |= dom->up_map != prev_up_map; + if (!diff) + return; + dom->gen = ++mon->dom_gen; + cache->len = htons(dom->len); + cache->gen = htons(dom->gen); + cache->member_cnt = htons(member_cnt); + cache->up_map = cpu_to_be64(dom->up_map); + mon_apply_domain(mon, self); +} + +/* mon_update_neighbors() : update preceding neighbors of added/removed peer + */ +static void mon_update_neighbors(struct tipc_monitor *mon, + struct tipc_peer *peer) +{ + int dz, i; + + dz = dom_size(mon->peer_cnt); + for (i = 0; i < dz; i++) { + mon_apply_domain(mon, peer); + peer = peer_prev(peer); + } +} + +/* mon_assign_roles() : reassign peer roles after a network change + * The monitor list is consistent at this stage; i.e., each peer is monitoring + * a set of domain members as matched between domain record and the monitor list + */ +static void mon_assign_roles(struct tipc_monitor *mon, struct tipc_peer *head) +{ + struct tipc_peer *peer = peer_nxt(head); + struct tipc_peer *self = mon->self; + int i = 0; + + for (; peer != self; peer = peer_nxt(peer)) { + peer->is_local = false; + + /* Update domain member */ + if (i++ < head->applied) { + peer->is_head = false; + if (head == self) + peer->is_local = true; + continue; + } + /* Assign next domain head */ + if (!peer->is_up) + continue; + if (peer->is_head) + break; + head = peer; + head->is_head = true; + i = 0; + } + mon->list_gen++; +} + +void tipc_mon_remove_peer(struct net *net, u32 addr, int bearer_id) +{ + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + struct tipc_peer *self = get_self(net, bearer_id); + struct tipc_peer *peer, *prev, *head; + + write_lock_bh(&mon->lock); + peer = get_peer(mon, addr); + if (!peer) + goto exit; + prev = peer_prev(peer); + list_del(&peer->list); + hlist_del(&peer->hash); + kfree(peer->domain); + kfree(peer); + mon->peer_cnt--; + head = peer_head(prev); + if (head == self) + mon_update_local_domain(mon); + mon_update_neighbors(mon, prev); + + /* Revert to full-mesh monitoring if we reach threshold */ + if (!tipc_mon_is_active(net, mon)) { + list_for_each_entry(peer, &self->list, list) { + kfree(peer->domain); + peer->domain = NULL; + peer->applied = 0; + } + } + mon_assign_roles(mon, head); +exit: + write_unlock_bh(&mon->lock); +} + +static bool tipc_mon_add_peer(struct tipc_monitor *mon, u32 addr, + struct tipc_peer **peer) +{ + struct tipc_peer *self = mon->self; + struct tipc_peer *cur, *prev, *p; + + p = kzalloc(sizeof(*p), GFP_ATOMIC); + *peer = p; + if (!p) + return false; + p->addr = addr; + + /* Add new peer to lookup list */ + INIT_LIST_HEAD(&p->list); + hlist_add_head(&p->hash, &mon->peers[tipc_hashfn(addr)]); + + /* Sort new peer into iterator list, in ascending circular order */ + prev = self; + list_for_each_entry(cur, &self->list, list) { + if ((addr > prev->addr) && (addr < cur->addr)) + break; + if (((addr < cur->addr) || (addr > prev->addr)) && + (prev->addr > cur->addr)) + break; + prev = cur; + } + list_add_tail(&p->list, &cur->list); + mon->peer_cnt++; + mon_update_neighbors(mon, p); + return true; +} + +void tipc_mon_peer_up(struct net *net, u32 addr, int bearer_id) +{ + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + struct tipc_peer *self = get_self(net, bearer_id); + struct tipc_peer *peer, *head; + + write_lock_bh(&mon->lock); + peer = get_peer(mon, addr); + if (!peer && !tipc_mon_add_peer(mon, addr, &peer)) + goto exit; + peer->is_up = true; + head = peer_head(peer); + if (head == self) + mon_update_local_domain(mon); + mon_assign_roles(mon, head); +exit: + write_unlock_bh(&mon->lock); +} + +void tipc_mon_peer_down(struct net *net, u32 addr, int bearer_id) +{ + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + struct tipc_peer *self = get_self(net, bearer_id); + struct tipc_peer *peer, *head; + struct tipc_mon_domain *dom; + int applied; + + write_lock_bh(&mon->lock); + peer = get_peer(mon, addr); + if (!peer) { + pr_warn("Mon: unknown link %x/%u DOWN\n", addr, bearer_id); + goto exit; + } + applied = peer->applied; + peer->applied = 0; + dom = peer->domain; + peer->domain = NULL; + if (peer->is_head) + mon_identify_lost_members(peer, dom, applied); + kfree(dom); + peer->is_up = false; + peer->is_head = false; + peer->is_local = false; + peer->down_cnt = 0; + head = peer_head(peer); + if (head == self) + mon_update_local_domain(mon); + mon_assign_roles(mon, head); +exit: + write_unlock_bh(&mon->lock); +} + +/* tipc_mon_rcv - process monitor domain event message + */ +void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr, + struct tipc_mon_state *state, int bearer_id) +{ + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + struct tipc_mon_domain *arrv_dom = data; + struct tipc_mon_domain dom_bef; + struct tipc_mon_domain *dom; + struct tipc_peer *peer; + u16 new_member_cnt = ntohs(arrv_dom->member_cnt); + int new_dlen = dom_rec_len(arrv_dom, new_member_cnt); + u16 new_gen = ntohs(arrv_dom->gen); + u16 acked_gen = ntohs(arrv_dom->ack_gen); + bool probing = state->probing; + int i, applied_bef; + + state->probing = false; + if (!dlen) + return; + + /* Sanity check received domain record */ + if ((dlen < new_dlen) || ntohs(arrv_dom->len) != new_dlen) { + pr_warn_ratelimited("Received illegal domain record\n"); + return; + } + + /* Synch generation numbers with peer if link just came up */ + if (!state->synched) { + state->peer_gen = new_gen - 1; + state->acked_gen = acked_gen; + state->synched = true; + } + + if (more(acked_gen, state->acked_gen)) + state->acked_gen = acked_gen; + + /* Drop duplicate unless we are waiting for a probe response */ + if (!more(new_gen, state->peer_gen) && !probing) + return; + + write_lock_bh(&mon->lock); + peer = get_peer(mon, addr); + if (!peer || !peer->is_up) + goto exit; + + /* Peer is confirmed, stop any ongoing probing */ + peer->down_cnt = 0; + + /* Task is done for duplicate record */ + if (!more(new_gen, state->peer_gen)) + goto exit; + + state->peer_gen = new_gen; + + /* Cache current domain record for later use */ + dom_bef.member_cnt = 0; + dom = peer->domain; + if (dom) + memcpy(&dom_bef, dom, dom->len); + + /* Transform and store received domain record */ + if (!dom || (dom->len < new_dlen)) { + kfree(dom); + dom = kmalloc(new_dlen, GFP_ATOMIC); + peer->domain = dom; + if (!dom) + goto exit; + } + dom->len = new_dlen; + dom->gen = new_gen; + dom->member_cnt = new_member_cnt; + dom->up_map = be64_to_cpu(arrv_dom->up_map); + for (i = 0; i < new_member_cnt; i++) + dom->members[i] = ntohl(arrv_dom->members[i]); + + /* Update peers affected by this domain record */ + applied_bef = peer->applied; + mon_apply_domain(mon, peer); + mon_identify_lost_members(peer, &dom_bef, applied_bef); + mon_assign_roles(mon, peer_head(peer)); +exit: + write_unlock_bh(&mon->lock); +} + +void tipc_mon_prep(struct net *net, void *data, int *dlen, + struct tipc_mon_state *state, int bearer_id) +{ + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + struct tipc_mon_domain *dom = data; + u16 gen = mon->dom_gen; + u16 len; + + if (!tipc_mon_is_active(net, mon)) + return; + + /* Send only a dummy record with ack if peer has acked our last sent */ + if (likely(state->acked_gen == gen)) { + len = dom_rec_len(dom, 0); + *dlen = len; + dom->len = htons(len); + dom->gen = htons(gen); + dom->ack_gen = htons(state->peer_gen); + dom->member_cnt = 0; + return; + } + /* Send the full record */ + read_lock_bh(&mon->lock); + len = ntohs(mon->cache.len); + *dlen = len; + memcpy(data, &mon->cache, len); + read_unlock_bh(&mon->lock); + dom->ack_gen = htons(state->peer_gen); +} + +void tipc_mon_get_state(struct net *net, u32 addr, + struct tipc_mon_state *state, + int bearer_id) +{ + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + struct tipc_peer *peer; + + /* Used cached state if table has not changed */ + if (!state->probing && + (state->list_gen == mon->list_gen) && + (state->acked_gen == mon->dom_gen)) + return; + + read_lock_bh(&mon->lock); + peer = get_peer(mon, addr); + if (peer) { + state->probing = state->acked_gen != mon->dom_gen; + state->probing |= peer->down_cnt; + state->reset |= peer->down_cnt >= MAX_PEER_DOWN_EVENTS; + state->monitoring = peer->is_local; + state->monitoring |= peer->is_head; + state->list_gen = mon->list_gen; + } + read_unlock_bh(&mon->lock); +} + +static void mon_timeout(unsigned long m) +{ + struct tipc_monitor *mon = (void *)m; + struct tipc_peer *self; + int best_member_cnt = dom_size(mon->peer_cnt) - 1; + + write_lock_bh(&mon->lock); + self = mon->self; + if (self && (best_member_cnt != self->applied)) { + mon_update_local_domain(mon); + mon_assign_roles(mon, self); + } + write_unlock_bh(&mon->lock); + mod_timer(&mon->timer, jiffies + mon->timer_intv); +} + +int tipc_mon_create(struct net *net, int bearer_id) +{ + struct tipc_net *tn = tipc_net(net); + struct tipc_monitor *mon; + struct tipc_peer *self; + struct tipc_mon_domain *dom; + + if (tn->monitors[bearer_id]) + return 0; + + mon = kzalloc(sizeof(*mon), GFP_ATOMIC); + self = kzalloc(sizeof(*self), GFP_ATOMIC); + dom = kzalloc(sizeof(*dom), GFP_ATOMIC); + if (!mon || !self || !dom) { + kfree(mon); + kfree(self); + kfree(dom); + return -ENOMEM; + } + tn->monitors[bearer_id] = mon; + rwlock_init(&mon->lock); + mon->net = net; + mon->peer_cnt = 1; + mon->self = self; + self->domain = dom; + self->addr = tipc_own_addr(net); + self->is_up = true; + self->is_head = true; + INIT_LIST_HEAD(&self->list); + setup_timer(&mon->timer, mon_timeout, (unsigned long)mon); + mon->timer_intv = msecs_to_jiffies(MON_TIMEOUT + (tn->random & 0xffff)); + mod_timer(&mon->timer, jiffies + mon->timer_intv); + return 0; +} + +void tipc_mon_delete(struct net *net, int bearer_id) +{ + struct tipc_net *tn = tipc_net(net); + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + struct tipc_peer *self = get_self(net, bearer_id); + struct tipc_peer *peer, *tmp; + + write_lock_bh(&mon->lock); + tn->monitors[bearer_id] = NULL; + list_for_each_entry_safe(peer, tmp, &self->list, list) { + list_del(&peer->list); + hlist_del(&peer->hash); + kfree(peer->domain); + kfree(peer); + } + mon->self = NULL; + write_unlock_bh(&mon->lock); + del_timer_sync(&mon->timer); + kfree(self->domain); + kfree(self); + kfree(mon); +} diff --git a/net/tipc/monitor.h b/net/tipc/monitor.h new file mode 100644 index 000000000000..598459cbed5d --- /dev/null +++ b/net/tipc/monitor.h @@ -0,0 +1,73 @@ +/* + * net/tipc/monitor.h + * + * Copyright (c) 2015, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_MONITOR_H +#define _TIPC_MONITOR_H + +/* struct tipc_mon_state: link instance's cache of monitor list and domain state + * @list_gen: current generation of this node's monitor list + * @gen: current generation of this node's local domain + * @peer_gen: most recent domain generation received from peer + * @acked_gen: most recent generation of self's domain acked by peer + * @monitoring: this peer endpoint should continuously monitored + * @probing: peer endpoint should be temporarily probed for potential loss + * @synched: domain record's generation has been synched with peer after reset + */ +struct tipc_mon_state { + u16 list_gen; + u16 peer_gen; + u16 acked_gen; + bool monitoring :1; + bool probing :1; + bool reset :1; + bool synched :1; +}; + +int tipc_mon_create(struct net *net, int bearer_id); +void tipc_mon_delete(struct net *net, int bearer_id); + +void tipc_mon_peer_up(struct net *net, u32 addr, int bearer_id); +void tipc_mon_peer_down(struct net *net, u32 addr, int bearer_id); +void tipc_mon_prep(struct net *net, void *data, int *dlen, + struct tipc_mon_state *state, int bearer_id); +void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr, + struct tipc_mon_state *state, int bearer_id); +void tipc_mon_get_state(struct net *net, u32 addr, + struct tipc_mon_state *state, + int bearer_id); +void tipc_mon_remove_peer(struct net *net, u32 addr, int bearer_id); + +extern const int tipc_max_domain_size; +#endif diff --git a/net/tipc/node.c b/net/tipc/node.c index d6a490f991a4..a3fc0a3f4077 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -40,6 +40,7 @@ #include "name_distr.h" #include "socket.h" #include "bcast.h" +#include "monitor.h" #include "discover.h" #include "netlink.h" @@ -205,17 +206,6 @@ u16 tipc_node_get_capabilities(struct net *net, u32 addr) return caps; } -/* - * A trivial power-of-two bitmask technique is used for speed, since this - * operation is done for every incoming TIPC packet. The number of hash table - * entries has been chosen so that no hash chain exceeds 8 nodes and will - * usually be much smaller (typically only a single node). - */ -static unsigned int tipc_hashfn(u32 addr) -{ - return addr & (NODE_HTABLE_SIZE - 1); -} - static void tipc_node_kref_release(struct kref *kref) { struct tipc_node *n = container_of(kref, struct tipc_node, kref); @@ -279,6 +269,7 @@ static void tipc_node_write_unlock(struct tipc_node *n) u32 addr = 0; u32 flags = n->action_flags; u32 link_id = 0; + u32 bearer_id; struct list_head *publ_list; if (likely(!flags)) { @@ -288,6 +279,7 @@ static void tipc_node_write_unlock(struct tipc_node *n) addr = n->addr; link_id = n->link_id; + bearer_id = link_id & 0xffff; publ_list = &n->publ_list; n->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP | @@ -301,13 +293,16 @@ static void tipc_node_write_unlock(struct tipc_node *n) if (flags & TIPC_NOTIFY_NODE_UP) tipc_named_node_up(net, addr); - if (flags & TIPC_NOTIFY_LINK_UP) + if (flags & TIPC_NOTIFY_LINK_UP) { + tipc_mon_peer_up(net, addr, bearer_id); tipc_nametbl_publish(net, TIPC_LINK_STATE, addr, addr, TIPC_NODE_SCOPE, link_id, addr); - - if (flags & TIPC_NOTIFY_LINK_DOWN) + } + if (flags & TIPC_NOTIFY_LINK_DOWN) { + tipc_mon_peer_down(net, addr, bearer_id); tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr, link_id, addr); + } } struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities) @@ -691,6 +686,7 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) struct tipc_link *l = le->link; struct tipc_media_addr *maddr; struct sk_buff_head xmitq; + int old_bearer_id = bearer_id; if (!l) return; @@ -710,6 +706,8 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) tipc_link_fsm_evt(l, LINK_RESET_EVT); } tipc_node_write_unlock(n); + if (delete) + tipc_mon_remove_peer(n->net, n->addr, old_bearer_id); tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr); tipc_sk_rcv(n->net, &le->inputq); } -- cgit v1.2.3 From 1b5c5493e3e68181be344cb51bf9df192d05ffc2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Jun 2016 20:21:50 -0700 Subject: net_sched: add the ability to defer skb freeing qdisc are changed under RTNL protection and often while blocking BH and root qdisc spinlock. When lots of skbs need to be dropped, we free them under these locks causing TX/RX freezes, and more generally latency spikes. This commit adds rtnl_kfree_skbs(), used to queue skbs for deferred freeing. Actual freeing happens right after RTNL is released, with appropriate scheduling points. rtnl_qdisc_drop() can also be used in place of disc_drop() when RTNL is held. qdisc_reset_queue() and __qdisc_reset_queue() get the new behavior, so standard qdiscs like pfifo, pfifo_fast... have their ->reset() method automatically handled. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 22 ++++++++++++++++++++++ net/sched/sch_generic.c | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d69c4644f8f2..eb49ca24274a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -71,9 +71,31 @@ void rtnl_lock(void) } EXPORT_SYMBOL(rtnl_lock); +static struct sk_buff *defer_kfree_skb_list; +void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail) +{ + if (head && tail) { + tail->next = defer_kfree_skb_list; + defer_kfree_skb_list = head; + } +} +EXPORT_SYMBOL(rtnl_kfree_skbs); + void __rtnl_unlock(void) { + struct sk_buff *head = defer_kfree_skb_list; + + defer_kfree_skb_list = NULL; + mutex_unlock(&rtnl_mutex); + + while (head) { + struct sk_buff *next = head->next; + + kfree_skb(head); + cond_resched(); + head = next; + } } void rtnl_unlock(void) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 0c9cb516f2e3..773b632e1e33 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -493,7 +493,7 @@ static void pfifo_fast_reset(struct Qdisc *qdisc) struct pfifo_fast_priv *priv = qdisc_priv(qdisc); for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - __qdisc_reset_queue(qdisc, band2list(priv, prio)); + __qdisc_reset_queue(band2list(priv, prio)); priv->bitmap = 0; qdisc->qstats.backlog = 0; -- cgit v1.2.3 From f9aed311b682ca933ca78821cce3e4f1d69d4f56 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Jun 2016 20:21:51 -0700 Subject: net_sched: sch_choke: defer skb freeing choke_reset() and choke_change() can use rtnl_qdisc_drop() to defer expensive skb freeing after locks are released. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_choke.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 04e0b0583e00..789b69ee9e51 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -375,11 +375,11 @@ static void choke_reset(struct Qdisc *sch) q->head = (q->head + 1) & q->tab_mask; if (!skb) continue; - qdisc_qstats_backlog_dec(sch, skb); - --sch->q.qlen; - qdisc_drop(skb, sch); + rtnl_qdisc_drop(skb, sch); } + sch->q.qlen = 0; + sch->qstats.backlog = 0; memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *)); q->head = q->tail = 0; red_restart(&q->vars); @@ -455,7 +455,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); --sch->q.qlen; - qdisc_drop(skb, sch); + rtnl_qdisc_drop(skb, sch); } qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped); q->head = 0; -- cgit v1.2.3 From b3d7e2b29b226c986cbd4efcaf43ab3ff90e6fdb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Jun 2016 20:21:52 -0700 Subject: net_sched: sch_codel: defer skb freeing in codel_change() codel_change() can use rtnl_qdisc_drop() to defer expensive skb freeing after locks are released. codel_reset() already has support for deferred skb freeing because it uses qdisc_reset_queue() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_codel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index dddf3bb65a32..c5bc424e3b3c 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -174,7 +174,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt) dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); - qdisc_drop(skb, sch); + rtnl_qdisc_drop(skb, sch); } qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); -- cgit v1.2.3 From e14ffdfdd67a9eba36bd76af0df1f2ac363ae906 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Jun 2016 20:21:53 -0700 Subject: net_sched: sch_fq: defer skb freeing Both fq_change() and fq_reset() can use rtnl_kfree_skbs() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_fq.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index f49c81e91acd..6eb06674f778 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -514,17 +514,25 @@ out: return skb; } +static void fq_flow_purge(struct fq_flow *flow) +{ + rtnl_kfree_skbs(flow->head, flow->tail); + flow->head = NULL; + flow->qlen = 0; +} + static void fq_reset(struct Qdisc *sch) { struct fq_sched_data *q = qdisc_priv(sch); struct rb_root *root; - struct sk_buff *skb; struct rb_node *p; struct fq_flow *f; unsigned int idx; - while ((skb = fq_dequeue_head(sch, &q->internal)) != NULL) - kfree_skb(skb); + sch->q.qlen = 0; + sch->qstats.backlog = 0; + + fq_flow_purge(&q->internal); if (!q->fq_root) return; @@ -535,8 +543,7 @@ static void fq_reset(struct Qdisc *sch) f = container_of(p, struct fq_flow, fq_node); rb_erase(p, root); - while ((skb = fq_dequeue_head(sch, f)) != NULL) - kfree_skb(skb); + fq_flow_purge(f); kmem_cache_free(fq_flow_cachep, f); } @@ -737,7 +744,7 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt) if (!skb) break; drop_len += qdisc_pkt_len(skb); - kfree_skb(skb); + rtnl_kfree_skbs(skb, skb); drop_count++; } qdisc_tree_reduce_backlog(sch, drop_count, drop_len); -- cgit v1.2.3 From ece5d4c723b69a4aa0cd1f545ebbdc3a37c80dc6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Jun 2016 20:21:54 -0700 Subject: net_sched: fq_codel: defer skb freeing Both fq_codel_change() and fq_codel_reset() can use rtnl_kfree_skbs() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_fq_codel.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index a302e8ef5498..2dc0a849515a 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -336,6 +336,12 @@ begin: return skb; } +static void fq_codel_flow_purge(struct fq_codel_flow *flow) +{ + rtnl_kfree_skbs(flow->head, flow->tail); + flow->head = NULL; +} + static void fq_codel_reset(struct Qdisc *sch) { struct fq_codel_sched_data *q = qdisc_priv(sch); @@ -346,18 +352,13 @@ static void fq_codel_reset(struct Qdisc *sch) for (i = 0; i < q->flows_cnt; i++) { struct fq_codel_flow *flow = q->flows + i; - while (flow->head) { - struct sk_buff *skb = dequeue_head(flow); - - qdisc_qstats_backlog_dec(sch, skb); - kfree_skb(skb); - } - + fq_codel_flow_purge(flow); INIT_LIST_HEAD(&flow->flowchain); codel_vars_init(&flow->cvars); } memset(q->backlogs, 0, q->flows_cnt * sizeof(u32)); sch->q.qlen = 0; + sch->qstats.backlog = 0; q->memory_usage = 0; } @@ -433,7 +434,7 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) struct sk_buff *skb = fq_codel_dequeue(sch); q->cstats.drop_len += qdisc_pkt_len(skb); - kfree_skb(skb); + rtnl_kfree_skbs(skb, skb); q->cstats.drop_count++; } qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len); -- cgit v1.2.3 From e7e424cdc4b2fcd7507b71d3a931708d11d5a61e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Jun 2016 20:21:55 -0700 Subject: net_sched: sch_hhf: defer skb freeing Both hhf_reset() and hhf_change() can use rtnl_kfree_skbs() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_hhf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index c51791848a38..c44593b8e65a 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -464,7 +464,7 @@ static void hhf_reset(struct Qdisc *sch) struct sk_buff *skb; while ((skb = hhf_dequeue(sch)) != NULL) - kfree_skb(skb); + rtnl_kfree_skbs(skb, skb); } static void *hhf_zalloc(size_t sz) @@ -574,7 +574,7 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt) while (sch->q.qlen > sch->limit) { struct sk_buff *skb = hhf_dequeue(sch); - kfree_skb(skb); + rtnl_kfree_skbs(skb, skb); } qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, prev_backlog - sch->qstats.backlog); -- cgit v1.2.3 From a5a9f5346fb95c00d47e8b4648f6d5eada147cd4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Jun 2016 20:21:56 -0700 Subject: net_sched: sch_htb: defer skb freeing Both htb_reset() and htb_destroy() can use __qdisc_reset_queue() instead of __skb_queue_purge() to defer skb freeing of internal queues. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 07dcd2933f01..a454605ab5cb 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -957,7 +957,7 @@ static void htb_reset(struct Qdisc *sch) } } qdisc_watchdog_cancel(&q->watchdog); - __skb_queue_purge(&q->direct_queue); + __qdisc_reset_queue(&q->direct_queue); sch->q.qlen = 0; sch->qstats.backlog = 0; memset(q->hlevel, 0, sizeof(q->hlevel)); @@ -1231,7 +1231,7 @@ static void htb_destroy(struct Qdisc *sch) htb_destroy_class(sch, cl); } qdisc_class_hash_destroy(&q->clhash); - __skb_queue_purge(&q->direct_queue); + __qdisc_reset_queue(&q->direct_queue); } static int htb_delete(struct Qdisc *sch, unsigned long arg) -- cgit v1.2.3 From 2f08a9a16288b60df3ddfe97c965427ce0163297 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Jun 2016 20:21:57 -0700 Subject: net_sched: sch_netem: defer skb freeing rtnl_kfree_skbs() can be used in tfifo_reset() It would be nice if we could iterate through rb tree instead of removing one skb at a time, and build a single skb chain. But this is left for a future patch. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 876df13c745a..e271967439bf 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -368,9 +368,7 @@ static void tfifo_reset(struct Qdisc *sch) struct sk_buff *skb = netem_rb_to_skb(p); rb_erase(p, &q->t_root); - skb->next = NULL; - skb->prev = NULL; - kfree_skb(skb); + rtnl_kfree_skbs(skb, skb); } } -- cgit v1.2.3 From db4879d93c351cb978db1eb4c963f44d267d63a2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Jun 2016 20:21:58 -0700 Subject: net_sched: sch_pie: defer skb freeing pie_change() can use rtnl_qdisc_drop() to benefit from deferred freeing. pie_reset() is already using qdisc_reset_queue() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_pie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 71ae3b9629f9..912a46a5d02e 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -234,7 +234,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt) dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); - qdisc_drop(skb, sch); + rtnl_qdisc_drop(skb, sch); } qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); -- cgit v1.2.3 From fea024784f588a1c50e7718d6053697ebdcc033e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Jun 2016 20:21:59 -0700 Subject: net_sched: sch_fq: defer skb freeing sfq_reset() can use rtnl_kfree_skbs() instead of kfree_skb() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_sfq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index a2e0b855d1c8..57d118b41cad 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -520,7 +520,7 @@ sfq_reset(struct Qdisc *sch) struct sk_buff *skb; while ((skb = sfq_dequeue(sch)) != NULL) - kfree_skb(skb); + rtnl_kfree_skbs(skb, skb); } /* -- cgit v1.2.3 From 8626a0c83b0d471d859bcd908d016874df951fc3 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 15 Jun 2016 21:20:16 +0200 Subject: 6lowpan: add private neighbour data This patch will introduce a 6lowpan neighbour private data. Like the interface private data we handle private data for generic 6lowpan and for link-layer specific 6lowpan. The current first use case if to save the short address for a 802.15.4 6lowpan neighbour. Cc: David S. Miller Reviewed-by: Stefan Schmidt Acked-by: YOSHIFUJI Hideaki Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- net/ieee802154/6lowpan/core.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 4e2b30894224..8c004a0c8d64 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -81,11 +81,21 @@ static int lowpan_stop(struct net_device *dev) return 0; } +static int lowpan_neigh_construct(struct neighbour *n) +{ + struct lowpan_802154_neigh *neigh = lowpan_802154_neigh(neighbour_priv(n)); + + /* default no short_addr is available for a neighbour */ + neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); + return 0; +} + static const struct net_device_ops lowpan_netdev_ops = { .ndo_init = lowpan_dev_init, .ndo_start_xmit = lowpan_xmit, .ndo_open = lowpan_open, .ndo_stop = lowpan_stop, + .ndo_neigh_construct = lowpan_neigh_construct, }; static void lowpan_setup(struct net_device *ldev) @@ -150,6 +160,8 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev, wdev->needed_headroom; ldev->needed_tailroom = wdev->needed_tailroom; + ldev->neigh_priv_len = sizeof(struct lowpan_802154_neigh); + ret = lowpan_register_netdevice(ldev, LOWPAN_LLTYPE_IEEE802154); if (ret < 0) { dev_put(wdev); -- cgit v1.2.3 From 2ad3ed59198c5404c34515cfcfd9a2b3c54d964f Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 15 Jun 2016 21:20:17 +0200 Subject: 6lowpan: add 802.15.4 short addr slaac This patch adds the autoconfiguration if a valid 802.15.4 short address is available for 802.15.4 6LoWPAN interfaces. Cc: David S. Miller Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Acked-by: Hannes Frederic Sowa Reviewed-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- net/6lowpan/core.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ net/ipv6/addrconf.c | 5 +++-- 2 files changed, 49 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c index 7a240b3eaed1..801404ceea23 100644 --- a/net/6lowpan/core.c +++ b/net/6lowpan/core.c @@ -14,6 +14,7 @@ #include #include +#include #include "6lowpan_i.h" @@ -72,16 +73,61 @@ void lowpan_unregister_netdev(struct net_device *dev) } EXPORT_SYMBOL(lowpan_unregister_netdev); +static int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev) +{ + struct wpan_dev *wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr; + + /* Set short_addr autoconfiguration if short_addr is present only */ + if (!lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) + return -1; + + /* For either address format, all zero addresses MUST NOT be used */ + if (wpan_dev->pan_id == cpu_to_le16(0x0000) && + wpan_dev->short_addr == cpu_to_le16(0x0000)) + return -1; + + /* Alternatively, if no PAN ID is known, 16 zero bits may be used */ + if (wpan_dev->pan_id == cpu_to_le16(IEEE802154_PAN_ID_BROADCAST)) + memset(eui, 0, 2); + else + ieee802154_le16_to_be16(eui, &wpan_dev->pan_id); + + /* The "Universal/Local" (U/L) bit shall be set to zero */ + eui[0] &= ~2; + eui[2] = 0; + eui[3] = 0xFF; + eui[4] = 0xFE; + eui[5] = 0; + ieee802154_le16_to_be16(&eui[6], &wpan_dev->short_addr); + return 0; +} + static int lowpan_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct inet6_dev *idev; + struct in6_addr addr; int i; if (dev->type != ARPHRD_6LOWPAN) return NOTIFY_DONE; + idev = __in6_dev_get(dev); + if (!idev) + return NOTIFY_DONE; + switch (event) { + case NETDEV_UP: + case NETDEV_CHANGE: + /* (802.15.4 6LoWPAN short address slaac handling */ + if (lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154) && + addrconf_ifid_802154_6lowpan(addr.s6_addr + 8, dev) == 0) { + __ipv6_addr_set_half(&addr.s6_addr32[0], + htonl(0xFE800000), 0); + addrconf_add_linklocal(idev, &addr, 0); + } + break; case NETDEV_DOWN: for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b12553905e42..1ce4048d1b5e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2947,8 +2947,8 @@ static void init_loopback(struct net_device *dev) } } -static void addrconf_add_linklocal(struct inet6_dev *idev, - const struct in6_addr *addr, u32 flags) +void addrconf_add_linklocal(struct inet6_dev *idev, + const struct in6_addr *addr, u32 flags) { struct inet6_ifaddr *ifp; u32 addr_flags = flags | IFA_F_PERMANENT; @@ -2967,6 +2967,7 @@ static void addrconf_add_linklocal(struct inet6_dev *idev, in6_ifa_put(ifp); } } +EXPORT_SYMBOL_GPL(addrconf_add_linklocal); static bool ipv6_reserved_interfaceid(struct in6_addr address) { -- cgit v1.2.3 From 848484c93128eae5b0f879ad1c2d7204c10a8b6a Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 15 Jun 2016 21:20:18 +0200 Subject: 6lowpan: remove ipv6 module request Since we use exported function from ipv6 kernel module we don't need to request the module anymore to have ipv6 functionality. Acked-by: Hannes Frederic Sowa Reviewed-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- net/6lowpan/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c index 801404ceea23..1c7a42b48524 100644 --- a/net/6lowpan/core.c +++ b/net/6lowpan/core.c @@ -158,8 +158,6 @@ static int __init lowpan_module_init(void) return ret; } - request_module_nowait("ipv6"); - request_module_nowait("nhc_dest"); request_module_nowait("nhc_fragment"); request_module_nowait("nhc_hop"); -- cgit v1.2.3 From 8ec5da41502843947af0b09e795b19fc7a83edd8 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 15 Jun 2016 21:20:21 +0200 Subject: ndisc: add __ndisc_fill_addr_option function This patch adds __ndisc_fill_addr_option as low-level function for ndisc_fill_addr_option which doesn't depend on net_device parameter. Cc: David S. Miller Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Acked-by: YOSHIFUJI Hideaki Reviewed-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- net/ipv6/ndisc.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index c245895a3d41..a7b9468e684a 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -150,11 +150,10 @@ struct neigh_table nd_tbl = { }; EXPORT_SYMBOL_GPL(nd_tbl); -static void ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data) +static void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data, + int data_len, int pad) { - int pad = ndisc_addr_option_pad(skb->dev->type); - int data_len = skb->dev->addr_len; - int space = ndisc_opt_addr_space(skb->dev); + int space = __ndisc_opt_addr_space(data_len, pad); u8 *opt = skb_put(skb, space); opt[0] = type; @@ -172,6 +171,13 @@ static void ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data) memset(opt, 0, space); } +static inline void ndisc_fill_addr_option(struct sk_buff *skb, int type, + void *data) +{ + __ndisc_fill_addr_option(skb, type, data, skb->dev->addr_len, + ndisc_addr_option_pad(skb->dev->type)); +} + static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, struct nd_opt_hdr *end) { -- cgit v1.2.3 From 4f672235cb11c49d4be7ac7d505c65e3bd367322 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 15 Jun 2016 21:20:22 +0200 Subject: addrconf: put prefix address add in an own function This patch moves the functionality to add a RA PIO prefix generated address in an own function. This move prepares to add a hook for adding a second address for a second link-layer address. E.g. short address for 802.15.4 6LoWPAN. Cc: David S. Miller Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Reviewed-by: Stefan Schmidt Acked-by: YOSHIFUJI Hideaki Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 203 ++++++++++++++++++++++++++++------------------------ 1 file changed, 109 insertions(+), 94 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 1ce4048d1b5e..44c1cefdb299 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2333,12 +2333,110 @@ static bool is_addr_mode_generate_stable(struct inet6_dev *idev) idev->addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM; } +static int addrconf_prefix_rcv_add_addr(struct net *net, + struct net_device *dev, + const struct prefix_info *pinfo, + struct inet6_dev *in6_dev, + const struct in6_addr *addr, + int addr_type, u32 addr_flags, + bool sllao, bool tokenized, + __u32 valid_lft, u32 prefered_lft) +{ + struct inet6_ifaddr *ifp = ipv6_get_ifaddr(net, addr, dev, 1); + int create = 0, update_lft = 0; + + if (!ifp && valid_lft) { + int max_addresses = in6_dev->cnf.max_addresses; + +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + if (in6_dev->cnf.optimistic_dad && + !net->ipv6.devconf_all->forwarding && sllao) + addr_flags |= IFA_F_OPTIMISTIC; +#endif + + /* Do not allow to create too much of autoconfigured + * addresses; this would be too easy way to crash kernel. + */ + if (!max_addresses || + ipv6_count_addresses(in6_dev) < max_addresses) + ifp = ipv6_add_addr(in6_dev, addr, NULL, + pinfo->prefix_len, + addr_type&IPV6_ADDR_SCOPE_MASK, + addr_flags, valid_lft, + prefered_lft); + + if (IS_ERR_OR_NULL(ifp)) + return -1; + + update_lft = 0; + create = 1; + spin_lock_bh(&ifp->lock); + ifp->flags |= IFA_F_MANAGETEMPADDR; + ifp->cstamp = jiffies; + ifp->tokenized = tokenized; + spin_unlock_bh(&ifp->lock); + addrconf_dad_start(ifp); + } + + if (ifp) { + u32 flags; + unsigned long now; + u32 stored_lft; + + /* update lifetime (RFC2462 5.5.3 e) */ + spin_lock_bh(&ifp->lock); + now = jiffies; + if (ifp->valid_lft > (now - ifp->tstamp) / HZ) + stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ; + else + stored_lft = 0; + if (!update_lft && !create && stored_lft) { + const u32 minimum_lft = min_t(u32, + stored_lft, MIN_VALID_LIFETIME); + valid_lft = max(valid_lft, minimum_lft); + + /* RFC4862 Section 5.5.3e: + * "Note that the preferred lifetime of the + * corresponding address is always reset to + * the Preferred Lifetime in the received + * Prefix Information option, regardless of + * whether the valid lifetime is also reset or + * ignored." + * + * So we should always update prefered_lft here. + */ + update_lft = 1; + } + + if (update_lft) { + ifp->valid_lft = valid_lft; + ifp->prefered_lft = prefered_lft; + ifp->tstamp = now; + flags = ifp->flags; + ifp->flags &= ~IFA_F_DEPRECATED; + spin_unlock_bh(&ifp->lock); + + if (!(flags&IFA_F_TENTATIVE)) + ipv6_ifa_notify(0, ifp); + } else + spin_unlock_bh(&ifp->lock); + + manage_tempaddrs(in6_dev, ifp, valid_lft, prefered_lft, + create, now); + + in6_ifa_put(ifp); + addrconf_verify(); + } + + return 0; +} + void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) { struct prefix_info *pinfo; __u32 valid_lft; __u32 prefered_lft; - int addr_type; + int addr_type, err; u32 addr_flags = 0; struct inet6_dev *in6_dev; struct net *net = dev_net(dev); @@ -2432,9 +2530,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) /* Try to figure out our local address for this prefix */ if (pinfo->autoconf && in6_dev->cnf.autoconf) { - struct inet6_ifaddr *ifp; struct in6_addr addr; - int create = 0, update_lft = 0; bool tokenized = false; if (pinfo->prefix_len == 64) { @@ -2453,106 +2549,25 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) goto ok; } else if (ipv6_generate_eui64(addr.s6_addr + 8, dev) && ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) { - in6_dev_put(in6_dev); - return; + goto put; } goto ok; } net_dbg_ratelimited("IPv6 addrconf: prefix with wrong length %d\n", pinfo->prefix_len); - in6_dev_put(in6_dev); - return; + goto put; ok: - - ifp = ipv6_get_ifaddr(net, &addr, dev, 1); - - if (!ifp && valid_lft) { - int max_addresses = in6_dev->cnf.max_addresses; - -#ifdef CONFIG_IPV6_OPTIMISTIC_DAD - if (in6_dev->cnf.optimistic_dad && - !net->ipv6.devconf_all->forwarding && sllao) - addr_flags |= IFA_F_OPTIMISTIC; -#endif - - /* Do not allow to create too much of autoconfigured - * addresses; this would be too easy way to crash kernel. - */ - if (!max_addresses || - ipv6_count_addresses(in6_dev) < max_addresses) - ifp = ipv6_add_addr(in6_dev, &addr, NULL, - pinfo->prefix_len, - addr_type&IPV6_ADDR_SCOPE_MASK, - addr_flags, valid_lft, - prefered_lft); - - if (IS_ERR_OR_NULL(ifp)) { - in6_dev_put(in6_dev); - return; - } - - update_lft = 0; - create = 1; - spin_lock_bh(&ifp->lock); - ifp->flags |= IFA_F_MANAGETEMPADDR; - ifp->cstamp = jiffies; - ifp->tokenized = tokenized; - spin_unlock_bh(&ifp->lock); - addrconf_dad_start(ifp); - } - - if (ifp) { - u32 flags; - unsigned long now; - u32 stored_lft; - - /* update lifetime (RFC2462 5.5.3 e) */ - spin_lock_bh(&ifp->lock); - now = jiffies; - if (ifp->valid_lft > (now - ifp->tstamp) / HZ) - stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ; - else - stored_lft = 0; - if (!update_lft && !create && stored_lft) { - const u32 minimum_lft = min_t(u32, - stored_lft, MIN_VALID_LIFETIME); - valid_lft = max(valid_lft, minimum_lft); - - /* RFC4862 Section 5.5.3e: - * "Note that the preferred lifetime of the - * corresponding address is always reset to - * the Preferred Lifetime in the received - * Prefix Information option, regardless of - * whether the valid lifetime is also reset or - * ignored." - * - * So we should always update prefered_lft here. - */ - update_lft = 1; - } - - if (update_lft) { - ifp->valid_lft = valid_lft; - ifp->prefered_lft = prefered_lft; - ifp->tstamp = now; - flags = ifp->flags; - ifp->flags &= ~IFA_F_DEPRECATED; - spin_unlock_bh(&ifp->lock); - - if (!(flags&IFA_F_TENTATIVE)) - ipv6_ifa_notify(0, ifp); - } else - spin_unlock_bh(&ifp->lock); - - manage_tempaddrs(in6_dev, ifp, valid_lft, prefered_lft, - create, now); - - in6_ifa_put(ifp); - addrconf_verify(); - } + err = addrconf_prefix_rcv_add_addr(net, dev, pinfo, in6_dev, + &addr, addr_type, + addr_flags, sllao, + tokenized, valid_lft, + prefered_lft); + if (err) + goto put; } inet6_prefix_notify(RTM_NEWPREFIX, in6_dev, pinfo); +put: in6_dev_put(in6_dev); } -- cgit v1.2.3 From f997c55c1dc8841b3ee4df0493d0ac7966d42165 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 15 Jun 2016 21:20:23 +0200 Subject: ipv6: introduce neighbour discovery ops This patch introduces neighbour discovery ops callback structure. The idea is to separate the handling for 6LoWPAN into the 6lowpan module. These callback offers 6lowpan different handling, such as 802.15.4 short address handling or RFC6775 (Neighbor Discovery Optimization for IPv6 over 6LoWPANs). Cc: David S. Miller Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Acked-by: YOSHIFUJI Hideaki Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 13 ++++++- net/ipv6/ndisc.c | 101 +++++++++++++++++++++++++++++++++++----------------- net/ipv6/route.c | 8 ++--- 3 files changed, 85 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 44c1cefdb299..b6e9bdc610f2 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2531,7 +2531,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) if (pinfo->autoconf && in6_dev->cnf.autoconf) { struct in6_addr addr; - bool tokenized = false; + bool tokenized = false, dev_addr_generated = false; if (pinfo->prefix_len == 64) { memcpy(&addr, &pinfo->prefix, 8); @@ -2550,6 +2550,8 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) } else if (ipv6_generate_eui64(addr.s6_addr + 8, dev) && ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) { goto put; + } else { + dev_addr_generated = true; } goto ok; } @@ -2565,6 +2567,15 @@ ok: prefered_lft); if (err) goto put; + + /* Ignore error case here because previous prefix add addr was + * successful which will be notified. + */ + ndisc_ops_prefix_rcv_add_addr(net, dev, pinfo, in6_dev, &addr, + addr_type, addr_flags, sllao, + tokenized, valid_lft, + prefered_lft, + dev_addr_generated); } inet6_prefix_notify(RTM_NEWPREFIX, in6_dev, pinfo); put: diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index a7b9468e684a..2f4afd17954b 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -172,10 +172,19 @@ static void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data, } static inline void ndisc_fill_addr_option(struct sk_buff *skb, int type, - void *data) + void *data, u8 icmp6_type) { __ndisc_fill_addr_option(skb, type, data, skb->dev->addr_len, ndisc_addr_option_pad(skb->dev->type)); + ndisc_ops_fill_addr_option(skb->dev, skb, icmp6_type); +} + +static inline void ndisc_fill_redirect_addr_option(struct sk_buff *skb, + void *ha, + const u8 *ops_data) +{ + ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, ha, NDISC_REDIRECT); + ndisc_ops_fill_redirect_addr_option(skb->dev, skb, ops_data); } static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, @@ -191,24 +200,28 @@ static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, return cur <= end && cur->nd_opt_type == type ? cur : NULL; } -static inline int ndisc_is_useropt(struct nd_opt_hdr *opt) +static inline int ndisc_is_useropt(const struct net_device *dev, + struct nd_opt_hdr *opt) { return opt->nd_opt_type == ND_OPT_RDNSS || - opt->nd_opt_type == ND_OPT_DNSSL; + opt->nd_opt_type == ND_OPT_DNSSL || + ndisc_ops_is_useropt(dev, opt->nd_opt_type); } -static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur, +static struct nd_opt_hdr *ndisc_next_useropt(const struct net_device *dev, + struct nd_opt_hdr *cur, struct nd_opt_hdr *end) { if (!cur || !end || cur >= end) return NULL; do { cur = ((void *)cur) + (cur->nd_opt_len << 3); - } while (cur < end && !ndisc_is_useropt(cur)); - return cur <= end && ndisc_is_useropt(cur) ? cur : NULL; + } while (cur < end && !ndisc_is_useropt(dev, cur)); + return cur <= end && ndisc_is_useropt(dev, cur) ? cur : NULL; } -struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, +struct ndisc_options *ndisc_parse_options(const struct net_device *dev, + u8 *opt, int opt_len, struct ndisc_options *ndopts) { struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt; @@ -223,6 +236,8 @@ struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, l = nd_opt->nd_opt_len << 3; if (opt_len < l || l == 0) return NULL; + if (ndisc_ops_parse_options(dev, nd_opt, ndopts)) + goto next_opt; switch (nd_opt->nd_opt_type) { case ND_OPT_SOURCE_LL_ADDR: case ND_OPT_TARGET_LL_ADDR: @@ -249,7 +264,7 @@ struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, break; #endif default: - if (ndisc_is_useropt(nd_opt)) { + if (ndisc_is_useropt(dev, nd_opt)) { ndopts->nd_useropts_end = nd_opt; if (!ndopts->nd_useropts) ndopts->nd_useropts = nd_opt; @@ -266,6 +281,7 @@ struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, nd_opt->nd_opt_len); } } +next_opt: opt_len -= l; nd_opt = ((void *)nd_opt) + l; } @@ -515,7 +531,8 @@ void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, if (!dev->addr_len) inc_opt = 0; if (inc_opt) - optlen += ndisc_opt_addr_space(dev); + optlen += ndisc_opt_addr_space(dev, + NDISC_NEIGHBOUR_ADVERTISEMENT); skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) @@ -534,8 +551,8 @@ void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, if (inc_opt) ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, - dev->dev_addr); - + dev->dev_addr, + NDISC_NEIGHBOUR_ADVERTISEMENT); ndisc_send_skb(skb, daddr, src_addr); } @@ -580,7 +597,8 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, if (ipv6_addr_any(saddr)) inc_opt = false; if (inc_opt) - optlen += ndisc_opt_addr_space(dev); + optlen += ndisc_opt_addr_space(dev, + NDISC_NEIGHBOUR_SOLICITATION); skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) @@ -596,7 +614,8 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, if (inc_opt) ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, - dev->dev_addr); + dev->dev_addr, + NDISC_NEIGHBOUR_SOLICITATION); ndisc_send_skb(skb, daddr, saddr); } @@ -632,7 +651,7 @@ void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, } #endif if (send_sllao) - optlen += ndisc_opt_addr_space(dev); + optlen += ndisc_opt_addr_space(dev, NDISC_ROUTER_SOLICITATION); skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) @@ -647,7 +666,8 @@ void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, if (send_sllao) ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, - dev->dev_addr); + dev->dev_addr, + NDISC_ROUTER_SOLICITATION); ndisc_send_skb(skb, daddr, saddr); } @@ -708,6 +728,15 @@ static int pndisc_is_router(const void *pkey, return ret; } +void ndisc_update(const struct net_device *dev, struct neighbour *neigh, + const u8 *lladdr, u8 new, u32 flags, u8 icmp6_type, + struct ndisc_options *ndopts) +{ + neigh_update(neigh, lladdr, new, flags); + /* report ndisc ops about neighbour update */ + ndisc_ops_update(dev, neigh, flags, icmp6_type, ndopts); +} + static void ndisc_recv_ns(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); @@ -744,7 +773,7 @@ static void ndisc_recv_ns(struct sk_buff *skb) return; } - if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { + if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) { ND_PRINTK(2, warn, "NS: invalid ND options\n"); return; } @@ -862,9 +891,10 @@ have_ifp: neigh = __neigh_lookup(&nd_tbl, saddr, dev, !inc || lladdr || !dev->addr_len); if (neigh) - neigh_update(neigh, lladdr, NUD_STALE, + ndisc_update(dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| - NEIGH_UPDATE_F_OVERRIDE); + NEIGH_UPDATE_F_OVERRIDE, + NDISC_NEIGHBOUR_SOLICITATION, &ndopts); if (neigh || !dev->header_ops) { ndisc_send_na(dev, saddr, &msg->target, !!is_router, true, (ifp != NULL && inc), inc); @@ -917,7 +947,7 @@ static void ndisc_recv_na(struct sk_buff *skb) idev->cnf.drop_unsolicited_na) return; - if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { + if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) { ND_PRINTK(2, warn, "NS: invalid ND option\n"); return; } @@ -973,12 +1003,13 @@ static void ndisc_recv_na(struct sk_buff *skb) goto out; } - neigh_update(neigh, lladdr, + ndisc_update(dev, neigh, lladdr, msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| (msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)| NEIGH_UPDATE_F_OVERRIDE_ISROUTER| - (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0)); + (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0), + NDISC_NEIGHBOUR_ADVERTISEMENT, &ndopts); if ((old_flags & ~neigh->flags) & NTF_ROUTER) { /* @@ -1023,7 +1054,7 @@ static void ndisc_recv_rs(struct sk_buff *skb) goto out; /* Parse ND options */ - if (!ndisc_parse_options(rs_msg->opt, ndoptlen, &ndopts)) { + if (!ndisc_parse_options(skb->dev, rs_msg->opt, ndoptlen, &ndopts)) { ND_PRINTK(2, notice, "NS: invalid ND option, ignored\n"); goto out; } @@ -1037,10 +1068,11 @@ static void ndisc_recv_rs(struct sk_buff *skb) neigh = __neigh_lookup(&nd_tbl, saddr, skb->dev, 1); if (neigh) { - neigh_update(neigh, lladdr, NUD_STALE, + ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE| - NEIGH_UPDATE_F_OVERRIDE_ISROUTER); + NEIGH_UPDATE_F_OVERRIDE_ISROUTER, + NDISC_ROUTER_SOLICITATION, &ndopts); neigh_release(neigh); } out: @@ -1141,7 +1173,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } - if (!ndisc_parse_options(opt, optlen, &ndopts)) { + if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts)) { ND_PRINTK(2, warn, "RA: invalid ND options\n"); return; } @@ -1335,11 +1367,12 @@ skip_linkparms: goto out; } } - neigh_update(neigh, lladdr, NUD_STALE, + ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE_ISROUTER| - NEIGH_UPDATE_F_ISROUTER); + NEIGH_UPDATE_F_ISROUTER, + NDISC_ROUTER_ADVERTISEMENT, &ndopts); } if (!ipv6_accept_ra(in6_dev)) { @@ -1427,7 +1460,8 @@ skip_routeinfo: struct nd_opt_hdr *p; for (p = ndopts.nd_useropts; p; - p = ndisc_next_useropt(p, ndopts.nd_useropts_end)) { + p = ndisc_next_useropt(skb->dev, p, + ndopts.nd_useropts_end)) { ndisc_ra_useropt(skb, p); } } @@ -1465,7 +1499,7 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) return; } - if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) + if (!ndisc_parse_options(skb->dev, msg->opt, ndoptlen, &ndopts)) return; if (!ndopts.nd_opts_rh) { @@ -1510,7 +1544,8 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) struct dst_entry *dst; struct flowi6 fl6; int rd_len; - u8 ha_buf[MAX_ADDR_LEN], *ha = NULL; + u8 ha_buf[MAX_ADDR_LEN], *ha = NULL, + ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL; int oif = l3mdev_fib_oif(dev); bool ret; @@ -1569,7 +1604,9 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) memcpy(ha_buf, neigh->ha, dev->addr_len); read_unlock_bh(&neigh->lock); ha = ha_buf; - optlen += ndisc_opt_addr_space(dev); + optlen += ndisc_redirect_opt_addr_space(dev, neigh, + ops_data_buf, + &ops_data); } else read_unlock_bh(&neigh->lock); @@ -1600,7 +1637,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) */ if (ha) - ndisc_fill_addr_option(buff, ND_OPT_TARGET_LL_ADDR, ha); + ndisc_fill_redirect_addr_option(buff, ha, ops_data); /* * build redirect option and copy skb over to the new packet. diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d51a1a48b839..9e1516785dac 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2201,7 +2201,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu * first-hop router for the specified ICMP Destination Address. */ - if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) { + if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); return; } @@ -2236,12 +2236,12 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu * We have finally decided to accept it. */ - neigh_update(neigh, lladdr, NUD_STALE, + ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE| (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| - NEIGH_UPDATE_F_ISROUTER)) - ); + NEIGH_UPDATE_F_ISROUTER)), + NDISC_REDIRECT, &ndopts); nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL); if (!nrt) -- cgit v1.2.3 From cc84b3c6b48ae81748c5e25d3558872385196162 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 15 Jun 2016 21:20:24 +0200 Subject: ipv6: export several functions This patch exports some neighbour discovery functions which can be used by 6lowpan neighbour discovery ops functionality then. Cc: David S. Miller Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Acked-by: YOSHIFUJI Hideaki Reviewed-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 15 +++++++-------- net/ipv6/ndisc.c | 14 +++----------- 2 files changed, 10 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b6e9bdc610f2..6c8fc3f96b11 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2333,14 +2333,12 @@ static bool is_addr_mode_generate_stable(struct inet6_dev *idev) idev->addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM; } -static int addrconf_prefix_rcv_add_addr(struct net *net, - struct net_device *dev, - const struct prefix_info *pinfo, - struct inet6_dev *in6_dev, - const struct in6_addr *addr, - int addr_type, u32 addr_flags, - bool sllao, bool tokenized, - __u32 valid_lft, u32 prefered_lft) +int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, + const struct prefix_info *pinfo, + struct inet6_dev *in6_dev, + const struct in6_addr *addr, int addr_type, + u32 addr_flags, bool sllao, bool tokenized, + __u32 valid_lft, u32 prefered_lft) { struct inet6_ifaddr *ifp = ipv6_get_ifaddr(net, addr, dev, 1); int create = 0, update_lft = 0; @@ -2430,6 +2428,7 @@ static int addrconf_prefix_rcv_add_addr(struct net *net, return 0; } +EXPORT_SYMBOL_GPL(addrconf_prefix_rcv_add_addr); void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) { diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 2f4afd17954b..fe65cdc28a45 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -73,15 +73,6 @@ #include #include -/* Set to 3 to get tracing... */ -#define ND_DEBUG 1 - -#define ND_PRINTK(val, level, fmt, ...) \ -do { \ - if (val <= ND_DEBUG) \ - net_##level##_ratelimited(fmt, ##__VA_ARGS__); \ -} while (0) - static u32 ndisc_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); @@ -150,8 +141,8 @@ struct neigh_table nd_tbl = { }; EXPORT_SYMBOL_GPL(nd_tbl); -static void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data, - int data_len, int pad) +void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data, + int data_len, int pad) { int space = __ndisc_opt_addr_space(data_len, pad); u8 *opt = skb_put(skb, space); @@ -170,6 +161,7 @@ static void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data, if (space > 0) memset(opt, 0, space); } +EXPORT_SYMBOL_GPL(__ndisc_fill_addr_option); static inline void ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data, u8 icmp6_type) -- cgit v1.2.3 From bbe5f5cefe2818eda0392c178de141ffc5734d90 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 15 Jun 2016 21:20:25 +0200 Subject: 6lowpan: introduce 6lowpan-nd This patch introduce different 6lowpan handling for receive and transmit NS/NA messages for the ipv6 neighbour discovery. The first use-case is for supporting 802.15.4 short addresses inside the option fields and handling for RFC6775 6CO option field as userspace option. Cc: David S. Miller Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Reviewed-by: Stefan Schmidt Acked-by: YOSHIFUJI Hideaki Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- net/6lowpan/6lowpan_i.h | 4 + net/6lowpan/Makefile | 2 +- net/6lowpan/core.c | 4 +- net/6lowpan/ndisc.c | 234 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 242 insertions(+), 2 deletions(-) create mode 100644 net/6lowpan/ndisc.c (limited to 'net') diff --git a/net/6lowpan/6lowpan_i.h b/net/6lowpan/6lowpan_i.h index 97ecc27aeca6..a67caee11929 100644 --- a/net/6lowpan/6lowpan_i.h +++ b/net/6lowpan/6lowpan_i.h @@ -12,6 +12,10 @@ static inline bool lowpan_is_ll(const struct net_device *dev, return lowpan_dev(dev)->lltype == lltype; } +extern const struct ndisc_ops lowpan_ndisc_ops; + +int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev); + #ifdef CONFIG_6LOWPAN_DEBUGFS int lowpan_dev_debugfs_init(struct net_device *dev); void lowpan_dev_debugfs_exit(struct net_device *dev); diff --git a/net/6lowpan/Makefile b/net/6lowpan/Makefile index e44f3bf2dd42..12d131ab2324 100644 --- a/net/6lowpan/Makefile +++ b/net/6lowpan/Makefile @@ -1,6 +1,6 @@ obj-$(CONFIG_6LOWPAN) += 6lowpan.o -6lowpan-y := core.o iphc.o nhc.o +6lowpan-y := core.o iphc.o nhc.o ndisc.o 6lowpan-$(CONFIG_6LOWPAN_DEBUGFS) += debugfs.o #rfc6282 nhcs diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c index 1c7a42b48524..5945f7e19c67 100644 --- a/net/6lowpan/core.c +++ b/net/6lowpan/core.c @@ -34,6 +34,8 @@ int lowpan_register_netdevice(struct net_device *dev, for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) lowpan_dev(dev)->ctx.table[i].id = i; + dev->ndisc_ops = &lowpan_ndisc_ops; + ret = register_netdevice(dev); if (ret < 0) return ret; @@ -73,7 +75,7 @@ void lowpan_unregister_netdev(struct net_device *dev) } EXPORT_SYMBOL(lowpan_unregister_netdev); -static int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev) +int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev) { struct wpan_dev *wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr; diff --git a/net/6lowpan/ndisc.c b/net/6lowpan/ndisc.c new file mode 100644 index 000000000000..ae1d4199aa4c --- /dev/null +++ b/net/6lowpan/ndisc.c @@ -0,0 +1,234 @@ +/* This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: + * (C) 2016 Pengutronix, Alexander Aring + */ + +#include +#include +#include + +#include "6lowpan_i.h" + +static int lowpan_ndisc_is_useropt(u8 nd_opt_type) +{ + return nd_opt_type == ND_OPT_6CO; +} + +#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) +#define NDISC_802154_SHORT_ADDR_LENGTH 1 +static int lowpan_ndisc_parse_802154_options(const struct net_device *dev, + struct nd_opt_hdr *nd_opt, + struct ndisc_options *ndopts) +{ + switch (nd_opt->nd_opt_len) { + case NDISC_802154_SHORT_ADDR_LENGTH: + if (ndopts->nd_802154_opt_array[nd_opt->nd_opt_type]) + ND_PRINTK(2, warn, + "%s: duplicated short addr ND6 option found: type=%d\n", + __func__, nd_opt->nd_opt_type); + else + ndopts->nd_802154_opt_array[nd_opt->nd_opt_type] = nd_opt; + return 1; + default: + /* all others will be handled by ndisc IPv6 option parsing */ + return 0; + } +} + +static int lowpan_ndisc_parse_options(const struct net_device *dev, + struct nd_opt_hdr *nd_opt, + struct ndisc_options *ndopts) +{ + switch (nd_opt->nd_opt_type) { + case ND_OPT_SOURCE_LL_ADDR: + case ND_OPT_TARGET_LL_ADDR: + return lowpan_ndisc_parse_802154_options(dev, nd_opt, ndopts); + default: + return 0; + } +} + +static void lowpan_ndisc_802154_update(struct neighbour *n, u32 flags, + u8 icmp6_type, + const struct ndisc_options *ndopts) +{ + struct lowpan_802154_neigh *neigh = lowpan_802154_neigh(neighbour_priv(n)); + u8 *lladdr_short = NULL; + + switch (icmp6_type) { + case NDISC_ROUTER_SOLICITATION: + case NDISC_ROUTER_ADVERTISEMENT: + case NDISC_NEIGHBOUR_SOLICITATION: + if (ndopts->nd_802154_opts_src_lladdr) { + lladdr_short = __ndisc_opt_addr_data(ndopts->nd_802154_opts_src_lladdr, + IEEE802154_SHORT_ADDR_LEN, 0); + if (!lladdr_short) { + ND_PRINTK(2, warn, + "NA: invalid short link-layer address length\n"); + return; + } + } + break; + case NDISC_REDIRECT: + case NDISC_NEIGHBOUR_ADVERTISEMENT: + if (ndopts->nd_802154_opts_tgt_lladdr) { + lladdr_short = __ndisc_opt_addr_data(ndopts->nd_802154_opts_tgt_lladdr, + IEEE802154_SHORT_ADDR_LEN, 0); + if (!lladdr_short) { + ND_PRINTK(2, warn, + "NA: invalid short link-layer address length\n"); + return; + } + } + break; + default: + break; + } + + write_lock_bh(&n->lock); + if (lladdr_short) + ieee802154_be16_to_le16(&neigh->short_addr, lladdr_short); + else + neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); + write_unlock_bh(&n->lock); +} + +static void lowpan_ndisc_update(const struct net_device *dev, + struct neighbour *n, u32 flags, u8 icmp6_type, + const struct ndisc_options *ndopts) +{ + if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154)) + return; + + /* react on overrides only. TODO check if this is really right. */ + if (flags & NEIGH_UPDATE_F_OVERRIDE) + lowpan_ndisc_802154_update(n, flags, icmp6_type, ndopts); +} + +static int lowpan_ndisc_opt_addr_space(const struct net_device *dev, + u8 icmp6_type, struct neighbour *neigh, + u8 *ha_buf, u8 **ha) +{ + struct lowpan_802154_neigh *n; + struct wpan_dev *wpan_dev; + int addr_space = 0; + + if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154)) + return 0; + + switch (icmp6_type) { + case NDISC_REDIRECT: + n = lowpan_802154_neigh(neighbour_priv(neigh)); + + read_lock_bh(&neigh->lock); + if (lowpan_802154_is_valid_src_short_addr(n->short_addr)) { + memcpy(ha_buf, &n->short_addr, + IEEE802154_SHORT_ADDR_LEN); + read_unlock_bh(&neigh->lock); + addr_space += __ndisc_opt_addr_space(IEEE802154_SHORT_ADDR_LEN, 0); + *ha = ha_buf; + } + read_unlock_bh(&neigh->lock); + break; + case NDISC_NEIGHBOUR_ADVERTISEMENT: + case NDISC_NEIGHBOUR_SOLICITATION: + case NDISC_ROUTER_SOLICITATION: + wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr; + + if (lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) + addr_space = __ndisc_opt_addr_space(IEEE802154_SHORT_ADDR_LEN, 0); + break; + default: + break; + } + + return addr_space; +} + +static void lowpan_ndisc_fill_addr_option(const struct net_device *dev, + struct sk_buff *skb, u8 icmp6_type, + const u8 *ha) +{ + struct wpan_dev *wpan_dev; + __be16 short_addr; + u8 opt_type; + + if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154)) + return; + + switch (icmp6_type) { + case NDISC_REDIRECT: + if (ha) { + ieee802154_le16_to_be16(&short_addr, ha); + __ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, + &short_addr, + IEEE802154_SHORT_ADDR_LEN, 0); + } + return; + case NDISC_NEIGHBOUR_ADVERTISEMENT: + opt_type = ND_OPT_TARGET_LL_ADDR; + break; + case NDISC_ROUTER_SOLICITATION: + case NDISC_NEIGHBOUR_SOLICITATION: + opt_type = ND_OPT_SOURCE_LL_ADDR; + break; + default: + return; + } + + wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr; + + if (lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) { + ieee802154_le16_to_be16(&short_addr, + &wpan_dev->short_addr); + __ndisc_fill_addr_option(skb, opt_type, &short_addr, + IEEE802154_SHORT_ADDR_LEN, 0); + } +} + +static void lowpan_ndisc_prefix_rcv_add_addr(struct net *net, + struct net_device *dev, + const struct prefix_info *pinfo, + struct inet6_dev *in6_dev, + struct in6_addr *addr, + int addr_type, u32 addr_flags, + bool sllao, bool tokenized, + __u32 valid_lft, + u32 prefered_lft, + bool dev_addr_generated) +{ + int err; + + /* generates short based address for RA PIO's */ + if (lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154) && dev_addr_generated && + !addrconf_ifid_802154_6lowpan(addr->s6_addr + 8, dev)) { + err = addrconf_prefix_rcv_add_addr(net, dev, pinfo, in6_dev, + addr, addr_type, addr_flags, + sllao, tokenized, valid_lft, + prefered_lft); + if (err) + ND_PRINTK(2, warn, + "RA: could not add a short address based address for prefix: %pI6c\n", + &pinfo->prefix); + } +} +#endif + +const struct ndisc_ops lowpan_ndisc_ops = { + .is_useropt = lowpan_ndisc_is_useropt, +#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) + .parse_options = lowpan_ndisc_parse_options, + .update = lowpan_ndisc_update, + .opt_addr_space = lowpan_ndisc_opt_addr_space, + .fill_addr_option = lowpan_ndisc_fill_addr_option, + .prefix_rcv_add_addr = lowpan_ndisc_prefix_rcv_add_addr, +#endif +}; -- cgit v1.2.3 From cfce94653dad2d0661e1926c028ce63052eb20cd Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 15 Jun 2016 21:20:26 +0200 Subject: 6lowpan: add support for getting short address In case of sending RA messages we need some way to get the short address from an 802.15.4 6LoWPAN interface. This patch will add a temporary debugfs entry for experimental userspace api. Reviewed-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- net/6lowpan/debugfs.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'net') diff --git a/net/6lowpan/debugfs.c b/net/6lowpan/debugfs.c index acbaa3db493b..24915e0bb9ea 100644 --- a/net/6lowpan/debugfs.c +++ b/net/6lowpan/debugfs.c @@ -245,6 +245,41 @@ static const struct file_operations lowpan_context_fops = { .release = single_release, }; +static int lowpan_short_addr_get(void *data, u64 *val) +{ + struct wpan_dev *wdev = data; + + rtnl_lock(); + *val = le16_to_cpu(wdev->short_addr); + rtnl_unlock(); + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(lowpan_short_addr_fops, lowpan_short_addr_get, + NULL, "0x%04llx\n"); + +static int lowpan_dev_debugfs_802154_init(const struct net_device *dev, + struct lowpan_dev *ldev) +{ + struct dentry *dentry, *root; + + if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154)) + return 0; + + root = debugfs_create_dir("ieee802154", ldev->iface_debugfs); + if (!root) + return -EINVAL; + + dentry = debugfs_create_file("short_addr", 0444, root, + lowpan_802154_dev(dev)->wdev->ieee802154_ptr, + &lowpan_short_addr_fops); + if (!dentry) + return -EINVAL; + + return 0; +} + int lowpan_dev_debugfs_init(struct net_device *dev) { struct lowpan_dev *ldev = lowpan_dev(dev); @@ -272,6 +307,10 @@ int lowpan_dev_debugfs_init(struct net_device *dev) goto remove_root; } + ret = lowpan_dev_debugfs_802154_init(dev, ldev); + if (ret < 0) + goto remove_root; + return 0; remove_root: -- cgit v1.2.3 From eab560e58208730ec47e1e0461b8db1049d5d176 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 15 Jun 2016 21:20:27 +0200 Subject: 6lowpan: add support for 802.15.4 short addr handling This patch adds necessary handling for use the short address for 802.15.4 6lowpan. It contains support for IPHC address compression and new matching algorithmn to decide which link layer address will be used for 802.15.4 frame. Reviewed-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- net/6lowpan/iphc.c | 167 ++++++++++++++++++++++++++++++++++++-------- net/ieee802154/6lowpan/tx.c | 113 +++++++++++++++--------------- 2 files changed, 195 insertions(+), 85 deletions(-) (limited to 'net') diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c index 8501dd532fe1..79f1fa22509a 100644 --- a/net/6lowpan/iphc.c +++ b/net/6lowpan/iphc.c @@ -761,22 +761,75 @@ static const u8 lowpan_iphc_dam_to_sam_value[] = { [LOWPAN_IPHC_DAM_11] = LOWPAN_IPHC_SAM_11, }; -static u8 lowpan_compress_ctx_addr(u8 **hc_ptr, const struct in6_addr *ipaddr, +static inline bool +lowpan_iphc_compress_ctx_802154_lladdr(const struct in6_addr *ipaddr, + const struct lowpan_iphc_ctx *ctx, + const void *lladdr) +{ + const struct ieee802154_addr *addr = lladdr; + unsigned char extended_addr[EUI64_ADDR_LEN]; + bool lladdr_compress = false; + struct in6_addr tmp = {}; + + switch (addr->mode) { + case IEEE802154_ADDR_LONG: + ieee802154_le64_to_be64(&extended_addr, &addr->extended_addr); + /* check for SAM/DAM = 11 */ + memcpy(&tmp.s6_addr[8], &extended_addr, EUI64_ADDR_LEN); + /* second bit-flip (Universe/Local) is done according RFC2464 */ + tmp.s6_addr[8] ^= 0x02; + /* context information are always used */ + ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen); + if (ipv6_addr_equal(&tmp, ipaddr)) + lladdr_compress = true; + break; + case IEEE802154_ADDR_SHORT: + tmp.s6_addr[11] = 0xFF; + tmp.s6_addr[12] = 0xFE; + ieee802154_le16_to_be16(&tmp.s6_addr16[7], + &addr->short_addr); + /* context information are always used */ + ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen); + if (ipv6_addr_equal(&tmp, ipaddr)) + lladdr_compress = true; + break; + default: + /* should never handled and filtered by 802154 6lowpan */ + WARN_ON_ONCE(1); + break; + } + + return lladdr_compress; +} + +static u8 lowpan_compress_ctx_addr(u8 **hc_ptr, const struct net_device *dev, + const struct in6_addr *ipaddr, const struct lowpan_iphc_ctx *ctx, const unsigned char *lladdr, bool sam) { struct in6_addr tmp = {}; u8 dam; - /* check for SAM/DAM = 11 */ - memcpy(&tmp.s6_addr[8], lladdr, 8); - /* second bit-flip (Universe/Local) is done according RFC2464 */ - tmp.s6_addr[8] ^= 0x02; - /* context information are always used */ - ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen); - if (ipv6_addr_equal(&tmp, ipaddr)) { - dam = LOWPAN_IPHC_DAM_11; - goto out; + switch (lowpan_dev(dev)->lltype) { + case LOWPAN_LLTYPE_IEEE802154: + if (lowpan_iphc_compress_ctx_802154_lladdr(ipaddr, ctx, + lladdr)) { + dam = LOWPAN_IPHC_DAM_11; + goto out; + } + break; + default: + /* check for SAM/DAM = 11 */ + memcpy(&tmp.s6_addr[8], lladdr, EUI64_ADDR_LEN); + /* second bit-flip (Universe/Local) is done according RFC2464 */ + tmp.s6_addr[8] ^= 0x02; + /* context information are always used */ + ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen); + if (ipv6_addr_equal(&tmp, ipaddr)) { + dam = LOWPAN_IPHC_DAM_11; + goto out; + } + break; } memset(&tmp, 0, sizeof(tmp)); @@ -813,28 +866,85 @@ out: return dam; } -static u8 lowpan_compress_addr_64(u8 **hc_ptr, const struct in6_addr *ipaddr, +static inline bool +lowpan_iphc_compress_802154_lladdr(const struct in6_addr *ipaddr, + const void *lladdr) +{ + const struct ieee802154_addr *addr = lladdr; + unsigned char extended_addr[EUI64_ADDR_LEN]; + bool lladdr_compress = false; + struct in6_addr tmp = {}; + + switch (addr->mode) { + case IEEE802154_ADDR_LONG: + ieee802154_le64_to_be64(&extended_addr, &addr->extended_addr); + if (is_addr_mac_addr_based(ipaddr, extended_addr)) + lladdr_compress = true; + break; + case IEEE802154_ADDR_SHORT: + /* fe:80::ff:fe00:XXXX + * \__/ + * short_addr + * + * Universe/Local bit is zero. + */ + tmp.s6_addr[0] = 0xFE; + tmp.s6_addr[1] = 0x80; + tmp.s6_addr[11] = 0xFF; + tmp.s6_addr[12] = 0xFE; + ieee802154_le16_to_be16(&tmp.s6_addr16[7], + &addr->short_addr); + if (ipv6_addr_equal(&tmp, ipaddr)) + lladdr_compress = true; + break; + default: + /* should never handled and filtered by 802154 6lowpan */ + WARN_ON_ONCE(1); + break; + } + + return lladdr_compress; +} + +static u8 lowpan_compress_addr_64(u8 **hc_ptr, const struct net_device *dev, + const struct in6_addr *ipaddr, const unsigned char *lladdr, bool sam) { - u8 dam = LOWPAN_IPHC_DAM_00; + u8 dam = LOWPAN_IPHC_DAM_01; - if (is_addr_mac_addr_based(ipaddr, lladdr)) { - dam = LOWPAN_IPHC_DAM_11; /* 0-bits */ - pr_debug("address compression 0 bits\n"); - } else if (lowpan_is_iid_16_bit_compressable(ipaddr)) { + switch (lowpan_dev(dev)->lltype) { + case LOWPAN_LLTYPE_IEEE802154: + if (lowpan_iphc_compress_802154_lladdr(ipaddr, lladdr)) { + dam = LOWPAN_IPHC_DAM_11; /* 0-bits */ + pr_debug("address compression 0 bits\n"); + goto out; + } + break; + default: + if (is_addr_mac_addr_based(ipaddr, lladdr)) { + dam = LOWPAN_IPHC_DAM_11; /* 0-bits */ + pr_debug("address compression 0 bits\n"); + goto out; + } + break; + } + + if (lowpan_is_iid_16_bit_compressable(ipaddr)) { /* compress IID to 16 bits xxxx::XXXX */ lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr16[7], 2); dam = LOWPAN_IPHC_DAM_10; /* 16-bits */ raw_dump_inline(NULL, "Compressed ipv6 addr is (16 bits)", *hc_ptr - 2, 2); - } else { - /* do not compress IID => xxxx::IID */ - lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr16[4], 8); - dam = LOWPAN_IPHC_DAM_01; /* 64-bits */ - raw_dump_inline(NULL, "Compressed ipv6 addr is (64 bits)", - *hc_ptr - 8, 8); + goto out; } + /* do not compress IID => xxxx::IID */ + lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr16[4], 8); + raw_dump_inline(NULL, "Compressed ipv6 addr is (64 bits)", + *hc_ptr - 8, 8); + +out: + if (sam) return lowpan_iphc_dam_to_sam_value[dam]; else @@ -1013,9 +1123,6 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev, iphc0 = LOWPAN_DISPATCH_IPHC; iphc1 = 0; - raw_dump_inline(__func__, "saddr", saddr, EUI64_ADDR_LEN); - raw_dump_inline(__func__, "daddr", daddr, EUI64_ADDR_LEN); - raw_dump_table(__func__, "sending raw skb network uncompressed packet", skb->data, skb->len); @@ -1088,14 +1195,15 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev, iphc1 |= LOWPAN_IPHC_SAC; } else { if (sci) { - iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, &hdr->saddr, + iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, dev, + &hdr->saddr, &sci_entry, saddr, true); iphc1 |= LOWPAN_IPHC_SAC; } else { if (ipv6_saddr_type & IPV6_ADDR_LINKLOCAL && lowpan_is_linklocal_zero_padded(hdr->saddr)) { - iphc1 |= lowpan_compress_addr_64(&hc_ptr, + iphc1 |= lowpan_compress_addr_64(&hc_ptr, dev, &hdr->saddr, saddr, true); pr_debug("source address unicast link-local %pI6c iphc1 0x%02x\n", @@ -1123,14 +1231,15 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev, } } else { if (dci) { - iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, &hdr->daddr, + iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, dev, + &hdr->daddr, &dci_entry, daddr, false); iphc1 |= LOWPAN_IPHC_DAC; } else { if (ipv6_daddr_type & IPV6_ADDR_LINKLOCAL && lowpan_is_linklocal_zero_padded(hdr->daddr)) { - iphc1 |= lowpan_compress_addr_64(&hc_ptr, + iphc1 |= lowpan_compress_addr_64(&hc_ptr, dev, &hdr->daddr, daddr, false); pr_debug("dest address unicast link-local %pI6c iphc1 0x%02x\n", diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c index e459afd16bb3..dbb476d7d38f 100644 --- a/net/ieee802154/6lowpan/tx.c +++ b/net/ieee802154/6lowpan/tx.c @@ -9,6 +9,7 @@ */ #include +#include #include #include @@ -17,19 +18,9 @@ #define LOWPAN_FRAG1_HEAD_SIZE 0x4 #define LOWPAN_FRAGN_HEAD_SIZE 0x5 -/* don't save pan id, it's intra pan */ -struct lowpan_addr { - u8 mode; - union { - /* IPv6 needs big endian here */ - __be64 extended_addr; - __be16 short_addr; - } u; -}; - struct lowpan_addr_info { - struct lowpan_addr daddr; - struct lowpan_addr saddr; + struct ieee802154_addr daddr; + struct ieee802154_addr saddr; }; static inline struct @@ -48,12 +39,14 @@ lowpan_addr_info *lowpan_skb_priv(const struct sk_buff *skb) * RAW/DGRAM sockets. */ int lowpan_header_create(struct sk_buff *skb, struct net_device *ldev, - unsigned short type, const void *_daddr, - const void *_saddr, unsigned int len) + unsigned short type, const void *daddr, + const void *saddr, unsigned int len) { - const u8 *saddr = _saddr; - const u8 *daddr = _daddr; - struct lowpan_addr_info *info; + struct wpan_dev *wpan_dev = lowpan_802154_dev(ldev)->wdev->ieee802154_ptr; + struct lowpan_addr_info *info = lowpan_skb_priv(skb); + struct lowpan_802154_neigh *llneigh = NULL; + const struct ipv6hdr *hdr = ipv6_hdr(skb); + struct neighbour *n; /* TODO: * if this package isn't ipv6 one, where should it be routed? @@ -61,21 +54,50 @@ int lowpan_header_create(struct sk_buff *skb, struct net_device *ldev, if (type != ETH_P_IPV6) return 0; - if (!saddr) - saddr = ldev->dev_addr; + /* intra-pan communication */ + info->saddr.pan_id = wpan_dev->pan_id; + info->daddr.pan_id = info->saddr.pan_id; - raw_dump_inline(__func__, "saddr", (unsigned char *)saddr, 8); - raw_dump_inline(__func__, "daddr", (unsigned char *)daddr, 8); + if (!memcmp(daddr, ldev->broadcast, EUI64_ADDR_LEN)) { + info->daddr.short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST); + info->daddr.mode = IEEE802154_ADDR_SHORT; + } else { + __le16 short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); + + n = neigh_lookup(&nd_tbl, &hdr->daddr, ldev); + if (n) { + llneigh = lowpan_802154_neigh(neighbour_priv(n)); + read_lock_bh(&n->lock); + short_addr = llneigh->short_addr; + read_unlock_bh(&n->lock); + } - info = lowpan_skb_priv(skb); + if (llneigh && + lowpan_802154_is_valid_src_short_addr(short_addr)) { + info->daddr.short_addr = short_addr; + info->daddr.mode = IEEE802154_ADDR_SHORT; + } else { + info->daddr.mode = IEEE802154_ADDR_LONG; + ieee802154_be64_to_le64(&info->daddr.extended_addr, + daddr); + } - /* TODO: Currently we only support extended_addr */ - info->daddr.mode = IEEE802154_ADDR_LONG; - memcpy(&info->daddr.u.extended_addr, daddr, - sizeof(info->daddr.u.extended_addr)); - info->saddr.mode = IEEE802154_ADDR_LONG; - memcpy(&info->saddr.u.extended_addr, saddr, - sizeof(info->daddr.u.extended_addr)); + if (n) + neigh_release(n); + } + + if (!saddr) { + if (lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) { + info->saddr.mode = IEEE802154_ADDR_SHORT; + info->saddr.short_addr = wpan_dev->short_addr; + } else { + info->saddr.mode = IEEE802154_ADDR_LONG; + info->saddr.extended_addr = wpan_dev->extended_addr; + } + } else { + info->saddr.mode = IEEE802154_ADDR_LONG; + ieee802154_be64_to_le64(&info->saddr.extended_addr, saddr); + } return 0; } @@ -209,47 +231,26 @@ static int lowpan_header(struct sk_buff *skb, struct net_device *ldev, u16 *dgram_size, u16 *dgram_offset) { struct wpan_dev *wpan_dev = lowpan_802154_dev(ldev)->wdev->ieee802154_ptr; - struct ieee802154_addr sa, da; struct ieee802154_mac_cb *cb = mac_cb_init(skb); struct lowpan_addr_info info; - void *daddr, *saddr; memcpy(&info, lowpan_skb_priv(skb), sizeof(info)); - /* TODO: Currently we only support extended_addr */ - daddr = &info.daddr.u.extended_addr; - saddr = &info.saddr.u.extended_addr; - *dgram_size = skb->len; - lowpan_header_compress(skb, ldev, daddr, saddr); + lowpan_header_compress(skb, ldev, &info.daddr, &info.saddr); /* dgram_offset = (saved bytes after compression) + lowpan header len */ *dgram_offset = (*dgram_size - skb->len) + skb_network_header_len(skb); cb->type = IEEE802154_FC_TYPE_DATA; - /* prepare wpan address data */ - sa.mode = IEEE802154_ADDR_LONG; - sa.pan_id = wpan_dev->pan_id; - sa.extended_addr = ieee802154_devaddr_from_raw(saddr); - - /* intra-PAN communications */ - da.pan_id = sa.pan_id; - - /* if the destination address is the broadcast address, use the - * corresponding short address - */ - if (!memcmp(daddr, ldev->broadcast, EUI64_ADDR_LEN)) { - da.mode = IEEE802154_ADDR_SHORT; - da.short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST); + if (info.daddr.mode == IEEE802154_ADDR_SHORT && + ieee802154_is_broadcast_short_addr(info.daddr.short_addr)) cb->ackreq = false; - } else { - da.mode = IEEE802154_ADDR_LONG; - da.extended_addr = ieee802154_devaddr_from_raw(daddr); + else cb->ackreq = wpan_dev->ackreq; - } - return wpan_dev_hard_header(skb, lowpan_802154_dev(ldev)->wdev, &da, - &sa, 0); + return wpan_dev_hard_header(skb, lowpan_802154_dev(ldev)->wdev, + &info.daddr, &info.saddr, 0); } netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *ldev) -- cgit v1.2.3 From 22a59be8b7693eb2d0897a9638f5991f2f8e4ddd Mon Sep 17 00:00:00 2001 From: Philip Prindeville Date: Tue, 14 Jun 2016 15:53:02 -0600 Subject: net: ipv4: Add ability to have GRE ignore DF bit in IPv4 payloads In the presence of firewalls which improperly block ICMP Unreachable (including Fragmentation Required) messages, Path MTU Discovery is prevented from working. A workaround is to handle IPv4 payloads opaquely, ignoring the DF bit--as is done for other payloads like AppleTalk--and doing transparent fragmentation and reassembly. Redux includes the enforcement of mutual exclusion between this feature and Path MTU Discovery as suggested by Alexander Duyck. Cc: Alexander Duyck Reviewed-by: Stephen Hemminger Signed-off-by: Philip Prindeville Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 42 +++++++++++++++++++++++++++++++++--------- net/ipv4/ip_tunnel.c | 2 +- 2 files changed, 34 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 4d2025f7ec57..0f8ca3fca00a 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -841,17 +841,19 @@ out: return ipgre_tunnel_validate(tb, data); } -static void ipgre_netlink_parms(struct net_device *dev, +static int ipgre_netlink_parms(struct net_device *dev, struct nlattr *data[], struct nlattr *tb[], struct ip_tunnel_parm *parms) { + struct ip_tunnel *t = netdev_priv(dev); + memset(parms, 0, sizeof(*parms)); parms->iph.protocol = IPPROTO_GRE; if (!data) - return; + return 0; if (data[IFLA_GRE_LINK]) parms->link = nla_get_u32(data[IFLA_GRE_LINK]); @@ -880,16 +882,26 @@ static void ipgre_netlink_parms(struct net_device *dev, if (data[IFLA_GRE_TOS]) parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]); - if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) + if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) { + if (t->ignore_df) + return -EINVAL; parms->iph.frag_off = htons(IP_DF); + } if (data[IFLA_GRE_COLLECT_METADATA]) { - struct ip_tunnel *t = netdev_priv(dev); - t->collect_md = true; if (dev->type == ARPHRD_IPGRE) dev->type = ARPHRD_NONE; } + + if (data[IFLA_GRE_IGNORE_DF]) { + if (nla_get_u8(data[IFLA_GRE_IGNORE_DF]) + && (parms->iph.frag_off & htons(IP_DF))) + return -EINVAL; + t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]); + } + + return 0; } /* This function returns true when ENCAP attributes are present in the nl msg */ @@ -960,16 +972,19 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, { struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + int err; if (ipgre_netlink_encap_parms(data, &ipencap)) { struct ip_tunnel *t = netdev_priv(dev); - int err = ip_tunnel_encap_setup(t, &ipencap); + err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } - ipgre_netlink_parms(dev, data, tb, &p); + err = ipgre_netlink_parms(dev, data, tb, &p); + if (err < 0) + return err; return ip_tunnel_newlink(dev, tb, &p); } @@ -978,16 +993,19 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], { struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + int err; if (ipgre_netlink_encap_parms(data, &ipencap)) { struct ip_tunnel *t = netdev_priv(dev); - int err = ip_tunnel_encap_setup(t, &ipencap); + err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } - ipgre_netlink_parms(dev, data, tb, &p); + err = ipgre_netlink_parms(dev, data, tb, &p); + if (err < 0) + return err; return ip_tunnel_changelink(dev, tb, &p); } @@ -1024,6 +1042,8 @@ static size_t ipgre_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_GRE_COLLECT_METADATA */ nla_total_size(0) + + /* IFLA_GRE_IGNORE_DF */ + nla_total_size(1) + 0; } @@ -1057,6 +1077,9 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) t->encap.flags)) goto nla_put_failure; + if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df)) + goto nla_put_failure; + if (t->collect_md) { if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA)) goto nla_put_failure; @@ -1084,6 +1107,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, + [IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index d8f5e0a269f5..95649ebd2874 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -682,7 +682,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } df = tnl_params->frag_off; - if (skb->protocol == htons(ETH_P_IP)) + if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df) df |= (inner_iph->frag_off&htons(IP_DF)); max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) -- cgit v1.2.3 From 0b797c85894f9de091de2da16bc2ce80842898c0 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 14 Jun 2016 16:29:15 -0700 Subject: ila: Fix checksum neutral mapping The algorithm for checksum neutral mapping is incorrect. This problem was being hidden since we were previously always performing checksum offload on the translated addresses and only with IPv6 HW csum. Enabling an ILA router shows the issue. Corrected algorithm: old_loc is the original locator in the packet, new_loc is the value to overwrite with and is found in the lookup table. old_flag is the old flag value (zero of CSUM_NEUTRAL_FLAG) and new_flag is then (old_flag ^ CSUM_NEUTRAL_FLAG) & CSUM_NEUTRAL_FLAG. Need SUM(new_id + new_flag + diff) == SUM(old_id + old_flag) for checksum neutral translation. Solving for diff gives: diff = (old_id - new_id) + (old_flag - new_flag) compute_csum_diff8(new_id, old_id) gives old_id - new_id If old_flag is set old_flag - new_flag = old_flag = CSUM_NEUTRAL_FLAG Else old_flag - new_flag = -new_flag = ~CSUM_NEUTRAL_FLAG Tested: - Implemented a user space program that creates random addresses and random locators to overwrite. Compares the checksum over the address before and after translation (must always be equal) - Enabled ILA router and showed proper operation. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv6/ila/ila_common.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index b3d00be484d4..ec9efbcdad35 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -34,12 +34,12 @@ static void ila_csum_do_neutral(struct ila_addr *iaddr, if (p->locator_match.v64) { diff = p->csum_diff; } else { - diff = compute_csum_diff8((__be32 *)iaddr, - (__be32 *)&p->locator); + diff = compute_csum_diff8((__be32 *)&p->locator, + (__be32 *)iaddr); } fval = (__force __wsum)(ila_csum_neutral_set(iaddr->ident) ? - ~CSUM_NEUTRAL_FLAG : CSUM_NEUTRAL_FLAG); + CSUM_NEUTRAL_FLAG : ~CSUM_NEUTRAL_FLAG); diff = csum_add(diff, fval); @@ -140,8 +140,8 @@ void ila_init_saved_csum(struct ila_params *p) return; p->csum_diff = compute_csum_diff8( - (__be32 *)&p->locator_match, - (__be32 *)&p->locator); + (__be32 *)&p->locator, + (__be32 *)&p->locator_match); } static int __init ila_init(void) -- cgit v1.2.3 From 141ddefce7c807c5e34b67be50b4a789a51f4a56 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 16 Jun 2016 01:15:06 +0800 Subject: sctp: change sk state to CLOSED instead of CLOSING in sctp_sock_migrate Commit d46e416c11c8 ("sctp: sctp should change socket state when shutdown is received") may set sk_state CLOSING in sctp_sock_migrate, but inet_accept doesn't allow the sk_state other than ESTABLISHED/ CLOSED for sctp. So we will change sk_state to CLOSED, instead of CLOSING, as actually sk is closed already there. Fixes: d46e416c11c8 ("sctp: sctp should change socket state when shutdown is received") Reported-by: Ye Xiaolong Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 6cae4c61ae26..cdabbd8219b1 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -7568,7 +7568,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, * is called, set RCV_SHUTDOWN flag. */ if (sctp_state(assoc, CLOSED) && sctp_style(newsk, TCP)) { - newsk->sk_state = SCTP_SS_CLOSING; + newsk->sk_state = SCTP_SS_CLOSED; newsk->sk_shutdown |= RCV_SHUTDOWN; } else { newsk->sk_state = SCTP_SS_ESTABLISHED; -- cgit v1.2.3 From 0d227a8672c83f2153a0eeeb5439e3b7185c3d9c Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 16 Jun 2016 17:09:09 +0900 Subject: mpls: allow routes on ipgre devices This appears to be necessary and sufficient to provide MPLS in GRE (RFC4023) support. This can be used by establishing an ipgre tunnel device and then routing MPLS over it. The following example will forward MPLS frames received with an outermost MPLS label 100 over tun1, a GRE tunnel. The forwarded packet will have the outermost MPLS LSE removed and two new LSEs added with labels 200 (outermost) and 300 (next). ip link add name tun1 type gre remote 10.0.99.193 local 10.0.99.192 ttl 225 ip link set up dev tun1 ip addr add 10.0.98.192/24 dev tun1 ip route sh echo 1 > /proc/sys/net/mpls/conf/eth0/input echo 101 > /proc/sys/net/mpls/platform_labels ip -f mpls route add 100 as 200/300 via inet 10.0.98.193 ip -f mpls route sh Also remove unnecessary braces. Reviewed-by: Dinan Gunawardena Signed-off-by: Simon Horman Acked-by: Robert Shearman Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 7a4aa3450dd7..e9beaa58573c 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1009,9 +1009,10 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, unsigned int flags; if (event == NETDEV_REGISTER) { - /* For now just support ethernet devices */ - if ((dev->type == ARPHRD_ETHER) || - (dev->type == ARPHRD_LOOPBACK)) { + /* For now just support Ethernet and IPGRE devices */ + if (dev->type == ARPHRD_ETHER || + dev->type == ARPHRD_LOOPBACK || + dev->type == ARPHRD_IPGRE) { mdev = mpls_add_dev(dev); if (IS_ERR(mdev)) return notifier_from_errno(PTR_ERR(mdev)); -- cgit v1.2.3 From 84d15ae57d9478efc755306fee5ee562e0fa40e5 Mon Sep 17 00:00:00 2001 From: Wei Tang Date: Thu, 16 Jun 2016 21:17:49 +0800 Subject: net: do not initialise statics to 0 This patch fixes the checkpatch.pl error to dev.c: ERROR: do not initialise statics to 0 Signed-off-by: Wei Tang Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index b14835757141..441657f05e98 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2422,7 +2422,7 @@ EXPORT_SYMBOL(__skb_tx_hash); static void skb_warn_bad_offload(const struct sk_buff *skb) { - static const netdev_features_t null_features = 0; + static const netdev_features_t null_features; struct net_device *dev = skb->dev; const char *name = ""; -- cgit v1.2.3 From be4da0e340ed2a17b1a55cbe81d6bc251710ff72 Mon Sep 17 00:00:00 2001 From: Wei Tang Date: Thu, 16 Jun 2016 21:30:12 +0800 Subject: net: the space is required after ',' The space is missing after ',', and this will introduce much more noise when checking patch around. Signed-off-by: Wei Tang Signed-off-by: David S. Miller --- net/core/dev.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 441657f05e98..d40593b3b9fb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5911,7 +5911,7 @@ static void netdev_adjacent_add_links(struct net_device *dev) struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { - if (!net_eq(net,dev_net(iter->dev))) + if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.lower); @@ -5920,7 +5920,7 @@ static void netdev_adjacent_add_links(struct net_device *dev) } list_for_each_entry(iter, &dev->adj_list.lower, list) { - if (!net_eq(net,dev_net(iter->dev))) + if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.upper); @@ -5936,7 +5936,7 @@ static void netdev_adjacent_del_links(struct net_device *dev) struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { - if (!net_eq(net,dev_net(iter->dev))) + if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, dev->name, &iter->dev->adj_list.lower); @@ -5945,7 +5945,7 @@ static void netdev_adjacent_del_links(struct net_device *dev) } list_for_each_entry(iter, &dev->adj_list.lower, list) { - if (!net_eq(net,dev_net(iter->dev))) + if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, dev->name, &iter->dev->adj_list.upper); @@ -5961,7 +5961,7 @@ void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { - if (!net_eq(net,dev_net(iter->dev))) + if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, oldname, &iter->dev->adj_list.lower); @@ -5970,7 +5970,7 @@ void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) } list_for_each_entry(iter, &dev->adj_list.lower, list) { - if (!net_eq(net,dev_net(iter->dev))) + if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, oldname, &iter->dev->adj_list.upper); -- cgit v1.2.3 From 318d3cc04e8e42b3b138f7dea2297290636fad7d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 16 Jun 2016 15:59:25 +0200 Subject: net: xfrm: fix old-style declaration Modern C standards expect the '__inline__' keyword to come before the return type in a declaration, and we get a couple of warnings for this with "make W=1" in the xfrm{4,6}_policy.c files: net/ipv6/xfrm6_policy.c:369:1: error: 'inline' is not at beginning of declaration [-Werror=old-style-declaration] static int inline xfrm6_net_sysctl_init(struct net *net) net/ipv6/xfrm6_policy.c:374:1: error: 'inline' is not at beginning of declaration [-Werror=old-style-declaration] static void inline xfrm6_net_sysctl_exit(struct net *net) net/ipv4/xfrm4_policy.c:339:1: error: 'inline' is not at beginning of declaration [-Werror=old-style-declaration] static int inline xfrm4_net_sysctl_init(struct net *net) net/ipv4/xfrm4_policy.c:344:1: error: 'inline' is not at beginning of declaration [-Werror=old-style-declaration] static void inline xfrm4_net_sysctl_exit(struct net *net) Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- net/ipv4/xfrm4_policy.c | 8 ++++---- net/ipv6/xfrm6_policy.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 7b0edb37a115..b644a23c3db0 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -295,7 +295,7 @@ static struct ctl_table xfrm4_policy_table[] = { { } }; -static int __net_init xfrm4_net_sysctl_init(struct net *net) +static __net_init int xfrm4_net_sysctl_init(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; @@ -323,7 +323,7 @@ err_alloc: return -ENOMEM; } -static void __net_exit xfrm4_net_sysctl_exit(struct net *net) +static __net_exit void xfrm4_net_sysctl_exit(struct net *net) { struct ctl_table *table; @@ -336,12 +336,12 @@ static void __net_exit xfrm4_net_sysctl_exit(struct net *net) kfree(table); } #else /* CONFIG_SYSCTL */ -static int inline xfrm4_net_sysctl_init(struct net *net) +static inline int xfrm4_net_sysctl_init(struct net *net) { return 0; } -static void inline xfrm4_net_sysctl_exit(struct net *net) +static inline void xfrm4_net_sysctl_exit(struct net *net) { } #endif diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index c074771a10f7..6cc97003e4a9 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -366,12 +366,12 @@ static void __net_exit xfrm6_net_sysctl_exit(struct net *net) kfree(table); } #else /* CONFIG_SYSCTL */ -static int inline xfrm6_net_sysctl_init(struct net *net) +static inline int xfrm6_net_sysctl_init(struct net *net) { return 0; } -static void inline xfrm6_net_sysctl_exit(struct net *net) +static inline void xfrm6_net_sysctl_exit(struct net *net) { } #endif -- cgit v1.2.3 From 9f6ed032cd951d2427995578c51af1b50c054efa Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 16 Jun 2016 23:19:29 +0200 Subject: net, cls: also reject deleting all filters when TCA_KIND present When we check for RTM_DELTFILTER, we should also reject the request for deleting all filters under a given parent when TCA_KIND attribute is present. If present, it's currently just ignored but there's also no point to let it pass in the first place either since this doesn't have any meaning with wild-card removal. Fixes: ea7f8277f907 ("net, cls: allow for deleting all filters for given parent") Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/sched/cls_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index cca1ef5e5476..843a716a4303 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -169,7 +169,7 @@ replay: if (prio == 0) { switch (n->nlmsg_type) { case RTM_DELTFILTER: - if (protocol || t->tcm_handle) + if (protocol || t->tcm_handle || tca[TCA_KIND]) return -ENOENT; break; case RTM_NEWTFILTER: -- cgit v1.2.3 From a20fadf85312f7e999c7279af3e038e4f3539fbf Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 23 May 2016 15:13:00 +0200 Subject: can: build proc support only if CONFIG_PROC_FS is activated When building can subsystem with CONFIG_PROC_FS=n I detected some unused variables warning by using proc functions. In CAN the proc handling is nicely placed in one object file. This patch adds simple add a dependency on CONFIG_PROC_FS for CAN's proc.o file and corresponding static inline no-op functions. Signed-off-by: Alexander Aring Acked-by: Oliver Hartkopp [mkl: provide static inline noops instead of using #ifdefs] Signed-off-by: Marc Kleine-Budde --- net/can/Makefile | 3 ++- net/can/af_can.h | 11 +++++++++++ net/can/proc.c | 3 +-- 3 files changed, 14 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/can/Makefile b/net/can/Makefile index cef49eb1f5c7..10936754e3f2 100644 --- a/net/can/Makefile +++ b/net/can/Makefile @@ -3,7 +3,8 @@ # obj-$(CONFIG_CAN) += can.o -can-y := af_can.o proc.o +can-y := af_can.o +can-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_CAN_RAW) += can-raw.o can-raw-y := raw.o diff --git a/net/can/af_can.h b/net/can/af_can.h index fca0fe9fc45a..38a79ff20022 100644 --- a/net/can/af_can.h +++ b/net/can/af_can.h @@ -113,8 +113,19 @@ struct s_pstats { extern struct dev_rcv_lists can_rx_alldev_list; /* function prototypes for the CAN networklayer procfs (proc.c) */ +#ifdef CONFIG_PROC_FS void can_init_proc(void); void can_remove_proc(void); +#else +static inline void can_init_proc(void) +{ + pr_info("can: Can't create /proc/net/can. CONFIG_PROC_FS missing!\n"); +} + +static inline void can_remove_proc(void) +{ +} +#endif void can_stat_update(unsigned long data); /* structures and variables from af_can.c needed in proc.c for reading */ diff --git a/net/can/proc.c b/net/can/proc.c index 1a19b985a868..85ef7bb0f176 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -517,8 +517,7 @@ void can_init_proc(void) can_dir = proc_mkdir("can", init_net.proc_net); if (!can_dir) { - printk(KERN_INFO "can: failed to create /proc/net/can . " - "CONFIG_PROC_FS missing?\n"); + pr_info("can: failed to create /proc/net/can.\n"); return; } -- cgit v1.2.3 From 95acb490ec5145015b64cf4e99f604bb5fe79250 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Fri, 17 Jun 2016 15:35:24 +0200 Subject: can: bcm: fix indention and other minor style issues Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- net/can/bcm.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/can/bcm.c b/net/can/bcm.c index 6863310d6973..17fb7967f8ca 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -184,42 +184,40 @@ static int bcm_proc_show(struct seq_file *m, void *v) continue; seq_printf(m, "rx_op: %03X %-5s ", - op->can_id, bcm_proc_getifname(ifname, op->ifindex)); + op->can_id, bcm_proc_getifname(ifname, op->ifindex)); seq_printf(m, "[%u]%c ", op->nframes, - (op->flags & RX_CHECK_DLC)?'d':' '); + (op->flags & RX_CHECK_DLC) ? 'd' : ' '); if (op->kt_ival1.tv64) seq_printf(m, "timeo=%lld ", - (long long) - ktime_to_us(op->kt_ival1)); + (long long)ktime_to_us(op->kt_ival1)); if (op->kt_ival2.tv64) seq_printf(m, "thr=%lld ", - (long long) - ktime_to_us(op->kt_ival2)); + (long long)ktime_to_us(op->kt_ival2)); seq_printf(m, "# recv %ld (%ld) => reduction: ", - op->frames_filtered, op->frames_abs); + op->frames_filtered, op->frames_abs); reduction = 100 - (op->frames_filtered * 100) / op->frames_abs; seq_printf(m, "%s%ld%%\n", - (reduction == 100)?"near ":"", reduction); + (reduction == 100) ? "near " : "", reduction); } list_for_each_entry(op, &bo->tx_ops, list) { seq_printf(m, "tx_op: %03X %s [%u] ", - op->can_id, - bcm_proc_getifname(ifname, op->ifindex), - op->nframes); + op->can_id, + bcm_proc_getifname(ifname, op->ifindex), + op->nframes); if (op->kt_ival1.tv64) seq_printf(m, "t1=%lld ", - (long long) ktime_to_us(op->kt_ival1)); + (long long)ktime_to_us(op->kt_ival1)); if (op->kt_ival2.tv64) seq_printf(m, "t2=%lld ", - (long long) ktime_to_us(op->kt_ival2)); + (long long)ktime_to_us(op->kt_ival2)); seq_printf(m, "# sent %ld\n", op->frames_abs); } @@ -282,7 +280,7 @@ static void bcm_can_tx(struct bcm_op *op) /* reached last frame? */ if (op->currframe >= op->nframes) op->currframe = 0; - out: +out: dev_put(dev); } -- cgit v1.2.3 From 72c8a89ad2e4de18849674f30589baa5ebb4fbc1 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Fri, 17 Jun 2016 15:35:25 +0200 Subject: can: bcm: use CAN frame instead of can_frame in comments can_frame is the name of the struct can_frame which is not meant in the corrected comments. Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- net/can/bcm.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/can/bcm.c b/net/can/bcm.c index 17fb7967f8ca..83aa6cf61de9 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -84,7 +84,7 @@ MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Oliver Hartkopp "); MODULE_ALIAS("can-proto-2"); -/* easy access to can_frame payload */ +/* easy access to CAN frame payload */ static inline u64 GET_U64(const struct can_frame *cp) { return *(u64 *)cp->data; @@ -305,13 +305,13 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, memcpy(skb_put(skb, sizeof(*head)), head, sizeof(*head)); if (head->nframes) { - /* can_frames starting here */ + /* CAN frames starting here */ firstframe = (struct can_frame *)skb_tail_pointer(skb); memcpy(skb_put(skb, datalen), frames, datalen); /* - * the BCM uses the can_dlc-element of the can_frame + * the BCM uses the can_dlc-element of the CAN frame * structure for internal purposes. This is only * relevant for updates that are generated by the * BCM, where nframes is 1 @@ -492,7 +492,7 @@ static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index, return; } - /* do a real check in can_frame data section */ + /* do a real check in CAN frame data section */ if ((GET_U64(&op->frames[index]) & GET_U64(rxdata)) != (GET_U64(&op->frames[index]) & GET_U64(&op->last_frames[index]))) { @@ -501,7 +501,7 @@ static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index, } if (op->flags & RX_CHECK_DLC) { - /* do a real check in can_frame dlc */ + /* do a real check in CAN frame dlc */ if (rxdata->can_dlc != (op->last_frames[index].can_dlc & BCM_CAN_DLC_MASK)) { bcm_rx_update_and_send(op, &op->last_frames[index], @@ -554,7 +554,7 @@ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer) /* if user wants to be informed, when cyclic CAN-Messages come back */ if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) { - /* clear received can_frames to indicate 'nothing received' */ + /* clear received CAN frames to indicate 'nothing received' */ memset(op->last_frames, 0, op->nframes * CFSIZ); } @@ -840,7 +840,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, if (!ifindex) return -ENODEV; - /* check nframes boundaries - we need at least one can_frame */ + /* check nframes boundaries - we need at least one CAN frame */ if (msg_head->nframes < 1 || msg_head->nframes > MAX_NFRAMES) return -EINVAL; @@ -851,14 +851,14 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, /* update existing BCM operation */ /* - * Do we need more space for the can_frames than currently + * Do we need more space for the CAN frames than currently * allocated? -> This is a _really_ unusual use-case and * therefore (complexity / locking) it is not supported. */ if (msg_head->nframes > op->nframes) return -E2BIG; - /* update can_frames content */ + /* update CAN frames content */ for (i = 0; i < msg_head->nframes; i++) { err = memcpy_from_msg((u8 *)&op->frames[i], msg, CFSIZ); @@ -883,7 +883,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, op->can_id = msg_head->can_id; - /* create array for can_frames and copy the data */ + /* create array for CAN frames and copy the data */ if (msg_head->nframes > 1) { op->frames = kmalloc(msg_head->nframes * CFSIZ, GFP_KERNEL); @@ -966,7 +966,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, if (op->flags & STARTTIMER) { hrtimer_cancel(&op->timer); - /* spec: send can_frame when starting timer */ + /* spec: send CAN frame when starting timer */ op->flags |= TX_ANNOUNCE; } @@ -1015,7 +1015,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, /* update existing BCM operation */ /* - * Do we need more space for the can_frames than currently + * Do we need more space for the CAN frames than currently * allocated? -> This is a _really_ unusual use-case and * therefore (complexity / locking) it is not supported. */ @@ -1023,7 +1023,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, return -E2BIG; if (msg_head->nframes) { - /* update can_frames content */ + /* update CAN frames content */ err = memcpy_from_msg((u8 *)op->frames, msg, msg_head->nframes * CFSIZ); if (err < 0) @@ -1048,7 +1048,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, op->nframes = msg_head->nframes; if (msg_head->nframes > 1) { - /* create array for can_frames and copy the data */ + /* create array for CAN frames and copy the data */ op->frames = kmalloc(msg_head->nframes * CFSIZ, GFP_KERNEL); if (!op->frames) { @@ -1056,7 +1056,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, return -ENOMEM; } - /* create and init array for received can_frames */ + /* create and init array for received CAN frames */ op->last_frames = kzalloc(msg_head->nframes * CFSIZ, GFP_KERNEL); if (!op->last_frames) { @@ -1327,7 +1327,7 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) break; case TX_SEND: - /* we need exactly one can_frame behind the msg head */ + /* we need exactly one CAN frame behind the msg head */ if ((msg_head.nframes != 1) || (size != CFSIZ + MHSIZ)) ret = -EINVAL; else -- cgit v1.2.3 From 2b5f5f5dc114219dcd848fb0ff344acb413c11ef Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Fri, 17 Jun 2016 15:35:26 +0200 Subject: can: bcm: unify bcm_msg_head handling and prepare function parameters Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- net/can/bcm.c | 54 +++++++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/can/bcm.c b/net/can/bcm.c index 83aa6cf61de9..f3bf3875313a 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -693,13 +693,13 @@ rx_starttimer: /* * helpers for bcm_op handling: find & delete bcm [rx|tx] op elements */ -static struct bcm_op *bcm_find_op(struct list_head *ops, canid_t can_id, - int ifindex) +static struct bcm_op *bcm_find_op(struct list_head *ops, + struct bcm_msg_head *mh, int ifindex) { struct bcm_op *op; list_for_each_entry(op, ops, list) { - if ((op->can_id == can_id) && (op->ifindex == ifindex)) + if ((op->can_id == mh->can_id) && (op->ifindex == ifindex)) return op; } @@ -742,12 +742,13 @@ static void bcm_rx_unreg(struct net_device *dev, struct bcm_op *op) /* * bcm_delete_rx_op - find and remove a rx op (returns number of removed ops) */ -static int bcm_delete_rx_op(struct list_head *ops, canid_t can_id, int ifindex) +static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh, + int ifindex) { struct bcm_op *op, *n; list_for_each_entry_safe(op, n, ops, list) { - if ((op->can_id == can_id) && (op->ifindex == ifindex)) { + if ((op->can_id == mh->can_id) && (op->ifindex == ifindex)) { /* * Don't care if we're bound or not (due to netdev @@ -787,12 +788,13 @@ static int bcm_delete_rx_op(struct list_head *ops, canid_t can_id, int ifindex) /* * bcm_delete_tx_op - find and remove a tx op (returns number of removed ops) */ -static int bcm_delete_tx_op(struct list_head *ops, canid_t can_id, int ifindex) +static int bcm_delete_tx_op(struct list_head *ops, struct bcm_msg_head *mh, + int ifindex) { struct bcm_op *op, *n; list_for_each_entry_safe(op, n, ops, list) { - if ((op->can_id == can_id) && (op->ifindex == ifindex)) { + if ((op->can_id == mh->can_id) && (op->ifindex == ifindex)) { list_del(&op->list); bcm_remove_op(op); return 1; /* done */ @@ -808,7 +810,7 @@ static int bcm_delete_tx_op(struct list_head *ops, canid_t can_id, int ifindex) static int bcm_read_op(struct list_head *ops, struct bcm_msg_head *msg_head, int ifindex) { - struct bcm_op *op = bcm_find_op(ops, msg_head->can_id, ifindex); + struct bcm_op *op = bcm_find_op(ops, msg_head, ifindex); if (!op) return -EINVAL; @@ -845,8 +847,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, return -EINVAL; /* check the given can_id */ - op = bcm_find_op(&bo->tx_ops, msg_head->can_id, ifindex); - + op = bcm_find_op(&bo->tx_ops, msg_head, ifindex); if (op) { /* update existing BCM operation */ @@ -1010,7 +1011,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, return -EINVAL; /* check the given can_id */ - op = bcm_find_op(&bo->rx_ops, msg_head->can_id, ifindex); + op = bcm_find_op(&bo->rx_ops, msg_head, ifindex); if (op) { /* update existing BCM operation */ @@ -1192,7 +1193,8 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, /* * bcm_tx_send - send a single CAN frame to the CAN interface (for bcm_sendmsg) */ -static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk) +static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk, + int cfsiz) { struct sk_buff *skb; struct net_device *dev; @@ -1202,13 +1204,13 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk) if (!ifindex) return -ENODEV; - skb = alloc_skb(CFSIZ + sizeof(struct can_skb_priv), GFP_KERNEL); + skb = alloc_skb(cfsiz + sizeof(struct can_skb_priv), GFP_KERNEL); if (!skb) return -ENOMEM; can_skb_reserve(skb); - err = memcpy_from_msg(skb_put(skb, CFSIZ), msg, CFSIZ); + err = memcpy_from_msg(skb_put(skb, cfsiz), msg, cfsiz); if (err < 0) { kfree_skb(skb); return err; @@ -1230,7 +1232,7 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk) if (err) return err; - return CFSIZ + MHSIZ; + return cfsiz + MHSIZ; } /* @@ -1248,7 +1250,15 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) return -ENOTCONN; /* check for valid message length from userspace */ - if (size < MHSIZ || (size - MHSIZ) % CFSIZ) + if (size < MHSIZ) + return -EINVAL; + + /* read message head information */ + ret = memcpy_from_msg((u8 *)&msg_head, msg, MHSIZ); + if (ret < 0) + return ret; + + if ((size - MHSIZ) % CFSIZ) return -EINVAL; /* check for alternative ifindex for this bcm_op */ @@ -1282,12 +1292,6 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) } } - /* read message head information */ - - ret = memcpy_from_msg((u8 *)&msg_head, msg, MHSIZ); - if (ret < 0) - return ret; - lock_sock(sk); switch (msg_head.opcode) { @@ -1301,14 +1305,14 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) break; case TX_DELETE: - if (bcm_delete_tx_op(&bo->tx_ops, msg_head.can_id, ifindex)) + if (bcm_delete_tx_op(&bo->tx_ops, &msg_head, ifindex)) ret = MHSIZ; else ret = -EINVAL; break; case RX_DELETE: - if (bcm_delete_rx_op(&bo->rx_ops, msg_head.can_id, ifindex)) + if (bcm_delete_rx_op(&bo->rx_ops, &msg_head, ifindex)) ret = MHSIZ; else ret = -EINVAL; @@ -1331,7 +1335,7 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if ((msg_head.nframes != 1) || (size != CFSIZ + MHSIZ)) ret = -EINVAL; else - ret = bcm_tx_send(msg, ifindex, sk); + ret = bcm_tx_send(msg, ifindex, sk, CFSIZ); break; default: -- cgit v1.2.3 From 6f3b911d5f29b98752e5da86a295210c0c4f4e14 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Fri, 17 Jun 2016 15:35:27 +0200 Subject: can: bcm: add support for CAN FD frames The programming API of the CAN_BCM depends on struct can_frame which is given as array directly behind the bcm_msg_head structure. To follow this schema for the CAN FD frames a new flag 'CAN_FD_FRAME' in the bcm_msg_head flags indicates that the concatenated CAN frame structures behind the bcm_msg_head are defined as struct canfd_frame. This patch adds the support to handle CAN and CAN FD frames on a per BCM-op base. Main changes: - generally use struct canfd_frames instead if struct can_frames - use canfd_frame.flags instead of can_frame.can_dlc for private BCM flags - make all CAN frame sizes depending on the new CAN_FD_FRAME flags - separate between CAN and CAN FD when sending/receiving frames Due to the dependence of the CAN_FD_FRAME flag the former binary interface for classic CAN frames remains stable. Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- net/can/bcm.c | 223 +++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 135 insertions(+), 88 deletions(-) (limited to 'net') diff --git a/net/can/bcm.c b/net/can/bcm.c index f3bf3875313a..8e999ffdf28b 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1,7 +1,7 @@ /* * bcm.c - Broadcast Manager to filter/send (cyclic) CAN content * - * Copyright (c) 2002-2007 Volkswagen Group Electronic Research + * Copyright (c) 2002-2016 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -67,27 +67,31 @@ */ #define MAX_NFRAMES 256 -/* use of last_frames[index].can_dlc */ +/* use of last_frames[index].flags */ #define RX_RECV 0x40 /* received data for this element */ #define RX_THR 0x80 /* element not been sent due to throttle feature */ -#define BCM_CAN_DLC_MASK 0x0F /* clean private flags in can_dlc by masking */ +#define BCM_CAN_FLAGS_MASK 0x3F /* to clean private flags after usage */ /* get best masking value for can_rx_register() for a given single can_id */ #define REGMASK(id) ((id & CAN_EFF_FLAG) ? \ (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \ (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG)) -#define CAN_BCM_VERSION CAN_VERSION +#define CAN_BCM_VERSION "20160617" MODULE_DESCRIPTION("PF_CAN broadcast manager protocol"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Oliver Hartkopp "); MODULE_ALIAS("can-proto-2"); -/* easy access to CAN frame payload */ -static inline u64 GET_U64(const struct can_frame *cp) +/* + * easy access to the first 64 bit of can(fd)_frame payload. cp->data is + * 64 bit aligned so the offset has to be multiples of 8 which is ensured + * by the only callers in bcm_rx_cmp_to_index() bcm_rx_handler(). + */ +static inline u64 get_u64(const struct canfd_frame *cp, int offset) { - return *(u64 *)cp->data; + return *(u64 *)(cp->data + offset); } struct bcm_op { @@ -101,13 +105,14 @@ struct bcm_op { struct tasklet_struct tsklet, thrtsklet; ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg; int rx_ifindex; + int cfsiz; u32 count; u32 nframes; u32 currframe; - struct can_frame *frames; - struct can_frame *last_frames; - struct can_frame sframe; - struct can_frame last_sframe; + struct canfd_frame *frames; + struct canfd_frame *last_frames; + struct canfd_frame sframe; + struct canfd_frame last_sframe; struct sock *sk; struct net_device *rx_reg_dev; }; @@ -136,7 +141,7 @@ static inline ktime_t bcm_timeval_to_ktime(struct bcm_timeval tv) return ktime_set(tv.tv_sec, tv.tv_usec * NSEC_PER_USEC); } -#define CFSIZ sizeof(struct can_frame) +#define CFSIZ(flags) ((flags & CAN_FD_FRAME) ? CANFD_MTU : CAN_MTU) #define OPSIZ sizeof(struct bcm_op) #define MHSIZ sizeof(struct bcm_msg_head) @@ -183,10 +188,16 @@ static int bcm_proc_show(struct seq_file *m, void *v) if (!op->frames_abs) continue; - seq_printf(m, "rx_op: %03X %-5s ", - op->can_id, bcm_proc_getifname(ifname, op->ifindex)); - seq_printf(m, "[%u]%c ", op->nframes, - (op->flags & RX_CHECK_DLC) ? 'd' : ' '); + seq_printf(m, "rx_op: %03X %-5s ", op->can_id, + bcm_proc_getifname(ifname, op->ifindex)); + + if (op->flags & CAN_FD_FRAME) + seq_printf(m, "(%u)", op->nframes); + else + seq_printf(m, "[%u]", op->nframes); + + seq_printf(m, "%c ", (op->flags & RX_CHECK_DLC) ? 'd' : ' '); + if (op->kt_ival1.tv64) seq_printf(m, "timeo=%lld ", (long long)ktime_to_us(op->kt_ival1)); @@ -206,10 +217,13 @@ static int bcm_proc_show(struct seq_file *m, void *v) list_for_each_entry(op, &bo->tx_ops, list) { - seq_printf(m, "tx_op: %03X %s [%u] ", - op->can_id, - bcm_proc_getifname(ifname, op->ifindex), - op->nframes); + seq_printf(m, "tx_op: %03X %s ", op->can_id, + bcm_proc_getifname(ifname, op->ifindex)); + + if (op->flags & CAN_FD_FRAME) + seq_printf(m, "(%u) ", op->nframes); + else + seq_printf(m, "[%u] ", op->nframes); if (op->kt_ival1.tv64) seq_printf(m, "t1=%lld ", @@ -246,7 +260,7 @@ static void bcm_can_tx(struct bcm_op *op) { struct sk_buff *skb; struct net_device *dev; - struct can_frame *cf = &op->frames[op->currframe]; + struct canfd_frame *cf = op->frames + op->cfsiz * op->currframe; /* no target device? => exit */ if (!op->ifindex) @@ -258,7 +272,7 @@ static void bcm_can_tx(struct bcm_op *op) return; } - skb = alloc_skb(CFSIZ + sizeof(struct can_skb_priv), gfp_any()); + skb = alloc_skb(op->cfsiz + sizeof(struct can_skb_priv), gfp_any()); if (!skb) goto out; @@ -266,7 +280,7 @@ static void bcm_can_tx(struct bcm_op *op) can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; - memcpy(skb_put(skb, CFSIZ), cf, CFSIZ); + memcpy(skb_put(skb, op->cfsiz), cf, op->cfsiz); /* send with loopback */ skb->dev = dev; @@ -289,13 +303,13 @@ out: * (consisting of bcm_msg_head + x CAN frames) */ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, - struct can_frame *frames, int has_timestamp) + struct canfd_frame *frames, int has_timestamp) { struct sk_buff *skb; - struct can_frame *firstframe; + struct canfd_frame *firstframe; struct sockaddr_can *addr; struct sock *sk = op->sk; - unsigned int datalen = head->nframes * CFSIZ; + unsigned int datalen = head->nframes * op->cfsiz; int err; skb = alloc_skb(sizeof(*head) + datalen, gfp_any()); @@ -306,18 +320,18 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, if (head->nframes) { /* CAN frames starting here */ - firstframe = (struct can_frame *)skb_tail_pointer(skb); + firstframe = (struct canfd_frame *)skb_tail_pointer(skb); memcpy(skb_put(skb, datalen), frames, datalen); /* - * the BCM uses the can_dlc-element of the CAN frame + * the BCM uses the flags-element of the canfd_frame * structure for internal purposes. This is only * relevant for updates that are generated by the * BCM, where nframes is 1 */ if (head->nframes == 1) - firstframe->can_dlc &= BCM_CAN_DLC_MASK; + firstframe->flags &= BCM_CAN_FLAGS_MASK; } if (has_timestamp) { @@ -404,7 +418,7 @@ static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer) /* * bcm_rx_changed - create a RX_CHANGED notification due to changed content */ -static void bcm_rx_changed(struct bcm_op *op, struct can_frame *data) +static void bcm_rx_changed(struct bcm_op *op, struct canfd_frame *data) { struct bcm_msg_head head; @@ -416,7 +430,7 @@ static void bcm_rx_changed(struct bcm_op *op, struct can_frame *data) op->frames_filtered = op->frames_abs = 0; /* this element is not throttled anymore */ - data->can_dlc &= (BCM_CAN_DLC_MASK|RX_RECV); + data->flags &= (BCM_CAN_FLAGS_MASK|RX_RECV); head.opcode = RX_CHANGED; head.flags = op->flags; @@ -435,13 +449,13 @@ static void bcm_rx_changed(struct bcm_op *op, struct can_frame *data) * 2. send a notification to the user (if possible) */ static void bcm_rx_update_and_send(struct bcm_op *op, - struct can_frame *lastdata, - const struct can_frame *rxdata) + struct canfd_frame *lastdata, + const struct canfd_frame *rxdata) { - memcpy(lastdata, rxdata, CFSIZ); + memcpy(lastdata, rxdata, op->cfsiz); /* mark as used and throttled by default */ - lastdata->can_dlc |= (RX_RECV|RX_THR); + lastdata->flags |= (RX_RECV|RX_THR); /* throttling mode inactive ? */ if (!op->kt_ival2.tv64) { @@ -479,33 +493,36 @@ rx_changed_settime: * received data stored in op->last_frames[] */ static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index, - const struct can_frame *rxdata) + const struct canfd_frame *rxdata) { + struct canfd_frame *cf = op->frames + op->cfsiz * index; + struct canfd_frame *lcf = op->last_frames + op->cfsiz * index; + int i; + /* - * no one uses the MSBs of can_dlc for comparison, + * no one uses the MSBs of flags for comparison, * so we use it here to detect the first time of reception */ - if (!(op->last_frames[index].can_dlc & RX_RECV)) { + if (!(lcf->flags & RX_RECV)) { /* received data for the first time => send update to user */ - bcm_rx_update_and_send(op, &op->last_frames[index], rxdata); + bcm_rx_update_and_send(op, lcf, rxdata); return; } /* do a real check in CAN frame data section */ - - if ((GET_U64(&op->frames[index]) & GET_U64(rxdata)) != - (GET_U64(&op->frames[index]) & GET_U64(&op->last_frames[index]))) { - bcm_rx_update_and_send(op, &op->last_frames[index], rxdata); - return; + for (i = 0; i < rxdata->len; i += 8) { + if ((get_u64(cf, i) & get_u64(rxdata, i)) != + (get_u64(cf, i) & get_u64(lcf, i))) { + bcm_rx_update_and_send(op, lcf, rxdata); + return; + } } if (op->flags & RX_CHECK_DLC) { - /* do a real check in CAN frame dlc */ - if (rxdata->can_dlc != (op->last_frames[index].can_dlc & - BCM_CAN_DLC_MASK)) { - bcm_rx_update_and_send(op, &op->last_frames[index], - rxdata); + /* do a real check in CAN frame length */ + if (rxdata->len != lcf->len) { + bcm_rx_update_and_send(op, lcf, rxdata); return; } } @@ -555,7 +572,7 @@ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer) /* if user wants to be informed, when cyclic CAN-Messages come back */ if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) { /* clear received CAN frames to indicate 'nothing received' */ - memset(op->last_frames, 0, op->nframes * CFSIZ); + memset(op->last_frames, 0, op->nframes * op->cfsiz); } return HRTIMER_NORESTART; @@ -567,9 +584,11 @@ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer) static inline int bcm_rx_do_flush(struct bcm_op *op, int update, unsigned int index) { - if ((op->last_frames) && (op->last_frames[index].can_dlc & RX_THR)) { + struct canfd_frame *lcf = op->last_frames + op->cfsiz * index; + + if ((op->last_frames) && (lcf->flags & RX_THR)) { if (update) - bcm_rx_changed(op, &op->last_frames[index]); + bcm_rx_changed(op, lcf); return 1; } return 0; @@ -634,15 +653,19 @@ static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer) static void bcm_rx_handler(struct sk_buff *skb, void *data) { struct bcm_op *op = (struct bcm_op *)data; - const struct can_frame *rxframe = (struct can_frame *)skb->data; + const struct canfd_frame *rxframe = (struct canfd_frame *)skb->data; unsigned int i; - /* disable timeout */ - hrtimer_cancel(&op->timer); - if (op->can_id != rxframe->can_id) return; + /* make sure to handle the correct frame type (CAN / CAN FD) */ + if (skb->len != op->cfsiz) + return; + + /* disable timeout */ + hrtimer_cancel(&op->timer); + /* save rx timestamp */ op->rx_stamp = skb->tstamp; /* save originator for recvfrom() */ @@ -673,13 +696,14 @@ static void bcm_rx_handler(struct sk_buff *skb, void *data) * multiplex compare * * find the first multiplex mask that fits. - * Remark: The MUX-mask is stored in index 0 + * Remark: The MUX-mask is stored in index 0 - but only the + * first 64 bits of the frame data[] are relevant (CAN FD) */ for (i = 1; i < op->nframes; i++) { - if ((GET_U64(&op->frames[0]) & GET_U64(rxframe)) == - (GET_U64(&op->frames[0]) & - GET_U64(&op->frames[i]))) { + if ((get_u64(op->frames, 0) & get_u64(rxframe, 0)) == + (get_u64(op->frames, 0) & + get_u64(op->frames + op->cfsiz * i, 0))) { bcm_rx_cmp_to_index(op, i, rxframe); break; } @@ -699,7 +723,8 @@ static struct bcm_op *bcm_find_op(struct list_head *ops, struct bcm_op *op; list_for_each_entry(op, ops, list) { - if ((op->can_id == mh->can_id) && (op->ifindex == ifindex)) + if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) && + (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) return op; } @@ -748,7 +773,8 @@ static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh, struct bcm_op *op, *n; list_for_each_entry_safe(op, n, ops, list) { - if ((op->can_id == mh->can_id) && (op->ifindex == ifindex)) { + if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) && + (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) { /* * Don't care if we're bound or not (due to netdev @@ -794,7 +820,8 @@ static int bcm_delete_tx_op(struct list_head *ops, struct bcm_msg_head *mh, struct bcm_op *op, *n; list_for_each_entry_safe(op, n, ops, list) { - if ((op->can_id == mh->can_id) && (op->ifindex == ifindex)) { + if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) && + (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) { list_del(&op->list); bcm_remove_op(op); return 1; /* done */ @@ -835,6 +862,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, { struct bcm_sock *bo = bcm_sk(sk); struct bcm_op *op; + struct canfd_frame *cf; unsigned int i; int err; @@ -861,19 +889,27 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, /* update CAN frames content */ for (i = 0; i < msg_head->nframes; i++) { - err = memcpy_from_msg((u8 *)&op->frames[i], msg, CFSIZ); - if (op->frames[i].can_dlc > 8) - err = -EINVAL; + cf = op->frames + op->cfsiz * i; + err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz); + + if (op->flags & CAN_FD_FRAME) { + if (cf->len > 64) + err = -EINVAL; + } else { + if (cf->len > 8) + err = -EINVAL; + } if (err < 0) return err; if (msg_head->flags & TX_CP_CAN_ID) { /* copy can_id into frame */ - op->frames[i].can_id = msg_head->can_id; + cf->can_id = msg_head->can_id; } } + op->flags = msg_head->flags; } else { /* insert new BCM operation for the given can_id */ @@ -882,11 +918,13 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, if (!op) return -ENOMEM; - op->can_id = msg_head->can_id; + op->can_id = msg_head->can_id; + op->cfsiz = CFSIZ(msg_head->flags); + op->flags = msg_head->flags; /* create array for CAN frames and copy the data */ if (msg_head->nframes > 1) { - op->frames = kmalloc(msg_head->nframes * CFSIZ, + op->frames = kmalloc(msg_head->nframes * op->cfsiz, GFP_KERNEL); if (!op->frames) { kfree(op); @@ -896,10 +934,17 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, op->frames = &op->sframe; for (i = 0; i < msg_head->nframes; i++) { - err = memcpy_from_msg((u8 *)&op->frames[i], msg, CFSIZ); - if (op->frames[i].can_dlc > 8) - err = -EINVAL; + cf = op->frames + op->cfsiz * i; + err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz); + + if (op->flags & CAN_FD_FRAME) { + if (cf->len > 64) + err = -EINVAL; + } else { + if (cf->len > 8) + err = -EINVAL; + } if (err < 0) { if (op->frames != &op->sframe) @@ -910,7 +955,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, if (msg_head->flags & TX_CP_CAN_ID) { /* copy can_id into frame */ - op->frames[i].can_id = msg_head->can_id; + cf->can_id = msg_head->can_id; } } @@ -945,8 +990,6 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, /* check flags */ - op->flags = msg_head->flags; - if (op->flags & TX_RESET_MULTI_IDX) { /* start multiple frame transmission with index 0 */ op->currframe = 0; @@ -980,7 +1023,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, if (op->flags & STARTTIMER) bcm_tx_start_timer(op); - return msg_head->nframes * CFSIZ + MHSIZ; + return msg_head->nframes * op->cfsiz + MHSIZ; } /* @@ -1026,15 +1069,16 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, if (msg_head->nframes) { /* update CAN frames content */ err = memcpy_from_msg((u8 *)op->frames, msg, - msg_head->nframes * CFSIZ); + msg_head->nframes * op->cfsiz); if (err < 0) return err; /* clear last_frames to indicate 'nothing received' */ - memset(op->last_frames, 0, msg_head->nframes * CFSIZ); + memset(op->last_frames, 0, msg_head->nframes * op->cfsiz); } op->nframes = msg_head->nframes; + op->flags = msg_head->flags; /* Only an update -> do not call can_rx_register() */ do_rx_register = 0; @@ -1045,12 +1089,14 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, if (!op) return -ENOMEM; - op->can_id = msg_head->can_id; - op->nframes = msg_head->nframes; + op->can_id = msg_head->can_id; + op->nframes = msg_head->nframes; + op->cfsiz = CFSIZ(msg_head->flags); + op->flags = msg_head->flags; if (msg_head->nframes > 1) { /* create array for CAN frames and copy the data */ - op->frames = kmalloc(msg_head->nframes * CFSIZ, + op->frames = kmalloc(msg_head->nframes * op->cfsiz, GFP_KERNEL); if (!op->frames) { kfree(op); @@ -1058,7 +1104,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, } /* create and init array for received CAN frames */ - op->last_frames = kzalloc(msg_head->nframes * CFSIZ, + op->last_frames = kzalloc(msg_head->nframes * op->cfsiz, GFP_KERNEL); if (!op->last_frames) { kfree(op->frames); @@ -1073,7 +1119,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, if (msg_head->nframes) { err = memcpy_from_msg((u8 *)op->frames, msg, - msg_head->nframes * CFSIZ); + msg_head->nframes * op->cfsiz); if (err < 0) { if (op->frames != &op->sframe) kfree(op->frames); @@ -1115,7 +1161,6 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, } /* if ((op = bcm_find_op(&bo->rx_ops, msg_head->can_id, ifindex))) */ /* check flags */ - op->flags = msg_head->flags; if (op->flags & RX_RTR_FRAME) { @@ -1187,7 +1232,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, } } - return msg_head->nframes * CFSIZ + MHSIZ; + return msg_head->nframes * op->cfsiz + MHSIZ; } /* @@ -1244,6 +1289,7 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) struct bcm_sock *bo = bcm_sk(sk); int ifindex = bo->ifindex; /* default ifindex for this bcm_op */ struct bcm_msg_head msg_head; + int cfsiz; int ret; /* read bytes or error codes as return value */ if (!bo->bound) @@ -1258,7 +1304,8 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (ret < 0) return ret; - if ((size - MHSIZ) % CFSIZ) + cfsiz = CFSIZ(msg_head.flags); + if ((size - MHSIZ) % cfsiz) return -EINVAL; /* check for alternative ifindex for this bcm_op */ @@ -1332,10 +1379,10 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) case TX_SEND: /* we need exactly one CAN frame behind the msg head */ - if ((msg_head.nframes != 1) || (size != CFSIZ + MHSIZ)) + if ((msg_head.nframes != 1) || (size != cfsiz + MHSIZ)) ret = -EINVAL; else - ret = bcm_tx_send(msg, ifindex, sk, CFSIZ); + ret = bcm_tx_send(msg, ifindex, sk, cfsiz); break; default: -- cgit v1.2.3 From e7b3db5e60e8f471c3f5ef93b497bafe5863e56a Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 16 Jun 2016 12:20:52 -0700 Subject: net: Combine GENEVE and VXLAN port notifiers into single functions This patch merges the GENEVE and VXLAN code so that both functions pass through a shared code path. This way we can start the effort of using a single function on the network device drivers to handle both of these tunnel types. Signed-off-by: Alexander Duyck Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/udp_tunnel.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) (limited to 'net') diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 47f12c73d959..8174753e6494 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -76,6 +76,108 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, } EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); +static void __udp_tunnel_push_rx_port(struct net_device *dev, + struct udp_tunnel_info *ti) +{ + switch (ti->type) { + case UDP_TUNNEL_TYPE_VXLAN: + if (!dev->netdev_ops->ndo_add_vxlan_port) + break; + + dev->netdev_ops->ndo_add_vxlan_port(dev, + ti->sa_family, + ti->port); + break; + case UDP_TUNNEL_TYPE_GENEVE: + if (!dev->netdev_ops->ndo_add_geneve_port) + break; + + dev->netdev_ops->ndo_add_geneve_port(dev, + ti->sa_family, + ti->port); + break; + default: + break; + } +} + +void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, + unsigned short type) +{ + struct sock *sk = sock->sk; + struct udp_tunnel_info ti; + + ti.type = type; + ti.sa_family = sk->sk_family; + ti.port = inet_sk(sk)->inet_sport; + + __udp_tunnel_push_rx_port(dev, &ti); +} +EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port); + +/* Notify netdevs that UDP port started listening */ +void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct udp_tunnel_info ti; + struct net_device *dev; + + ti.type = type; + ti.sa_family = sk->sk_family; + ti.port = inet_sk(sk)->inet_sport; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) + __udp_tunnel_push_rx_port(dev, &ti); + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port); + +static void __udp_tunnel_pull_rx_port(struct net_device *dev, + struct udp_tunnel_info *ti) +{ + switch (ti->type) { + case UDP_TUNNEL_TYPE_VXLAN: + if (!dev->netdev_ops->ndo_del_vxlan_port) + break; + + dev->netdev_ops->ndo_del_vxlan_port(dev, + ti->sa_family, + ti->port); + break; + case UDP_TUNNEL_TYPE_GENEVE: + if (!dev->netdev_ops->ndo_del_geneve_port) + break; + + dev->netdev_ops->ndo_del_geneve_port(dev, + ti->sa_family, + ti->port); + break; + default: + break; + } +} + +/* Notify netdevs that UDP port is no more listening */ +void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct udp_tunnel_info ti; + struct net_device *dev; + + ti.type = type; + ti.sa_family = sk->sk_family; + ti.port = inet_sk(sk)->inet_sport; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) + __udp_tunnel_pull_rx_port(dev, &ti); + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port); + void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, -- cgit v1.2.3 From 7c46a640de6fcc4f35d0702710356a024eadf68f Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 16 Jun 2016 12:21:00 -0700 Subject: net: Merge VXLAN and GENEVE push notifiers into a single notifier This patch merges the notifiers for VXLAN and GENEVE into a single UDP tunnel notifier. The idea is that we will want to only have to make one notifier call to receive the list of ports for VXLAN and GENEVE tunnels that need to be offloaded. In addition we add a new set of ndo functions named ndo_udp_tunnel_add and ndo_udp_tunnel_del that are meant to allow us to track the tunnel meta-data such as port and address family as tunnels are added and removed. The tunnel meta-data is now transported in a structure named udp_tunnel_info which for now carries the type, address family, and port number. In the future this could be updated so that we can include a tuple of values including things such as the destination IP address and other fields. I also ended up going with a naming scheme that consisted of using the prefix udp_tunnel on function names. I applied this to the notifier and ndo ops as well so that it hopefully points to the fact that these are primarily used in the udp_tunnel functions. Signed-off-by: Alexander Duyck Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/udp_tunnel.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 8174753e6494..683e494d9000 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -79,6 +79,11 @@ EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); static void __udp_tunnel_push_rx_port(struct net_device *dev, struct udp_tunnel_info *ti) { + if (dev->netdev_ops->ndo_udp_tunnel_add) { + dev->netdev_ops->ndo_udp_tunnel_add(dev, ti); + return; + } + switch (ti->type) { case UDP_TUNNEL_TYPE_VXLAN: if (!dev->netdev_ops->ndo_add_vxlan_port) @@ -137,6 +142,11 @@ EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port); static void __udp_tunnel_pull_rx_port(struct net_device *dev, struct udp_tunnel_info *ti) { + if (dev->netdev_ops->ndo_udp_tunnel_del) { + dev->netdev_ops->ndo_udp_tunnel_del(dev, ti); + return; + } + switch (ti->type) { case UDP_TUNNEL_TYPE_VXLAN: if (!dev->netdev_ops->ndo_del_vxlan_port) -- cgit v1.2.3 From 1938ee1fd3de74d761a60806b048df652666afec Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 16 Jun 2016 12:23:12 -0700 Subject: net: Remove deprecated tunnel specific UDP offload functions Now that we have all the drivers using udp_tunnel_get_rx_ports, ndo_add_udp_enc_rx_port, and ndo_del_udp_enc_rx_port we can drop the function calls that were specific to VXLAN and GENEVE. Signed-off-by: Alexander Duyck Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/udp_tunnel.c | 79 +++++++++------------------------------------------ 1 file changed, 14 insertions(+), 65 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 683e494d9000..58bd39fb14b4 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -76,47 +76,20 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, } EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); -static void __udp_tunnel_push_rx_port(struct net_device *dev, - struct udp_tunnel_info *ti) -{ - if (dev->netdev_ops->ndo_udp_tunnel_add) { - dev->netdev_ops->ndo_udp_tunnel_add(dev, ti); - return; - } - - switch (ti->type) { - case UDP_TUNNEL_TYPE_VXLAN: - if (!dev->netdev_ops->ndo_add_vxlan_port) - break; - - dev->netdev_ops->ndo_add_vxlan_port(dev, - ti->sa_family, - ti->port); - break; - case UDP_TUNNEL_TYPE_GENEVE: - if (!dev->netdev_ops->ndo_add_geneve_port) - break; - - dev->netdev_ops->ndo_add_geneve_port(dev, - ti->sa_family, - ti->port); - break; - default: - break; - } -} - void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, unsigned short type) { struct sock *sk = sock->sk; struct udp_tunnel_info ti; + if (!dev->netdev_ops->ndo_udp_tunnel_add) + return; + ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; - __udp_tunnel_push_rx_port(dev, &ti); + dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti); } EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port); @@ -133,42 +106,15 @@ void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type) ti.port = inet_sk(sk)->inet_sport; rcu_read_lock(); - for_each_netdev_rcu(net, dev) - __udp_tunnel_push_rx_port(dev, &ti); + for_each_netdev_rcu(net, dev) { + if (!dev->netdev_ops->ndo_udp_tunnel_add) + continue; + dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti); + } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port); -static void __udp_tunnel_pull_rx_port(struct net_device *dev, - struct udp_tunnel_info *ti) -{ - if (dev->netdev_ops->ndo_udp_tunnel_del) { - dev->netdev_ops->ndo_udp_tunnel_del(dev, ti); - return; - } - - switch (ti->type) { - case UDP_TUNNEL_TYPE_VXLAN: - if (!dev->netdev_ops->ndo_del_vxlan_port) - break; - - dev->netdev_ops->ndo_del_vxlan_port(dev, - ti->sa_family, - ti->port); - break; - case UDP_TUNNEL_TYPE_GENEVE: - if (!dev->netdev_ops->ndo_del_geneve_port) - break; - - dev->netdev_ops->ndo_del_geneve_port(dev, - ti->sa_family, - ti->port); - break; - default: - break; - } -} - /* Notify netdevs that UDP port is no more listening */ void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type) { @@ -182,8 +128,11 @@ void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type) ti.port = inet_sk(sk)->inet_sport; rcu_read_lock(); - for_each_netdev_rcu(net, dev) - __udp_tunnel_pull_rx_port(dev, &ti); + for_each_netdev_rcu(net, dev) { + if (!dev->netdev_ops->ndo_udp_tunnel_del) + continue; + dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti); + } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port); -- cgit v1.2.3 From a2e2ff560f5113e8ca31432fbd985f5f1889efdc Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 16 Jun 2016 16:24:24 -0700 Subject: net: ipv6: Move ip6_route_get_saddr to inline VRF driver needs access to ip6_route_get_saddr code. Since it does little beyond ipv6_dev_get_saddr and ipv6_dev_get_saddr is already exported for modules move ip6_route_get_saddr to the header as an inline. Code move only; no functional change. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 9e1516785dac..08b77f421268 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2586,23 +2586,6 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, return rt; } -int ip6_route_get_saddr(struct net *net, - struct rt6_info *rt, - const struct in6_addr *daddr, - unsigned int prefs, - struct in6_addr *saddr) -{ - struct inet6_dev *idev = - rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL; - int err = 0; - if (rt && rt->rt6i_prefsrc.plen) - *saddr = rt->rt6i_prefsrc.addr; - else - err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, - daddr, prefs, saddr); - return err; -} - /* remove deleted ip from prefsrc entries */ struct arg_dev_net_ip { struct net_device *dev; -- cgit v1.2.3 From 0d240e7811c4ec1965760ee4643b5bbc9cfacbb3 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 16 Jun 2016 16:24:25 -0700 Subject: net: vrf: Implement get_saddr for IPv6 IPv6 source address selection needs to consider the real egress route. Similar to IPv4 implement a get_saddr6 method which is called if source address has not been set. The get_saddr6 method does a full lookup which means pulling a route from the VRF FIB table and properly considering linklocal/multicast destination addresses. Lookup failures (eg., unreachable) then cause the source address selection to fail which gets propagated back to the caller. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 12 ++++++++++-- net/l3mdev/l3mdev.c | 24 ++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index fd3217579b65..1dfc402d9ad1 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -910,6 +910,13 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, int err; int flags = 0; + if (ipv6_addr_any(&fl6->saddr) && fl6->flowi6_oif && + (!*dst || !(*dst)->error)) { + err = l3mdev_get_saddr6(net, sk, fl6); + if (err) + goto out_err; + } + /* The correct way to handle this would be to do * ip6_route_get_saddr, and then ip6_route_output; however, * the route-specific preferred source forces the @@ -999,10 +1006,11 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, return 0; out_err_release: - if (err == -ENETUNREACH) - IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES); dst_release(*dst); *dst = NULL; +out_err: + if (err == -ENETUNREACH) + IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES); return err; } diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index d90e4ef09e85..c4a1c3e84e12 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -162,6 +162,30 @@ int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4) } EXPORT_SYMBOL_GPL(l3mdev_get_saddr); +int l3mdev_get_saddr6(struct net *net, const struct sock *sk, + struct flowi6 *fl6) +{ + struct net_device *dev; + int rc = 0; + + if (fl6->flowi6_oif) { + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); + if (dev && netif_is_l3_slave(dev)) + dev = netdev_master_upper_dev_get_rcu(dev); + + if (dev && netif_is_l3_master(dev) && + dev->l3mdev_ops->l3mdev_get_saddr6) + rc = dev->l3mdev_ops->l3mdev_get_saddr6(dev, sk, fl6); + + rcu_read_unlock(); + } + + return rc; +} +EXPORT_SYMBOL_GPL(l3mdev_get_saddr6); + /** * l3mdev_fib_rule_match - Determine if flowi references an * L3 master device -- cgit v1.2.3 From afbac6010aec514998214fb19a1f37732b7a1d77 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 16 Jun 2016 16:24:26 -0700 Subject: net: ipv6: Address selection needs to consider L3 domains IPv6 version of 3f2fb9a834cb ("net: l3mdev: address selection should only consider devices in L3 domain") and the follow up commit, a17b693cdd876 ("net: l3mdev: prefer VRF master for source address selection"). That is, if outbound device is given then the address preference order is an address from that device, an address from the master device if it is enslaved, and then an address from a device in the same L3 domain. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6c8fc3f96b11..a1f6b7b31531 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1524,6 +1524,28 @@ out: return hiscore_idx; } +static int ipv6_get_saddr_master(struct net *net, + const struct net_device *dst_dev, + const struct net_device *master, + struct ipv6_saddr_dst *dst, + struct ipv6_saddr_score *scores, + int hiscore_idx) +{ + struct inet6_dev *idev; + + idev = __in6_dev_get(dst_dev); + if (idev) + hiscore_idx = __ipv6_dev_get_saddr(net, dst, idev, + scores, hiscore_idx); + + idev = __in6_dev_get(master); + if (idev) + hiscore_idx = __ipv6_dev_get_saddr(net, dst, idev, + scores, hiscore_idx); + + return hiscore_idx; +} + int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, const struct in6_addr *daddr, unsigned int prefs, struct in6_addr *saddr) @@ -1577,13 +1599,39 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, if (idev) hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx); } else { + const struct net_device *master; + int master_idx = 0; + + /* if dst_dev exists and is enslaved to an L3 device, then + * prefer addresses from dst_dev and then the master over + * any other enslaved devices in the L3 domain. + */ + master = l3mdev_master_dev_rcu(dst_dev); + if (master) { + master_idx = master->ifindex; + + hiscore_idx = ipv6_get_saddr_master(net, dst_dev, + master, &dst, + scores, hiscore_idx); + + if (scores[hiscore_idx].ifa) + goto out; + } + for_each_netdev_rcu(net, dev) { + /* only consider addresses on devices in the + * same L3 domain + */ + if (l3mdev_master_ifindex_rcu(dev) != master_idx) + continue; idev = __in6_dev_get(dev); if (!idev) continue; hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx); } } + +out: rcu_read_unlock(); hiscore = &scores[hiscore_idx]; -- cgit v1.2.3 From 0350cb48fb94e168d8b4d3ff65adbdbc73759cbf Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 17 Jun 2016 12:22:26 +0300 Subject: tipc: potential shift wrapping bug in map_set() "up_map" is a u64 type but we're not using the high 32 bits. Fixes: 35c55c9877f8 ('tipc: add neighbor monitoring framework') Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/tipc/monitor.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index 87d4efedd09f..0d489e81fcca 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -122,8 +122,8 @@ static int dom_size(int peers) static void map_set(u64 *up_map, int i, unsigned int v) { - *up_map &= ~(1 << i); - *up_map |= (v << i); + *up_map &= ~(1ULL << i); + *up_map |= ((u64)v << i); } static int map_get(u64 up_map, int i) -- cgit v1.2.3 From af73e72dccaf33d4ae2e9ccf48eeb527c5f24e1a Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 17 Jun 2016 18:12:46 +0000 Subject: RDS: TCP: Fix non static symbol warnings Fixes the following sparse warnings: net/rds/tcp.c:59:5: warning: symbol 'rds_tcp_min_sndbuf' was not declared. Should it be static? net/rds/tcp.c:60:5: warning: symbol 'rds_tcp_min_rcvbuf' was not declared. Should it be static? Signed-off-by: Wei Yongjun Acked-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 0e757a0d7421..5217d49ce6d6 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -57,8 +57,8 @@ static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *fpos); -int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF; -int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF; +static int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF; +static int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF; static struct ctl_table rds_tcp_sysctl_table[] = { #define RDS_TCP_SNDBUF 0 -- cgit v1.2.3 From b1cadc1a0949c82ff7fcb15603e3caf2d32ff9f6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 18 Jun 2016 21:52:02 -0700 Subject: ipv6: icmp: add a force_saddr param to icmp6_send() SIT or GRE tunnels might want to translate an IPV4 address into a v4mapped one when translating ICMP to ICMPv6. This patch adds the parameter to icmp6_send() but does not change icmpv6_send() signature. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/icmp.c | 7 +++++-- net/ipv6/ip6_icmp.c | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index e32a72fb9982..6c57e6e90301 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -388,7 +388,8 @@ relookup_failed: /* * Send an ICMP message in response to a packet in error */ -static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) +static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, + const struct in6_addr *force_saddr) { struct net *net = dev_net(skb->dev); struct inet6_dev *idev = NULL; @@ -475,6 +476,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.daddr = hdr->saddr; + if (force_saddr) + saddr = force_saddr; if (saddr) fl6.saddr = *saddr; fl6.flowi6_mark = mark; @@ -551,7 +554,7 @@ out: */ void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) { - icmp6_send(skb, ICMPV6_PARAMPROB, code, pos); + icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL); kfree_skb(skb); } diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c index 14dacc544c3e..713676f14a0e 100644 --- a/net/ipv6/ip6_icmp.c +++ b/net/ipv6/ip6_icmp.c @@ -39,7 +39,7 @@ void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) if (!send) goto out; - send(skb, type, code, info); + send(skb, type, code, info, NULL); out: rcu_read_unlock(); } -- cgit v1.2.3 From 5fbba8ac9358f1e796c8aedcccc3487364643723 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 18 Jun 2016 21:52:03 -0700 Subject: ip6: move ipip6_err_gen_icmpv6_unreach() We want to use this helper from GRE as well, so this is the time to move it in net/ipv6/icmp.c Also add a @nhs parameter, since SIT and GRE have different values for the header(s) to skip. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/icmp.c | 39 +++++++++++++++++++++++++++++++++++++++ net/ipv6/sit.c | 38 +------------------------------------- 2 files changed, 40 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 6c57e6e90301..07bc63c23712 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -558,6 +558,45 @@ void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) kfree_skb(skb); } +/* Generate icmpv6 with type/code ICMPV6_DEST_UNREACH/ICMPV6_ADDR_UNREACH + * if sufficient data bytes are available + * @nhs is the size of the tunnel header(s) : + * Either an IPv4 header for SIT encap + * an IPv4 header + GRE header for GRE encap + */ +int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs) +{ + struct rt6_info *rt; + struct sk_buff *skb2; + + if (!pskb_may_pull(skb, nhs + sizeof(struct ipv6hdr) + 8)) + return 1; + + skb2 = skb_clone(skb, GFP_ATOMIC); + + if (!skb2) + return 1; + + skb_dst_drop(skb2); + skb_pull(skb2, nhs); + skb_reset_network_header(skb2); + + rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, 0); + + if (rt && rt->dst.dev) + skb2->dev = rt->dst.dev; + + icmpv6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); + + if (rt) + ip6_rt_put(rt); + + kfree_skb(skb2); + + return 0; +} +EXPORT_SYMBOL(ip6_err_gen_icmpv6_unreach); + static void icmpv6_echo_reply(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index d9f2bd6ef72d..78e84d6793ee 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -479,42 +479,6 @@ static void ipip6_tunnel_uninit(struct net_device *dev) dev_put(dev); } -/* Generate icmpv6 with type/code ICMPV6_DEST_UNREACH/ICMPV6_ADDR_UNREACH - * if sufficient data bytes are available - */ -static int ipip6_err_gen_icmpv6_unreach(struct sk_buff *skb) -{ - int ihl = ((const struct iphdr *)skb->data)->ihl*4; - struct rt6_info *rt; - struct sk_buff *skb2; - - if (!pskb_may_pull(skb, ihl + sizeof(struct ipv6hdr) + 8)) - return 1; - - skb2 = skb_clone(skb, GFP_ATOMIC); - - if (!skb2) - return 1; - - skb_dst_drop(skb2); - skb_pull(skb2, ihl); - skb_reset_network_header(skb2); - - rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, 0); - - if (rt && rt->dst.dev) - skb2->dev = rt->dst.dev; - - icmpv6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); - - if (rt) - ip6_rt_put(rt); - - kfree_skb(skb2); - - return 0; -} - static int ipip6_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; @@ -575,7 +539,7 @@ static int ipip6_err(struct sk_buff *skb, u32 info) goto out; err = 0; - if (!ipip6_err_gen_icmpv6_unreach(skb)) + if (!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4)) goto out; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) -- cgit v1.2.3 From 2d7a3b276be2d032a6c1a48ced87a474327ee3d3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 18 Jun 2016 21:52:04 -0700 Subject: ipv6: translate ICMP_TIME_EXCEEDED to ICMPV6_TIME_EXCEED For better traceroute/mtr support for SIT and GRE tunnels, we translate IPV4 ICMP ICMP_TIME_EXCEEDED to ICMPV6_TIME_EXCEED We also have to translate the IPv4 source IP address of ICMP message to IPv6 v4mapped. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/icmp.c | 12 +++++++++--- net/ipv6/sit.c | 6 +++--- 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 07bc63c23712..867aebc34248 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -564,8 +564,9 @@ void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) * Either an IPv4 header for SIT encap * an IPv4 header + GRE header for GRE encap */ -int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs) +int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type) { + struct in6_addr temp_saddr; struct rt6_info *rt; struct sk_buff *skb2; @@ -586,8 +587,13 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs) if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; - icmpv6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); - + ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &temp_saddr); + if (type == ICMP_TIME_EXCEEDED) + icmp6_send(skb2, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, + 0, &temp_saddr); + else + icmp6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, + 0, &temp_saddr); if (rt) ip6_rt_put(rt); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 78e84d6793ee..d7a36114eb50 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -535,11 +535,11 @@ static int ipip6_err(struct sk_buff *skb, u32 info) goto out; } - if (t->parms.iph.daddr == 0) + err = 0; + if (!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4, type)) goto out; - err = 0; - if (!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4)) + if (t->parms.iph.daddr == 0) goto out; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) -- cgit v1.2.3 From 9b8c6d7bf2e08a7d3eb6660a2bfaf29b8b49c329 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 18 Jun 2016 21:52:05 -0700 Subject: gre: better support for ICMP messages for gre+ipv6 ipgre_err() can call ip6_err_gen_icmpv6_unreach() for proper support of ipv4+gre+icmp+ipv6+... frames, used for example by traceroute/mtr. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/gre_demux.c | 1 + net/ipv4/ip_gre.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'net') diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 4c39f4fd332a..c4c3e439f424 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -117,6 +117,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, if ((*(u8 *)options & 0xF0) != 0x40) hdr_len += 4; } + tpi->hdr_len = hdr_len; return hdr_len; } EXPORT_SYMBOL(gre_parse_header); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0f8ca3fca00a..ab4cff8e563d 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -187,6 +187,12 @@ static void ipgre_err(struct sk_buff *skb, u32 info, if (!t) return; +#if IS_ENABLED(CONFIG_IPV6) + if (tpi->proto == htons(ETH_P_IPV6) && + !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, type)) + return; +#endif + if (t->parms.iph.daddr == 0 || ipv4_is_multicast(t->parms.iph.daddr)) return; -- cgit v1.2.3 From 20e1954fe238dbe5f8d3a979e593fe352bd703cf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 18 Jun 2016 21:52:06 -0700 Subject: ipv6: RFC 4884 partial support for SIT/GRE tunnels When receiving an ICMPv4 message containing extensions as defined in RFC 4884, and translating it to ICMPv6 at SIT or GRE tunnel, we need some extra manipulation in order to properly forward the extensions. This patch only takes care of Time Exceeded messages as they are the ones that typically carry information from various routers in a fabric during a traceroute session. It also avoids complex skb logic if the data_len is not a multiple of 8. RFC states : The "original datagram" field MUST contain at least 128 octets. If the original datagram did not contain 128 octets, the "original datagram" field MUST be zero padded to 128 octets. In practice routers use 128 bytes of original datagram, not more. Initial translation was added in commit ca15a078bd90 ("sit: generate icmpv6 error when receiving icmpv4 error") Signed-off-by: Eric Dumazet Cc: Oussama Ghorbel Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 5 ++++- net/ipv6/icmp.c | 28 ++++++++++++++++++++++++---- net/ipv6/sit.c | 4 +++- 3 files changed, 31 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index ab4cff8e563d..8eec78f53f9e 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -144,6 +144,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info, const struct iphdr *iph; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; + unsigned int data_len = 0; struct ip_tunnel *t; switch (type) { @@ -169,6 +170,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info, case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) return; + data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ break; case ICMP_REDIRECT: @@ -189,7 +191,8 @@ static void ipgre_err(struct sk_buff *skb, u32 info, #if IS_ENABLED(CONFIG_IPV6) if (tpi->proto == htons(ETH_P_IPV6) && - !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, type)) + !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, + type, data_len)) return; #endif diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 867aebc34248..fd11f5856ce8 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -564,16 +564,22 @@ void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) * Either an IPv4 header for SIT encap * an IPv4 header + GRE header for GRE encap */ -int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type) +int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, + unsigned int data_len) { struct in6_addr temp_saddr; struct rt6_info *rt; struct sk_buff *skb2; + u32 info = 0; if (!pskb_may_pull(skb, nhs + sizeof(struct ipv6hdr) + 8)) return 1; - skb2 = skb_clone(skb, GFP_ATOMIC); + /* RFC 4884 (partial) support for ICMP extensions */ + if (data_len < 128 || (data_len & 7) || skb->len < data_len) + data_len = 0; + + skb2 = data_len ? skb_copy(skb, GFP_ATOMIC) : skb_clone(skb, GFP_ATOMIC); if (!skb2) return 1; @@ -588,12 +594,26 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type) skb2->dev = rt->dst.dev; ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &temp_saddr); + + if (data_len) { + /* RFC 4884 (partial) support : + * insert 0 padding at the end, before the extensions + */ + __skb_push(skb2, nhs); + skb_reset_network_header(skb2); + memmove(skb2->data, skb2->data + nhs, data_len - nhs); + memset(skb2->data + data_len - nhs, 0, nhs); + /* RFC 4884 4.5 : Length is measured in 64-bit words, + * and stored in reserved[0] + */ + info = (data_len/8) << 24; + } if (type == ICMP_TIME_EXCEEDED) icmp6_send(skb2, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, - 0, &temp_saddr); + info, &temp_saddr); else icmp6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, - 0, &temp_saddr); + info, &temp_saddr); if (rt) ip6_rt_put(rt); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index d7a36114eb50..cdd714690f95 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -484,6 +484,7 @@ static int ipip6_err(struct sk_buff *skb, u32 info) const struct iphdr *iph = (const struct iphdr *)skb->data; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; + unsigned int data_len = 0; struct ip_tunnel *t; int err; @@ -508,6 +509,7 @@ static int ipip6_err(struct sk_buff *skb, u32 info) case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) return 0; + data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ break; case ICMP_REDIRECT: break; @@ -536,7 +538,7 @@ static int ipip6_err(struct sk_buff *skb, u32 info) } err = 0; - if (!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4, type)) + if (!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4, type, data_len)) goto out; if (t->parms.iph.daddr == 0) -- cgit v1.2.3 From 0e4699e4a37be4cfa07f0340cef6a3fa6a46f5f8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 18 Jun 2016 11:44:03 +0300 Subject: rxrpc: checking for IS_ERR() instead of NULL rxrpc_lookup_peer_rcu() and rxrpc_lookup_peer() return NULL on error, never error pointers, so IS_ERR() can't be used. Fix three callers of those functions. Fixes: be6e6707f6ee ('rxrpc: Rework peer object handling to use hash table and RCU') Signed-off-by: Dan Carpenter Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 4 ++-- net/rxrpc/call_accept.c | 2 +- net/rxrpc/input.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index c83c3c75d665..9dd160bb16d2 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -247,8 +247,8 @@ struct rxrpc_transport *rxrpc_name_to_transport(struct rxrpc_sock *rx, /* find a remote transport endpoint from the local one */ peer = rxrpc_lookup_peer(rx->local, srx, gfp); - if (IS_ERR(peer)) - return ERR_CAST(peer); + if (!peer) + return ERR_PTR(-ENOMEM); /* find a transport */ trans = rxrpc_get_transport(rx->local, peer, gfp); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 50136c76ebd1..553b67c144e5 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -96,7 +96,7 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local, notification->mark = RXRPC_SKB_MARK_NEW_CALL; peer = rxrpc_lookup_peer(local, srx, GFP_NOIO); - if (IS_ERR(peer)) { + if (!peer) { _debug("no peer"); ret = -EBUSY; goto error; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 47fb167af3e4..e11e4d785127 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -639,7 +639,7 @@ static struct rxrpc_connection *rxrpc_conn_from_local(struct rxrpc_local *local, rxrpc_get_addr_from_skb(local, skb, &srx); rcu_read_lock(); peer = rxrpc_lookup_peer_rcu(local, &srx); - if (IS_ERR(peer)) + if (!peer) goto cant_find_peer; trans = rxrpc_find_transport(local, peer); -- cgit v1.2.3 From 2f9f9f5210887b1019fbd0327ffdf7c3aff271fd Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Jun 2016 11:55:22 +0200 Subject: rxrpc: fix uninitialized variable use Hashing the peer key was introduced for AF_INET, but gcc warns about the rxrpc_peer_hash_key function returning uninitialized data for any other value of srx->transport.family: net/rxrpc/peer_object.c: In function 'rxrpc_peer_hash_key': net/rxrpc/peer_object.c:57:15: error: 'p' may be used uninitialized in this function [-Werror=maybe-uninitialized] Assuming that nothing else can be set here, this changes the function to just return zero in case of an unknown address family. Fixes: be6e6707f6ee ("rxrpc: Rework peer object handling to use hash table and RCU") Signed-off-by: Arnd Bergmann Signed-off-by: David Howells --- net/rxrpc/peer_object.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index faf222c21698..6baad708f3b1 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -50,6 +50,9 @@ static unsigned long rxrpc_peer_hash_key(struct rxrpc_local *local, size = sizeof(srx->transport.sin.sin_addr); p = (u16 *)&srx->transport.sin.sin_addr; break; + default: + WARN(1, "AF_RXRPC: Unsupported transport address family\n"); + return 0; } /* Step through the peer address in 16-bit portions for speed */ -- cgit v1.2.3 From 19ffa01c9c45861ad6b181323e0d36904298e326 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 4 Apr 2016 14:00:36 +0100 Subject: rxrpc: Use structs to hold connection params and protocol info Define and use a structure to hold connection parameters. This makes it easier to pass multiple connection parameters around. Define and use a structure to hold protocol information used to hash a connection for lookup on incoming packet. Most of these fields will be disposed of eventually, including the duplicate local pointer. Whilst we're at it rename "proto" to "family" when referring to a protocol family. Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 55 +++++++++++++++----------- net/rxrpc/ar-internal.h | 61 ++++++++++++++++++++++------ net/rxrpc/call_event.c | 6 +-- net/rxrpc/call_object.c | 44 +++++++++++---------- net/rxrpc/conn_event.c | 8 ++-- net/rxrpc/conn_object.c | 103 ++++++++++++++++++++++++++---------------------- net/rxrpc/input.c | 4 +- net/rxrpc/key.c | 2 +- net/rxrpc/output.c | 24 +++++++---- net/rxrpc/proc.c | 12 +++--- net/rxrpc/recvmsg.c | 2 +- net/rxrpc/rxkad.c | 62 ++++++++++++++--------------- net/rxrpc/security.c | 6 +-- 13 files changed, 226 insertions(+), 163 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 9dd160bb16d2..48b45a0280c0 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -97,7 +97,7 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx, srx->transport_len > len) return -EINVAL; - if (srx->transport.family != rx->proto) + if (srx->transport.family != rx->family) return -EAFNOSUPPORT; switch (srx->transport.family) { @@ -227,32 +227,30 @@ static int rxrpc_listen(struct socket *sock, int backlog) /* * find a transport by address */ -struct rxrpc_transport *rxrpc_name_to_transport(struct rxrpc_sock *rx, - struct sockaddr *addr, - int addr_len, int flags, - gfp_t gfp) +struct rxrpc_transport * +rxrpc_name_to_transport(struct rxrpc_conn_parameters *cp, + struct sockaddr *addr, + int addr_len, + gfp_t gfp) { struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *) addr; struct rxrpc_transport *trans; - struct rxrpc_peer *peer; - _enter("%p,%p,%d,%d", rx, addr, addr_len, flags); - - ASSERT(rx->local != NULL); + _enter("%p,%d", addr, addr_len); - if (rx->srx.transport_type != srx->transport_type) + if (cp->local->srx.transport_type != srx->transport_type) return ERR_PTR(-ESOCKTNOSUPPORT); - if (rx->srx.transport.family != srx->transport.family) + if (cp->local->srx.transport.family != srx->transport.family) return ERR_PTR(-EAFNOSUPPORT); /* find a remote transport endpoint from the local one */ - peer = rxrpc_lookup_peer(rx->local, srx, gfp); - if (!peer) + cp->peer = rxrpc_lookup_peer(cp->local, srx, gfp); + if (!cp->peer) return ERR_PTR(-ENOMEM); /* find a transport */ - trans = rxrpc_get_transport(rx->local, peer, gfp); - rxrpc_put_peer(peer); + trans = rxrpc_get_transport(cp->local, cp->peer, gfp); + rxrpc_put_peer(cp->peer); _leave(" = %p", trans); return trans; } @@ -277,6 +275,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, unsigned long user_call_ID, gfp_t gfp) { + struct rxrpc_conn_parameters cp; struct rxrpc_conn_bundle *bundle; struct rxrpc_transport *trans; struct rxrpc_call *call; @@ -286,18 +285,26 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, lock_sock(&rx->sk); - trans = rxrpc_name_to_transport(rx, (struct sockaddr *)srx, - sizeof(*srx), 0, gfp); + if (!key) + key = rx->key; + if (key && !key->payload.data[0]) + key = NULL; /* a no-security key */ + + memset(&cp, 0, sizeof(cp)); + cp.local = rx->local; + cp.key = key; + cp.security_level = 0; + cp.exclusive = false; + cp.service_id = srx->srx_service; + + trans = rxrpc_name_to_transport(&cp, (struct sockaddr *)srx, + sizeof(*srx), gfp); if (IS_ERR(trans)) { call = ERR_CAST(trans); trans = NULL; goto out_notrans; } - - if (!key) - key = rx->key; - if (key && !key->payload.data[0]) - key = NULL; /* a no-security key */ + cp.peer = trans->peer; bundle = rxrpc_get_bundle(rx, trans, key, srx->srx_service, gfp); if (IS_ERR(bundle)) { @@ -305,7 +312,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, goto out; } - call = rxrpc_new_client_call(rx, trans, bundle, user_call_ID, gfp); + call = rxrpc_new_client_call(rx, &cp, trans, bundle, user_call_ID, gfp); rxrpc_put_bundle(trans, bundle); out: rxrpc_put_transport(trans); @@ -600,7 +607,7 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, sk->sk_destruct = rxrpc_sock_destructor; rx = rxrpc_sk(sk); - rx->proto = protocol; + rx->family = protocol; rx->calls = RB_ROOT; INIT_LIST_HEAD(&rx->listen_link); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index c168268467cd..efe6673deb28 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -72,7 +72,7 @@ struct rxrpc_sock { #define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT struct sockaddr_rxrpc srx; /* local address */ struct sockaddr_rxrpc connect_srx; /* Default client address from connect() */ - sa_family_t proto; /* protocol created with */ + sa_family_t family; /* protocol family created with */ }; #define rxrpc_sk(__sk) container_of((__sk), struct rxrpc_sock, sk) @@ -261,6 +261,34 @@ struct rxrpc_conn_bundle { u8 security_ix; /* security type */ }; +/* + * Keys for matching a connection. + */ +struct rxrpc_conn_proto { + unsigned long hash_key; + struct rxrpc_local *local; /* Representation of local endpoint */ + u32 epoch; /* epoch of this connection */ + u32 cid; /* connection ID */ + u8 in_clientflag; /* RXRPC_CLIENT_INITIATED if we are server */ + u8 addr_size; /* Size of the address */ + sa_family_t family; /* Transport protocol */ + __be16 port; /* Peer UDP/UDP6 port */ + union { /* Peer address */ + struct in_addr ipv4_addr; + struct in6_addr ipv6_addr; + u32 raw_addr[0]; + }; +}; + +struct rxrpc_conn_parameters { + struct rxrpc_local *local; /* Representation of local endpoint */ + struct rxrpc_peer *peer; /* Remote endpoint */ + struct key *key; /* Security details */ + bool exclusive; /* T if conn is exclusive */ + u16 service_id; /* Service ID for this connection */ + u32 security_level; /* Security level selected */ +}; + /* * RxRPC connection definition * - matched by { transport, service_id, conn_id, direction, key } @@ -269,6 +297,9 @@ struct rxrpc_conn_bundle { struct rxrpc_connection { struct rxrpc_transport *trans; /* transport session */ struct rxrpc_conn_bundle *bundle; /* connection bundle (client) */ + struct rxrpc_conn_proto proto; + struct rxrpc_conn_parameters params; + struct work_struct processor; /* connection event processor */ struct rb_node node; /* node in transport's lookup tree */ struct list_head link; /* link in master connection list */ @@ -277,7 +308,6 @@ struct rxrpc_connection { struct sk_buff_head rx_queue; /* received conn-level packets */ struct rxrpc_call *channels[RXRPC_MAXCALLS]; /* channels (active calls) */ const struct rxrpc_security *security; /* applied security module */ - struct key *key; /* security for this connection (client) */ struct key *server_key; /* security for this service */ struct crypto_skcipher *cipher; /* encryption handle */ struct rxrpc_crypt csum_iv; /* packet checksum base */ @@ -308,13 +338,8 @@ struct rxrpc_connection { u8 size_align; /* data size alignment (for security) */ u8 header_size; /* rxrpc + security header size */ u8 security_size; /* security header size */ - u32 security_level; /* security level negotiated */ u32 security_nonce; /* response re-use preventer */ - u32 epoch; /* epoch of this connection */ - u32 cid; /* connection ID */ - u16 service_id; /* service ID for this connection */ u8 security_ix; /* security type */ - u8 in_clientflag; /* RXRPC_CLIENT_INITIATED if we are server */ u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */ }; @@ -448,7 +473,7 @@ struct rxrpc_call { unsigned long hash_key; /* Full hash key */ u8 in_clientflag; /* Copy of conn->in_clientflag for hashing */ struct rxrpc_local *local; /* Local endpoint. Used for hashing. */ - sa_family_t proto; /* Frame protocol */ + sa_family_t family; /* Frame protocol */ u32 call_id; /* call ID on connection */ u32 cid; /* connection ID plus channel index */ u32 epoch; /* epoch of this connection */ @@ -481,9 +506,9 @@ extern u32 rxrpc_epoch; extern atomic_t rxrpc_debug_id; extern struct workqueue_struct *rxrpc_workqueue; -extern struct rxrpc_transport *rxrpc_name_to_transport(struct rxrpc_sock *, +extern struct rxrpc_transport *rxrpc_name_to_transport(struct rxrpc_conn_parameters *, struct sockaddr *, - int, int, gfp_t); + int, gfp_t); /* * call_accept.c @@ -512,6 +537,7 @@ struct rxrpc_call *rxrpc_find_call_hash(struct rxrpc_host_header *, void *, sa_family_t, const void *); struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long); struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, + struct rxrpc_conn_parameters *, struct rxrpc_transport *, struct rxrpc_conn_bundle *, unsigned long, gfp_t); @@ -541,8 +567,9 @@ struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *, struct rxrpc_transport *, struct key *, u16, gfp_t); void rxrpc_put_bundle(struct rxrpc_transport *, struct rxrpc_conn_bundle *); -int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_transport *, - struct rxrpc_conn_bundle *, struct rxrpc_call *, gfp_t); +int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, + struct rxrpc_transport *, struct rxrpc_conn_bundle *, + struct rxrpc_call *, gfp_t); void rxrpc_put_connection(struct rxrpc_connection *); void __exit rxrpc_destroy_all_connections(void); struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *, @@ -550,6 +577,16 @@ struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *, extern struct rxrpc_connection * rxrpc_incoming_connection(struct rxrpc_transport *, struct rxrpc_host_header *); +static inline bool rxrpc_conn_is_client(const struct rxrpc_connection *conn) +{ + return conn->out_clientflag; +} + +static inline bool rxrpc_conn_is_service(const struct rxrpc_connection *conn) +{ + return conn->proto.in_clientflag; +} + /* * input.c */ diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index e610b106c913..1571dfb95aa3 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -842,7 +842,7 @@ void rxrpc_process_call(struct work_struct *work) msg.msg_controllen = 0; msg.msg_flags = 0; - whdr.epoch = htonl(call->conn->epoch); + whdr.epoch = htonl(call->conn->proto.epoch); whdr.cid = htonl(call->cid); whdr.callNumber = htonl(call->call_id); whdr.seq = 0; @@ -1264,7 +1264,7 @@ maybe_reschedule: if (call->state >= RXRPC_CALL_COMPLETE && !list_empty(&call->accept_link)) { _debug("X unlinking once-pending call %p { e=%lx f=%lx c=%x }", - call, call->events, call->flags, call->conn->cid); + call, call->events, call->flags, call->conn->proto.cid); read_lock_bh(&call->state_lock); if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && @@ -1282,7 +1282,7 @@ error: * this means there's a race between clearing the flag and setting the * work pending bit and the work item being processed again */ if (call->events && !work_pending(&call->processor)) { - _debug("jumpstart %x", call->conn->cid); + _debug("jumpstart %x", call->conn->proto.cid); rxrpc_queue_call(call); } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 8b4d47b3ccac..b7c6011c71bb 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -71,7 +71,7 @@ static unsigned long rxrpc_call_hashfunc( u32 call_id, u32 epoch, u16 service_id, - sa_family_t proto, + sa_family_t family, void *localptr, unsigned int addr_size, const u8 *peer_addr) @@ -92,7 +92,7 @@ static unsigned long rxrpc_call_hashfunc( key += (cid & RXRPC_CIDMASK) >> RXRPC_CIDSHIFT; key += cid & RXRPC_CHANNELMASK; key += in_clientflag; - key += proto; + key += family; /* Step through the peer address in 16-bit portions for speed */ for (i = 0, p = (const u16 *)peer_addr; i < addr_size >> 1; i++, p++) key += *p; @@ -109,7 +109,7 @@ static void rxrpc_call_hash_add(struct rxrpc_call *call) unsigned int addr_size = 0; _enter(""); - switch (call->proto) { + switch (call->family) { case AF_INET: addr_size = sizeof(call->peer_ip.ipv4_addr); break; @@ -121,7 +121,7 @@ static void rxrpc_call_hash_add(struct rxrpc_call *call) } key = rxrpc_call_hashfunc(call->in_clientflag, call->cid, call->call_id, call->epoch, - call->service_id, call->proto, + call->service_id, call->family, call->conn->trans->local, addr_size, call->peer_ip.ipv6_addr); /* Store the full key in the call */ @@ -151,7 +151,7 @@ static void rxrpc_call_hash_del(struct rxrpc_call *call) struct rxrpc_call *rxrpc_find_call_hash( struct rxrpc_host_header *hdr, void *localptr, - sa_family_t proto, + sa_family_t family, const void *peer_addr) { unsigned long key; @@ -161,7 +161,7 @@ struct rxrpc_call *rxrpc_find_call_hash( u8 in_clientflag = hdr->flags & RXRPC_CLIENT_INITIATED; _enter(""); - switch (proto) { + switch (family) { case AF_INET: addr_size = sizeof(call->peer_ip.ipv4_addr); break; @@ -174,7 +174,7 @@ struct rxrpc_call *rxrpc_find_call_hash( key = rxrpc_call_hashfunc(in_clientflag, hdr->cid, hdr->callNumber, hdr->epoch, hdr->serviceId, - proto, localptr, addr_size, + family, localptr, addr_size, peer_addr); hash_for_each_possible_rcu(rxrpc_call_hash, call, hash_node, key) { if (call->hash_key == key && @@ -182,7 +182,7 @@ struct rxrpc_call *rxrpc_find_call_hash( call->cid == hdr->cid && call->in_clientflag == in_clientflag && call->service_id == hdr->serviceId && - call->proto == proto && + call->family == family && call->local == localptr && memcmp(call->peer_ip.ipv6_addr, peer_addr, addr_size) == 0 && @@ -286,6 +286,7 @@ static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) */ static struct rxrpc_call *rxrpc_alloc_client_call( struct rxrpc_sock *rx, + struct rxrpc_conn_parameters *cp, struct rxrpc_transport *trans, struct rxrpc_conn_bundle *bundle, gfp_t gfp) @@ -307,16 +308,16 @@ static struct rxrpc_call *rxrpc_alloc_client_call( call->socket = rx; call->rx_data_post = 1; - ret = rxrpc_connect_call(rx, trans, bundle, call, gfp); + ret = rxrpc_connect_call(rx, cp, trans, bundle, call, gfp); if (ret < 0) { kmem_cache_free(rxrpc_call_jar, call); return ERR_PTR(ret); } /* Record copies of information for hashtable lookup */ - call->proto = rx->proto; - call->local = trans->local; - switch (call->proto) { + call->family = rx->family; + call->local = call->conn->params.local; + switch (call->family) { case AF_INET: call->peer_ip.ipv4_addr = trans->peer->srx.transport.sin.sin_addr.s_addr; @@ -327,9 +328,9 @@ static struct rxrpc_call *rxrpc_alloc_client_call( sizeof(call->peer_ip.ipv6_addr)); break; } - call->epoch = call->conn->epoch; - call->service_id = call->conn->service_id; - call->in_clientflag = call->conn->in_clientflag; + call->epoch = call->conn->proto.epoch; + call->service_id = call->conn->params.service_id; + call->in_clientflag = call->conn->proto.in_clientflag; /* Add the new call to the hashtable */ rxrpc_call_hash_add(call); @@ -349,6 +350,7 @@ static struct rxrpc_call *rxrpc_alloc_client_call( * - called in process context with IRQs enabled */ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, + struct rxrpc_conn_parameters *cp, struct rxrpc_transport *trans, struct rxrpc_conn_bundle *bundle, unsigned long user_call_ID, @@ -361,7 +363,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, rx, trans->debug_id, bundle ? bundle->debug_id : -1, user_call_ID); - call = rxrpc_alloc_client_call(rx, trans, bundle, gfp); + call = rxrpc_alloc_client_call(rx, cp, trans, bundle, gfp); if (IS_ERR(call)) { _leave(" = %ld", PTR_ERR(call)); return call; @@ -524,9 +526,9 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, write_unlock_bh(&rxrpc_call_lock); /* Record copies of information for hashtable lookup */ - call->proto = rx->proto; + call->family = rx->family; call->local = conn->trans->local; - switch (call->proto) { + switch (call->family) { case AF_INET: call->peer_ip.ipv4_addr = conn->trans->peer->srx.transport.sin.sin_addr.s_addr; @@ -539,9 +541,9 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, default: break; } - call->epoch = conn->epoch; - call->service_id = conn->service_id; - call->in_clientflag = conn->in_clientflag; + call->epoch = conn->proto.epoch; + call->service_id = conn->params.service_id; + call->in_clientflag = conn->proto.in_clientflag; /* Add the new call to the hashtable */ rxrpc_call_hash_add(call); diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 00c92b614485..51e280c662e0 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -94,8 +94,8 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, msg.msg_controllen = 0; msg.msg_flags = 0; - whdr.epoch = htonl(conn->epoch); - whdr.cid = htonl(conn->cid); + whdr.epoch = htonl(conn->proto.epoch); + whdr.cid = htonl(conn->proto.cid); whdr.callNumber = 0; whdr.seq = 0; whdr.type = RXRPC_PACKET_TYPE_ABORT; @@ -103,7 +103,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, whdr.userStatus = 0; whdr.securityIndex = conn->security_ix; whdr._rsvd = 0; - whdr.serviceId = htons(conn->service_id); + whdr.serviceId = htons(conn->params.service_id); word = htonl(conn->local_abort); @@ -220,7 +220,7 @@ static void rxrpc_secure_connection(struct rxrpc_connection *conn) ASSERT(conn->security_ix != 0); - if (!conn->key) { + if (!conn->params.key) { _debug("set up security"); ret = rxrpc_init_server_conn_security(conn); switch (ret) { diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 8ecde4b77b55..c6787b6f459f 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -220,7 +220,7 @@ static void rxrpc_assign_connection_id(struct rxrpc_connection *conn) _enter(""); - epoch = conn->epoch; + epoch = conn->proto.epoch; write_lock_bh(&conn->trans->conn_lock); @@ -237,13 +237,13 @@ attempt_insertion: parent = *p; xconn = rb_entry(parent, struct rxrpc_connection, node); - if (epoch < xconn->epoch) + if (epoch < xconn->proto.epoch) p = &(*p)->rb_left; - else if (epoch > xconn->epoch) + else if (epoch > xconn->proto.epoch) p = &(*p)->rb_right; - else if (cid < xconn->cid) + else if (cid < xconn->proto.cid) p = &(*p)->rb_left; - else if (cid > xconn->cid) + else if (cid > xconn->proto.cid) p = &(*p)->rb_right; else goto id_exists; @@ -254,7 +254,7 @@ attempt_insertion: rb_link_node(&conn->node, parent, p); rb_insert_color(&conn->node, &conn->trans->client_conns); - conn->cid = cid; + conn->proto.cid = cid; write_unlock_bh(&conn->trans->conn_lock); _leave(" [CID %x]", cid); return; @@ -275,8 +275,8 @@ id_exists: goto attempt_insertion; xconn = rb_entry(parent, struct rxrpc_connection, node); - if (epoch < xconn->epoch || - cid < xconn->cid) + if (epoch < xconn->proto.epoch || + cid < xconn->proto.cid) goto attempt_insertion; } } @@ -318,8 +318,8 @@ static void rxrpc_add_call_ID_to_conn(struct rxrpc_connection *conn, * connect a call on an exclusive connection */ static int rxrpc_connect_exclusive(struct rxrpc_sock *rx, + struct rxrpc_conn_parameters *cp, struct rxrpc_transport *trans, - u16 service_id, struct rxrpc_call *call, gfp_t gfp) { @@ -340,19 +340,21 @@ static int rxrpc_connect_exclusive(struct rxrpc_sock *rx, conn->trans = trans; conn->bundle = NULL; - conn->service_id = service_id; - conn->epoch = rxrpc_epoch; - conn->in_clientflag = 0; + conn->params = *cp; + conn->proto.local = cp->local; + conn->proto.epoch = rxrpc_epoch; + conn->proto.cid = 0; + conn->proto.in_clientflag = 0; + conn->proto.family = cp->peer->srx.transport.family; conn->out_clientflag = RXRPC_CLIENT_INITIATED; - conn->cid = 0; conn->state = RXRPC_CONN_CLIENT; conn->avail_calls = RXRPC_MAXCALLS - 1; - conn->security_level = rx->min_sec_level; - conn->key = key_get(rx->key); + + key_get(conn->params.key); ret = rxrpc_init_client_conn_security(conn); if (ret < 0) { - key_put(conn->key); + key_put(conn->params.key); kfree(conn); _leave(" = %d [key]", ret); return ret; @@ -389,7 +391,7 @@ found_channel: conn->channels[chan] = call; call->conn = conn; call->channel = chan; - call->cid = conn->cid | chan; + call->cid = conn->proto.cid | chan; call->call_id = ++conn->call_counter; _net("CONNECT client on conn %d chan %d as call %x", @@ -412,6 +414,7 @@ no_free_channels: * - called in process context with IRQs enabled */ int rxrpc_connect_call(struct rxrpc_sock *rx, + struct rxrpc_conn_parameters *cp, struct rxrpc_transport *trans, struct rxrpc_conn_bundle *bundle, struct rxrpc_call *call, @@ -425,8 +428,7 @@ int rxrpc_connect_call(struct rxrpc_sock *rx, _enter("%p,%lx,", rx, call->user_call_ID); if (test_bit(RXRPC_SOCK_EXCLUSIVE_CONN, &rx->flags)) - return rxrpc_connect_exclusive(rx, trans, bundle->service_id, - call, gfp); + return rxrpc_connect_exclusive(rx, cp, trans, call, gfp); spin_lock(&trans->client_lock); for (;;) { @@ -517,19 +519,21 @@ int rxrpc_connect_call(struct rxrpc_sock *rx, candidate->trans = trans; candidate->bundle = bundle; - candidate->service_id = bundle->service_id; - candidate->epoch = rxrpc_epoch; - candidate->in_clientflag = 0; + candidate->params = *cp; + candidate->proto.local = cp->local; + candidate->proto.epoch = rxrpc_epoch; + candidate->proto.cid = 0; + candidate->proto.in_clientflag = 0; + candidate->proto.family = cp->peer->srx.transport.family; candidate->out_clientflag = RXRPC_CLIENT_INITIATED; - candidate->cid = 0; candidate->state = RXRPC_CONN_CLIENT; candidate->avail_calls = RXRPC_MAXCALLS; - candidate->security_level = rx->min_sec_level; - candidate->key = key_get(bundle->key); + + key_get(candidate->params.key); ret = rxrpc_init_client_conn_security(candidate); if (ret < 0) { - key_put(candidate->key); + key_put(candidate->params.key); kfree(candidate); _leave(" = %d [key]", ret); return ret; @@ -577,7 +581,7 @@ found_channel: conn->channels[chan] = call; call->conn = conn; call->channel = chan; - call->cid = conn->cid | chan; + call->cid = conn->proto.cid | chan; call->call_id = ++conn->call_counter; _net("CONNECT client on conn %d chan %d as call %x", @@ -626,15 +630,15 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans, while (p) { conn = rb_entry(p, struct rxrpc_connection, node); - _debug("maybe %x", conn->cid); + _debug("maybe %x", conn->proto.cid); - if (epoch < conn->epoch) + if (epoch < conn->proto.epoch) p = p->rb_left; - else if (epoch > conn->epoch) + else if (epoch > conn->proto.epoch) p = p->rb_right; - else if (cid < conn->cid) + else if (cid < conn->proto.cid) p = p->rb_left; - else if (cid > conn->cid) + else if (cid > conn->proto.cid) p = p->rb_right; else goto found_extant_connection; @@ -650,14 +654,17 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans, } candidate->trans = trans; - candidate->epoch = hdr->epoch; - candidate->cid = hdr->cid & RXRPC_CIDMASK; - candidate->service_id = hdr->serviceId; + candidate->proto.local = trans->local; + candidate->proto.epoch = hdr->epoch; + candidate->proto.cid = hdr->cid & RXRPC_CIDMASK; + candidate->proto.in_clientflag = RXRPC_CLIENT_INITIATED; + candidate->params.local = trans->local; + candidate->params.peer = trans->peer; + candidate->params.service_id = hdr->serviceId; candidate->security_ix = hdr->securityIndex; - candidate->in_clientflag = RXRPC_CLIENT_INITIATED; candidate->out_clientflag = 0; candidate->state = RXRPC_CONN_SERVER; - if (candidate->service_id) + if (candidate->params.service_id) candidate->state = RXRPC_CONN_SERVER_UNSECURED; write_lock_bh(&trans->conn_lock); @@ -668,13 +675,13 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans, p = *pp; conn = rb_entry(p, struct rxrpc_connection, node); - if (epoch < conn->epoch) + if (epoch < conn->proto.epoch) pp = &(*pp)->rb_left; - else if (epoch > conn->epoch) + else if (epoch > conn->proto.epoch) pp = &(*pp)->rb_right; - else if (cid < conn->cid) + else if (cid < conn->proto.cid) pp = &(*pp)->rb_left; - else if (cid > conn->cid) + else if (cid > conn->proto.cid) pp = &(*pp)->rb_right; else goto found_extant_second; @@ -696,7 +703,7 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans, new = "new"; success: - _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->cid); + _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->proto.cid); _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage)); return conn; @@ -754,15 +761,15 @@ struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *trans, while (p) { conn = rb_entry(p, struct rxrpc_connection, node); - _debug("maybe %x", conn->cid); + _debug("maybe %x", conn->proto.cid); - if (epoch < conn->epoch) + if (epoch < conn->proto.epoch) p = p->rb_left; - else if (epoch > conn->epoch) + else if (epoch > conn->proto.epoch) p = p->rb_right; - else if (cid < conn->cid) + else if (cid < conn->proto.cid) p = p->rb_left; - else if (cid > conn->cid) + else if (cid > conn->proto.cid) p = p->rb_right; else goto found; @@ -816,7 +823,7 @@ static void rxrpc_destroy_connection(struct rxrpc_connection *conn) rxrpc_purge_queue(&conn->rx_queue); conn->security->clear(conn); - key_put(conn->key); + key_put(conn->params.key); key_put(conn->server_key); rxrpc_put_transport(conn->trans); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index e11e4d785127..c030abd4d2d8 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -360,7 +360,7 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) case RXRPC_PACKET_TYPE_BUSY: _proto("Rx BUSY %%%u", sp->hdr.serial); - if (call->conn->out_clientflag) + if (rxrpc_conn_is_service(call->conn)) goto protocol_error; write_lock_bh(&call->state_lock); @@ -533,7 +533,7 @@ static void rxrpc_post_packet_to_call(struct rxrpc_call *call, case RXRPC_CALL_COMPLETE: case RXRPC_CALL_CLIENT_FINAL_ACK: /* complete server call */ - if (call->conn->in_clientflag) + if (rxrpc_conn_is_service(call->conn)) goto dead_call; /* resend last packet of a completed call */ _debug("final ack again"); diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 4ad56fafe3a7..18c737a61d80 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -987,7 +987,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *conn, if (ret < 0) goto error; - conn->key = key; + conn->params.key = key; _leave(" = 0 [%d]", key_serial(key)); return 0; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index e6fb3863b0bc..8c51745cccea 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -133,6 +133,7 @@ static struct rxrpc_call * rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, unsigned long user_call_ID) { + struct rxrpc_conn_parameters cp; struct rxrpc_conn_bundle *bundle; struct rxrpc_transport *trans; struct rxrpc_call *call; @@ -146,23 +147,32 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, if (!msg->msg_name) return ERR_PTR(-EDESTADDRREQ); - trans = rxrpc_name_to_transport(rx, msg->msg_name, msg->msg_namelen, 0, + key = rx->key; + if (key && !rx->key->payload.data[0]) + key = NULL; + + memset(&cp, 0, sizeof(cp)); + cp.local = rx->local; + cp.key = rx->key; + cp.security_level = rx->min_sec_level; + cp.exclusive = test_bit(RXRPC_SOCK_EXCLUSIVE_CONN, &rx->flags); + cp.service_id = srx->srx_service; + trans = rxrpc_name_to_transport(&cp, msg->msg_name, msg->msg_namelen, GFP_KERNEL); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } + cp.peer = trans->peer; - key = rx->key; - if (key && !rx->key->payload.data[0]) - key = NULL; - bundle = rxrpc_get_bundle(rx, trans, key, srx->srx_service, GFP_KERNEL); + bundle = rxrpc_get_bundle(rx, trans, cp.key, srx->srx_service, + GFP_KERNEL); if (IS_ERR(bundle)) { ret = PTR_ERR(bundle); goto out_trans; } - call = rxrpc_new_client_call(rx, trans, bundle, user_call_ID, + call = rxrpc_new_client_call(rx, &cp, trans, bundle, user_call_ID, GFP_KERNEL); rxrpc_put_bundle(trans, bundle); rxrpc_put_transport(trans); @@ -664,7 +674,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, seq = atomic_inc_return(&call->sequence); - sp->hdr.epoch = conn->epoch; + sp->hdr.epoch = conn->proto.epoch; sp->hdr.cid = call->cid; sp->hdr.callNumber = call->call_id; sp->hdr.seq = seq; diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 225163bc658d..bbee05850801 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -74,10 +74,10 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) " %-8.8s %08x %lx\n", lbuff, rbuff, - call->conn->service_id, + call->conn->params.service_id, call->cid, call->call_id, - call->conn->in_clientflag ? "Svc" : "Clt", + rxrpc_conn_is_service(call->conn) ? "Svc" : "Clt", atomic_read(&call->usage), rxrpc_call_states[call->state], call->remote_abort ?: call->local_abort, @@ -157,13 +157,13 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) " %s %08x %08x %08x\n", lbuff, rbuff, - conn->service_id, - conn->cid, + conn->params.service_id, + conn->proto.cid, conn->call_counter, - conn->in_clientflag ? "Svc" : "Clt", + rxrpc_conn_is_service(conn) ? "Svc" : "Clt", atomic_read(&conn->usage), rxrpc_conn_states[conn->state], - key_serial(conn->key), + key_serial(conn->params.key), atomic_read(&conn->serial), atomic_read(&conn->hi_serial)); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 59706b9f2f7a..c5bac4e0db71 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -205,7 +205,7 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, /* we transferred the whole data packet */ if (sp->hdr.flags & RXRPC_LAST_PACKET) { _debug("last"); - if (call->conn->out_clientflag) { + if (rxrpc_conn_is_client(call->conn)) { /* last byte of reply received */ ret = copied; goto terminal_message; diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 36a634027d9d..134c2713ae23 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -58,9 +58,9 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn) struct rxrpc_key_token *token; int ret; - _enter("{%d},{%x}", conn->debug_id, key_serial(conn->key)); + _enter("{%d},{%x}", conn->debug_id, key_serial(conn->params.key)); - token = conn->key->payload.data[0]; + token = conn->params.key->payload.data[0]; conn->security_ix = token->security_index; ci = crypto_alloc_skcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC); @@ -74,7 +74,7 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn) sizeof(token->kad->session_key)) < 0) BUG(); - switch (conn->security_level) { + switch (conn->params.security_level) { case RXRPC_SECURITY_PLAIN: break; case RXRPC_SECURITY_AUTH: @@ -115,14 +115,14 @@ static void rxkad_prime_packet_security(struct rxrpc_connection *conn) _enter(""); - if (!conn->key) + if (!conn->params.key) return; - token = conn->key->payload.data[0]; + token = conn->params.key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); - tmpbuf.x[0] = htonl(conn->epoch); - tmpbuf.x[1] = htonl(conn->cid); + tmpbuf.x[0] = htonl(conn->proto.epoch); + tmpbuf.x[1] = htonl(conn->proto.cid); tmpbuf.x[2] = 0; tmpbuf.x[3] = htonl(conn->security_ix); @@ -220,7 +220,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, rxkhdr.checksum = 0; /* encrypt from the session key */ - token = call->conn->key->payload.data[0]; + token = call->conn->params.key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); sg_init_one(&sg[0], sechdr, sizeof(rxkhdr)); @@ -277,13 +277,13 @@ static int rxkad_secure_packet(const struct rxrpc_call *call, sp = rxrpc_skb(skb); _enter("{%d{%x}},{#%u},%zu,", - call->debug_id, key_serial(call->conn->key), sp->hdr.seq, - data_size); + call->debug_id, key_serial(call->conn->params.key), + sp->hdr.seq, data_size); if (!call->conn->cipher) return 0; - ret = key_validate(call->conn->key); + ret = key_validate(call->conn->params.key); if (ret < 0) return ret; @@ -312,7 +312,7 @@ static int rxkad_secure_packet(const struct rxrpc_call *call, y = 1; /* zero checksums are not permitted */ sp->hdr.cksum = y; - switch (call->conn->security_level) { + switch (call->conn->params.security_level) { case RXRPC_SECURITY_PLAIN: ret = 0; break; @@ -446,7 +446,7 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, skb_to_sgvec(skb, sg, 0, skb->len); /* decrypt from the session key */ - token = call->conn->key->payload.data[0]; + token = call->conn->params.key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); skcipher_request_set_tfm(req, call->conn->cipher); @@ -516,7 +516,7 @@ static int rxkad_verify_packet(const struct rxrpc_call *call, sp = rxrpc_skb(skb); _enter("{%d{%x}},{#%u}", - call->debug_id, key_serial(call->conn->key), sp->hdr.seq); + call->debug_id, key_serial(call->conn->params.key), sp->hdr.seq); if (!call->conn->cipher) return 0; @@ -557,7 +557,7 @@ static int rxkad_verify_packet(const struct rxrpc_call *call, return -EPROTO; } - switch (call->conn->security_level) { + switch (call->conn->params.security_level) { case RXRPC_SECURITY_PLAIN: ret = 0; break; @@ -589,9 +589,9 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) u32 serial; int ret; - _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); + _enter("{%d,%x}", conn->debug_id, key_serial(conn->params.key)); - ret = key_validate(conn->key); + ret = key_validate(conn->params.key); if (ret < 0) return ret; @@ -608,8 +608,8 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) msg.msg_controllen = 0; msg.msg_flags = 0; - whdr.epoch = htonl(conn->epoch); - whdr.cid = htonl(conn->cid); + whdr.epoch = htonl(conn->proto.epoch); + whdr.cid = htonl(conn->proto.cid); whdr.callNumber = 0; whdr.seq = 0; whdr.type = RXRPC_PACKET_TYPE_CHALLENGE; @@ -617,7 +617,7 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) whdr.userStatus = 0; whdr.securityIndex = conn->security_ix; whdr._rsvd = 0; - whdr.serviceId = htons(conn->service_id); + whdr.serviceId = htons(conn->params.service_id); iov[0].iov_base = &whdr; iov[0].iov_len = sizeof(whdr); @@ -771,14 +771,14 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, u32 version, nonce, min_level, abort_code; int ret; - _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); + _enter("{%d,%x}", conn->debug_id, key_serial(conn->params.key)); - if (!conn->key) { + if (!conn->params.key) { _leave(" = -EPROTO [no key]"); return -EPROTO; } - ret = key_validate(conn->key); + ret = key_validate(conn->params.key); if (ret < 0) { *_abort_code = RXKADEXPIRED; return ret; @@ -801,20 +801,20 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, goto protocol_error; abort_code = RXKADLEVELFAIL; - if (conn->security_level < min_level) + if (conn->params.security_level < min_level) goto protocol_error; - token = conn->key->payload.data[0]; + token = conn->params.key->payload.data[0]; /* build the response packet */ memset(&resp, 0, sizeof(resp)); resp.version = htonl(RXKAD_VERSION); - resp.encrypted.epoch = htonl(conn->epoch); - resp.encrypted.cid = htonl(conn->cid); + resp.encrypted.epoch = htonl(conn->proto.epoch); + resp.encrypted.cid = htonl(conn->proto.cid); resp.encrypted.securityIndex = htonl(conn->security_ix); resp.encrypted.inc_nonce = htonl(nonce + 1); - resp.encrypted.level = htonl(conn->security_level); + resp.encrypted.level = htonl(conn->params.security_level); resp.kvno = htonl(token->kad->kvno); resp.ticket_len = htonl(token->kad->ticket_len); @@ -1096,9 +1096,9 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, rxkad_decrypt_response(conn, &response, &session_key); abort_code = RXKADSEALEDINCON; - if (ntohl(response.encrypted.epoch) != conn->epoch) + if (ntohl(response.encrypted.epoch) != conn->proto.epoch) goto protocol_error_free; - if (ntohl(response.encrypted.cid) != conn->cid) + if (ntohl(response.encrypted.cid) != conn->proto.cid) goto protocol_error_free; if (ntohl(response.encrypted.securityIndex) != conn->security_ix) goto protocol_error_free; @@ -1122,7 +1122,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, level = ntohl(response.encrypted.level); if (level > RXRPC_SECURITY_ENCRYPT) goto protocol_error_free; - conn->security_level = level; + conn->params.security_level = level; /* create a key to hold the security data and expiration time - after * this the connection security can be handled in exactly the same way diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index d223253b22fa..40955d0f2693 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -76,7 +76,7 @@ int rxrpc_init_client_conn_security(struct rxrpc_connection *conn) { const struct rxrpc_security *sec; struct rxrpc_key_token *token; - struct key *key = conn->key; + struct key *key = conn->params.key; int ret; _enter("{%d},{%x}", conn->debug_id, key_serial(key)); @@ -121,7 +121,7 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) _enter(""); - sprintf(kdesc, "%u:%u", conn->service_id, conn->security_ix); + sprintf(kdesc, "%u:%u", conn->params.service_id, conn->security_ix); sec = rxrpc_security_lookup(conn->security_ix); if (!sec) { @@ -132,7 +132,7 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) /* find the service */ read_lock_bh(&local->services_lock); list_for_each_entry(rx, &local->services, listen_link) { - if (rx->srx.srx_service == conn->service_id) + if (rx->srx.srx_service == conn->params.service_id) goto found_service; } -- cgit v1.2.3 From 85f32278bd98fa89dff528b0baea4ae6eea4cc5d Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 4 Apr 2016 14:00:36 +0100 Subject: rxrpc: Replace conn->trans->{local,peer} with conn->params.{local,peer} Replace accesses of conn->trans->{local,peer} with conn->params.{local,peer} thus making it easier for a future commit to remove the rxrpc_transport struct. This also reduces the number of memory accesses involved. Signed-off-by: David Howells --- net/rxrpc/call_event.c | 12 ++++++------ net/rxrpc/call_object.c | 28 ++++++++++++++-------------- net/rxrpc/conn_event.c | 6 +++--- net/rxrpc/input.c | 2 +- net/rxrpc/output.c | 2 +- net/rxrpc/proc.c | 22 ++++++++++------------ net/rxrpc/recvmsg.c | 4 ++-- net/rxrpc/rxkad.c | 12 ++++++------ net/rxrpc/security.c | 2 +- 9 files changed, 44 insertions(+), 46 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 1571dfb95aa3..b43faf573ed3 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -545,7 +545,7 @@ static void rxrpc_extract_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, mtu = min(ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU)); - peer = call->conn->trans->peer; + peer = call->conn->params.peer; if (mtu < peer->maxdata) { spin_lock_bh(&peer->lock); peer->maxdata = mtu; @@ -836,8 +836,8 @@ void rxrpc_process_call(struct work_struct *work) /* there's a good chance we're going to have to send a message, so set * one up in advance */ - msg.msg_name = &call->conn->trans->peer->srx.transport; - msg.msg_namelen = call->conn->trans->peer->srx.transport_len; + msg.msg_name = &call->conn->params.peer->srx.transport; + msg.msg_namelen = call->conn->params.peer->srx.transport_len; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -1151,8 +1151,8 @@ send_ACK_with_skew: ack.maxSkew = htons(atomic_read(&call->conn->hi_serial) - ntohl(ack.serial)); send_ACK: - mtu = call->conn->trans->peer->if_mtu; - mtu -= call->conn->trans->peer->hdrsize; + mtu = call->conn->params.peer->if_mtu; + mtu -= call->conn->params.peer->hdrsize; ackinfo.maxMTU = htonl(mtu); ackinfo.rwind = htonl(rxrpc_rx_window_size); @@ -1206,7 +1206,7 @@ send_message_2: len += iov[1].iov_len; } - ret = kernel_sendmsg(call->conn->trans->local->socket, + ret = kernel_sendmsg(call->conn->params.local->socket, &msg, iov, ioc, len); if (ret < 0) { _debug("sendmsg failed: %d", ret); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index b7c6011c71bb..5c2dceaf6a9c 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -122,7 +122,7 @@ static void rxrpc_call_hash_add(struct rxrpc_call *call) key = rxrpc_call_hashfunc(call->in_clientflag, call->cid, call->call_id, call->epoch, call->service_id, call->family, - call->conn->trans->local, addr_size, + call->conn->params.local, addr_size, call->peer_ip.ipv6_addr); /* Store the full key in the call */ call->hash_key = key; @@ -320,11 +320,11 @@ static struct rxrpc_call *rxrpc_alloc_client_call( switch (call->family) { case AF_INET: call->peer_ip.ipv4_addr = - trans->peer->srx.transport.sin.sin_addr.s_addr; + call->conn->params.peer->srx.transport.sin.sin_addr.s_addr; break; case AF_INET6: memcpy(call->peer_ip.ipv6_addr, - trans->peer->srx.transport.sin6.sin6_addr.in6_u.u6_addr8, + call->conn->params.peer->srx.transport.sin6.sin6_addr.in6_u.u6_addr8, sizeof(call->peer_ip.ipv6_addr)); break; } @@ -334,9 +334,9 @@ static struct rxrpc_call *rxrpc_alloc_client_call( /* Add the new call to the hashtable */ rxrpc_call_hash_add(call); - spin_lock(&call->conn->trans->peer->lock); - hlist_add_head(&call->error_link, &call->conn->trans->peer->error_targets); - spin_unlock(&call->conn->trans->peer->lock); + spin_lock(&call->conn->params.peer->lock); + hlist_add_head(&call->error_link, &call->conn->params.peer->error_targets); + spin_unlock(&call->conn->params.peer->lock); call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime; add_timer(&call->lifetimer); @@ -517,9 +517,9 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, atomic_inc(&conn->usage); write_unlock_bh(&conn->lock); - spin_lock(&conn->trans->peer->lock); - hlist_add_head(&call->error_link, &conn->trans->peer->error_targets); - spin_unlock(&conn->trans->peer->lock); + spin_lock(&conn->params.peer->lock); + hlist_add_head(&call->error_link, &conn->params.peer->error_targets); + spin_unlock(&conn->params.peer->lock); write_lock_bh(&rxrpc_call_lock); list_add_tail(&call->link, &rxrpc_calls); @@ -527,15 +527,15 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, /* Record copies of information for hashtable lookup */ call->family = rx->family; - call->local = conn->trans->local; + call->local = conn->params.local; switch (call->family) { case AF_INET: call->peer_ip.ipv4_addr = - conn->trans->peer->srx.transport.sin.sin_addr.s_addr; + conn->params.peer->srx.transport.sin.sin_addr.s_addr; break; case AF_INET6: memcpy(call->peer_ip.ipv6_addr, - conn->trans->peer->srx.transport.sin6.sin6_addr.in6_u.u6_addr8, + conn->params.peer->srx.transport.sin6.sin6_addr.in6_u.u6_addr8, sizeof(call->peer_ip.ipv6_addr)); break; default: @@ -813,9 +813,9 @@ static void rxrpc_cleanup_call(struct rxrpc_call *call) } if (call->conn) { - spin_lock(&call->conn->trans->peer->lock); + spin_lock(&call->conn->params.peer->lock); hlist_del_init(&call->error_link); - spin_unlock(&call->conn->trans->peer->lock); + spin_unlock(&call->conn->params.peer->lock); write_lock_bh(&call->conn->lock); rb_erase(&call->conn_node, &call->conn->calls); diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 51e280c662e0..a022439f6f5a 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -88,8 +88,8 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, abort_code); - msg.msg_name = &conn->trans->peer->srx.transport; - msg.msg_namelen = conn->trans->peer->srx.transport_len; + msg.msg_name = &conn->params.peer->srx.transport; + msg.msg_namelen = conn->params.peer->srx.transport_len; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -118,7 +118,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, whdr.serial = htonl(serial); _proto("Tx CONN ABORT %%%u { %d }", serial, conn->local_abort); - ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len); + ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); if (ret < 0) { _debug("sendmsg failed: %d", ret); return -EAGAIN; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index c030abd4d2d8..6af7f40c5030 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -560,7 +560,7 @@ static void rxrpc_post_packet_to_call(struct rxrpc_call *call, dead_call: if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) { skb->priority = RX_CALL_DEAD; - rxrpc_reject_packet(call->conn->trans->local, skb); + rxrpc_reject_packet(call->conn->params.local, skb); goto unlock; } free_unlock: diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 8c51745cccea..becbaa7c0a7c 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -583,7 +583,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, goto maybe_error; } - max = call->conn->trans->peer->maxdata; + max = call->conn->params.peer->maxdata; max -= call->conn->security_size; max &= ~(call->conn->size_align - 1UL); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index bbee05850801..9863270691d7 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -46,7 +46,7 @@ static void rxrpc_call_seq_stop(struct seq_file *seq, void *v) static int rxrpc_call_seq_show(struct seq_file *seq, void *v) { - struct rxrpc_transport *trans; + struct rxrpc_connection *conn; struct rxrpc_call *call; char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1]; @@ -59,15 +59,15 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) } call = list_entry(v, struct rxrpc_call, link); - trans = call->conn->trans; + conn = call->conn; sprintf(lbuff, "%pI4:%u", - &trans->local->srx.transport.sin.sin_addr, - ntohs(trans->local->srx.transport.sin.sin_port)); + &conn->params.local->srx.transport.sin.sin_addr, + ntohs(conn->params.local->srx.transport.sin.sin_port)); sprintf(rbuff, "%pI4:%u", - &trans->peer->srx.transport.sin.sin_addr, - ntohs(trans->peer->srx.transport.sin.sin_port)); + &conn->params.peer->srx.transport.sin.sin_addr, + ntohs(conn->params.peer->srx.transport.sin.sin_port)); seq_printf(seq, "UDP %-22.22s %-22.22s %4x %08x %08x %s %3u" @@ -129,7 +129,6 @@ static void rxrpc_connection_seq_stop(struct seq_file *seq, void *v) static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) { struct rxrpc_connection *conn; - struct rxrpc_transport *trans; char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1]; if (v == &rxrpc_connections) { @@ -142,15 +141,14 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) } conn = list_entry(v, struct rxrpc_connection, link); - trans = conn->trans; sprintf(lbuff, "%pI4:%u", - &trans->local->srx.transport.sin.sin_addr, - ntohs(trans->local->srx.transport.sin.sin_port)); + &conn->params.local->srx.transport.sin.sin_addr, + ntohs(conn->params.local->srx.transport.sin.sin_port)); sprintf(rbuff, "%pI4:%u", - &trans->peer->srx.transport.sin.sin_addr, - ntohs(trans->peer->srx.transport.sin.sin_port)); + &conn->params.peer->srx.transport.sin.sin_addr, + ntohs(conn->params.peer->srx.transport.sin.sin_port)); seq_printf(seq, "UDP %-22.22s %-22.22s %4x %08x %08x %s %3u" diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index c5bac4e0db71..a3fa2ed85d63 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -147,9 +147,9 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (!continue_call) { if (msg->msg_name) { size_t len = - sizeof(call->conn->trans->peer->srx); + sizeof(call->conn->params.peer->srx); memcpy(msg->msg_name, - &call->conn->trans->peer->srx, len); + &call->conn->params.peer->srx, len); msg->msg_namelen = len; } sock_recv_timestamp(msg, &rx->sk, skb); diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 134c2713ae23..23c05ec6fa28 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -602,8 +602,8 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) challenge.min_level = htonl(0); challenge.__padding = 0; - msg.msg_name = &conn->trans->peer->srx.transport.sin; - msg.msg_namelen = sizeof(conn->trans->peer->srx.transport.sin); + msg.msg_name = &conn->params.peer->srx.transport.sin; + msg.msg_namelen = sizeof(conn->params.peer->srx.transport.sin); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -630,7 +630,7 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) whdr.serial = htonl(serial); _proto("Tx CHALLENGE %%%u", serial); - ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len); + ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); if (ret < 0) { _debug("sendmsg failed: %d", ret); return -EAGAIN; @@ -657,8 +657,8 @@ static int rxkad_send_response(struct rxrpc_connection *conn, _enter(""); - msg.msg_name = &conn->trans->peer->srx.transport.sin; - msg.msg_namelen = sizeof(conn->trans->peer->srx.transport.sin); + msg.msg_name = &conn->params.peer->srx.transport.sin; + msg.msg_namelen = sizeof(conn->params.peer->srx.transport.sin); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -684,7 +684,7 @@ static int rxkad_send_response(struct rxrpc_connection *conn, whdr.serial = htonl(serial); _proto("Tx RESPONSE %%%u", serial); - ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 3, len); + ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 3, len); if (ret < 0) { _debug("sendmsg failed: %d", ret); return -EAGAIN; diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index 40955d0f2693..814d285ff802 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -113,7 +113,7 @@ int rxrpc_init_client_conn_security(struct rxrpc_connection *conn) int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) { const struct rxrpc_security *sec; - struct rxrpc_local *local = conn->trans->local; + struct rxrpc_local *local = conn->params.local; struct rxrpc_sock *rx; struct key *key; key_ref_t kref; -- cgit v1.2.3 From cc8feb8edd92d854be552fe4f5e0eeabca40b9ee Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 4 Apr 2016 14:00:37 +0100 Subject: rxrpc: Fix exclusive connection handling "Exclusive connections" are meant to be used for a single client call and then scrapped. The idea is to limit the use of the negotiated security context. The current code, however, isn't doing this: it is instead restricting the socket to a single virtual connection and doing all the calls over that. This is changed such that the socket no longer maintains a special virtual connection over which it will do all the calls, but rather gets a new one each time a new exclusive call is made. Further, using a socket option for this is a poor choice. It should be done on sendmsg with a control message marker instead so that calls can be marked exclusive individually. To that end, add RXRPC_EXCLUSIVE_CALL which, if passed to sendmsg() as a control message element, will cause the call to be done on an single-use connection. The socket option (RXRPC_EXCLUSIVE_CONNECTION) still exists and, if set, will override any lack of RXRPC_EXCLUSIVE_CALL being specified so that programs using the setsockopt() will appear to work the same. Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 7 +--- net/rxrpc/ar-internal.h | 7 ++-- net/rxrpc/conn_object.c | 97 ++++++++++++++++++++----------------------------- net/rxrpc/output.c | 19 +++++++--- 4 files changed, 58 insertions(+), 72 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 48b45a0280c0..73f5c553eef4 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -494,7 +494,7 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, ret = -EISCONN; if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; - set_bit(RXRPC_SOCK_EXCLUSIVE_CONN, &rx->flags); + rx->exclusive = true; goto success; case RXRPC_SECURITY_KEY: @@ -669,11 +669,6 @@ static int rxrpc_release_sock(struct sock *sk) flush_workqueue(rxrpc_workqueue); rxrpc_purge_queue(&sk->sk_receive_queue); - if (rx->conn) { - rxrpc_put_connection(rx->conn); - rx->conn = NULL; - } - if (rx->local) { rxrpc_put_local(rx->local); rx->local = NULL; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index efe6673deb28..4ca99445e0b7 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -37,6 +37,8 @@ struct rxrpc_crypt { #define rxrpc_queue_call(CALL) rxrpc_queue_work(&(CALL)->processor) #define rxrpc_queue_conn(CONN) rxrpc_queue_work(&(CONN)->processor) +struct rxrpc_connection; + /* * sk_state for RxRPC sockets */ @@ -57,7 +59,6 @@ struct rxrpc_sock { struct sock sk; rxrpc_interceptor_t interceptor; /* kernel service Rx interceptor function */ struct rxrpc_local *local; /* local endpoint */ - struct rxrpc_connection *conn; /* exclusive virtual connection */ struct list_head listen_link; /* link in the local endpoint's listen list */ struct list_head secureq; /* calls awaiting connection security clearance */ struct list_head acceptq; /* calls awaiting acceptance */ @@ -66,13 +67,13 @@ struct rxrpc_sock { struct rb_root calls; /* outstanding calls on this socket */ unsigned long flags; #define RXRPC_SOCK_CONNECTED 0 /* connect_srx is set */ -#define RXRPC_SOCK_EXCLUSIVE_CONN 1 /* exclusive connection for a client socket */ rwlock_t call_lock; /* lock for calls */ u32 min_sec_level; /* minimum security level */ #define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT + bool exclusive; /* Exclusive connection for a client socket */ + sa_family_t family; /* Protocol family created with */ struct sockaddr_rxrpc srx; /* local address */ struct sockaddr_rxrpc connect_srx; /* Default client address from connect() */ - sa_family_t family; /* protocol family created with */ }; #define rxrpc_sk(__sk) container_of((__sk), struct rxrpc_sock, sk) diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index c6787b6f459f..6164373d6ce3 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -328,71 +328,57 @@ static int rxrpc_connect_exclusive(struct rxrpc_sock *rx, _enter(""); - conn = rx->conn; + conn = rxrpc_alloc_connection(gfp); if (!conn) { - /* not yet present - create a candidate for a new connection - * and then redo the check */ - conn = rxrpc_alloc_connection(gfp); - if (!conn) { - _leave(" = -ENOMEM"); - return -ENOMEM; - } + _leave(" = -ENOMEM"); + return -ENOMEM; + } - conn->trans = trans; - conn->bundle = NULL; - conn->params = *cp; - conn->proto.local = cp->local; - conn->proto.epoch = rxrpc_epoch; - conn->proto.cid = 0; - conn->proto.in_clientflag = 0; - conn->proto.family = cp->peer->srx.transport.family; - conn->out_clientflag = RXRPC_CLIENT_INITIATED; - conn->state = RXRPC_CONN_CLIENT; - conn->avail_calls = RXRPC_MAXCALLS - 1; - - key_get(conn->params.key); - - ret = rxrpc_init_client_conn_security(conn); - if (ret < 0) { - key_put(conn->params.key); - kfree(conn); - _leave(" = %d [key]", ret); - return ret; - } + conn->trans = trans; + conn->bundle = NULL; + conn->params = *cp; + conn->proto.local = cp->local; + conn->proto.epoch = rxrpc_epoch; + conn->proto.cid = 0; + conn->proto.in_clientflag = 0; + conn->proto.family = cp->peer->srx.transport.family; + conn->out_clientflag = RXRPC_CLIENT_INITIATED; + conn->state = RXRPC_CONN_CLIENT; + conn->avail_calls = RXRPC_MAXCALLS - 1; + + key_get(conn->params.key); + + ret = rxrpc_init_client_conn_security(conn); + if (ret < 0) { + key_put(conn->params.key); + kfree(conn); + _leave(" = %d [key]", ret); + return ret; + } - write_lock_bh(&rxrpc_connection_lock); - list_add_tail(&conn->link, &rxrpc_connections); - write_unlock_bh(&rxrpc_connection_lock); + write_lock_bh(&rxrpc_connection_lock); + list_add_tail(&conn->link, &rxrpc_connections); + write_unlock_bh(&rxrpc_connection_lock); - spin_lock(&trans->client_lock); - atomic_inc(&trans->usage); + spin_lock(&trans->client_lock); + atomic_inc(&trans->usage); - _net("CONNECT EXCL new %d on TRANS %d", - conn->debug_id, conn->trans->debug_id); + _net("CONNECT EXCL new %d on TRANS %d", + conn->debug_id, conn->trans->debug_id); - rxrpc_assign_connection_id(conn); - rx->conn = conn; - } else { - spin_lock(&trans->client_lock); - } + rxrpc_assign_connection_id(conn); - /* we've got a connection with a free channel and we can now attach the - * call to it - * - we're holding the transport's client lock - * - we're holding a reference on the connection + /* Since no one else can use the connection, we just use the first + * channel. */ - for (chan = 0; chan < RXRPC_MAXCALLS; chan++) - if (!conn->channels[chan]) - goto found_channel; - goto no_free_channels; - -found_channel: + chan = 0; atomic_inc(&conn->usage); conn->channels[chan] = call; + conn->call_counter = 1; call->conn = conn; call->channel = chan; call->cid = conn->proto.cid | chan; - call->call_id = ++conn->call_counter; + call->call_id = 1; _net("CONNECT client on conn %d chan %d as call %x", conn->debug_id, chan, call->call_id); @@ -402,11 +388,6 @@ found_channel: rxrpc_add_call_ID_to_conn(conn, call); _leave(" = 0"); return 0; - -no_free_channels: - spin_unlock(&trans->client_lock); - _leave(" = -ENOSR"); - return -ENOSR; } /* @@ -427,7 +408,7 @@ int rxrpc_connect_call(struct rxrpc_sock *rx, _enter("%p,%lx,", rx, call->user_call_ID); - if (test_bit(RXRPC_SOCK_EXCLUSIVE_CONN, &rx->flags)) + if (cp->exclusive) return rxrpc_connect_exclusive(rx, cp, trans, call, gfp); spin_lock(&trans->client_lock); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index becbaa7c0a7c..6f8ab0ef839f 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -35,7 +35,8 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, static int rxrpc_sendmsg_cmsg(struct msghdr *msg, unsigned long *user_call_ID, enum rxrpc_command *command, - u32 *abort_code) + u32 *abort_code, + bool *_exclusive) { struct cmsghdr *cmsg; bool got_user_ID = false; @@ -93,6 +94,11 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, return -EINVAL; break; + case RXRPC_EXCLUSIVE_CALL: + *_exclusive = true; + if (len != 0) + return -EINVAL; + break; default: return -EINVAL; } @@ -131,7 +137,7 @@ static void rxrpc_send_abort(struct rxrpc_call *call, u32 abort_code) */ static struct rxrpc_call * rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, - unsigned long user_call_ID) + unsigned long user_call_ID, bool exclusive) { struct rxrpc_conn_parameters cp; struct rxrpc_conn_bundle *bundle; @@ -155,7 +161,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, cp.local = rx->local; cp.key = rx->key; cp.security_level = rx->min_sec_level; - cp.exclusive = test_bit(RXRPC_SOCK_EXCLUSIVE_CONN, &rx->flags); + cp.exclusive = rx->exclusive | exclusive; cp.service_id = srx->srx_service; trans = rxrpc_name_to_transport(&cp, msg->msg_name, msg->msg_namelen, GFP_KERNEL); @@ -201,12 +207,14 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) enum rxrpc_command cmd; struct rxrpc_call *call; unsigned long user_call_ID = 0; + bool exclusive = false; u32 abort_code = 0; int ret; _enter(""); - ret = rxrpc_sendmsg_cmsg(msg, &user_call_ID, &cmd, &abort_code); + ret = rxrpc_sendmsg_cmsg(msg, &user_call_ID, &cmd, &abort_code, + &exclusive); if (ret < 0) return ret; @@ -224,7 +232,8 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) if (!call) { if (cmd != RXRPC_CMD_SEND_DATA) return -EBADSLT; - call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID); + call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID, + exclusive); if (IS_ERR(call)) return PTR_ERR(call); } -- cgit v1.2.3 From 42886ffe77f142c36ecf585d60fff2edd06b5be8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Jun 2016 13:31:07 +0100 Subject: rxrpc: Pass sk_buff * rather than rxrpc_host_header * to functions Pass a pointer to struct sk_buff rather than struct rxrpc_host_header to functions so that they can in the future get at transport protocol parameters rather than just RxRPC parameters. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 6 +++--- net/rxrpc/call_accept.c | 4 ++-- net/rxrpc/call_object.c | 21 +++++++++++---------- net/rxrpc/conn_object.c | 49 +++++++++++++++++++++++++------------------------ net/rxrpc/input.c | 7 +++---- 5 files changed, 44 insertions(+), 43 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 4ca99445e0b7..60ba22f56957 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -544,7 +544,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, unsigned long, gfp_t); struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_connection *, - struct rxrpc_host_header *); + struct sk_buff *); void rxrpc_release_call(struct rxrpc_call *); void rxrpc_release_calls_on_socket(struct rxrpc_sock *); void __rxrpc_put_call(struct rxrpc_call *); @@ -574,9 +574,9 @@ int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, void rxrpc_put_connection(struct rxrpc_connection *); void __exit rxrpc_destroy_all_connections(void); struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *, - struct rxrpc_host_header *); + struct sk_buff *); extern struct rxrpc_connection * -rxrpc_incoming_connection(struct rxrpc_transport *, struct rxrpc_host_header *); +rxrpc_incoming_connection(struct rxrpc_transport *, struct sk_buff *); static inline bool rxrpc_conn_is_client(const struct rxrpc_connection *conn) { diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 553b67c144e5..5a70dc4e28c6 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -110,7 +110,7 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local, goto error; } - conn = rxrpc_incoming_connection(trans, &sp->hdr); + conn = rxrpc_incoming_connection(trans, skb); rxrpc_put_transport(trans); if (IS_ERR(conn)) { _debug("no conn"); @@ -118,7 +118,7 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local, goto error; } - call = rxrpc_incoming_call(rx, conn, &sp->hdr); + call = rxrpc_incoming_call(rx, conn, skb); rxrpc_put_connection(conn); if (IS_ERR(call)) { _debug("no call"); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 5c2dceaf6a9c..d83f2cbb80a9 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -421,8 +421,9 @@ found_user_ID_now_present: */ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, struct rxrpc_connection *conn, - struct rxrpc_host_header *hdr) + struct sk_buff *skb) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_call *call, *candidate; struct rb_node **p, *parent; u32 call_id; @@ -435,13 +436,13 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, if (!candidate) return ERR_PTR(-EBUSY); - candidate->socket = rx; - candidate->conn = conn; - candidate->cid = hdr->cid; - candidate->call_id = hdr->callNumber; - candidate->channel = hdr->cid & RXRPC_CHANNELMASK; - candidate->rx_data_post = 0; - candidate->state = RXRPC_CALL_SERVER_ACCEPTING; + candidate->socket = rx; + candidate->conn = conn; + candidate->cid = sp->hdr.cid; + candidate->call_id = sp->hdr.callNumber; + candidate->channel = sp->hdr.cid & RXRPC_CHANNELMASK; + candidate->rx_data_post = 0; + candidate->state = RXRPC_CALL_SERVER_ACCEPTING; if (conn->security_ix > 0) candidate->state = RXRPC_CALL_SERVER_SECURING; @@ -450,7 +451,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, /* set the channel for this call */ call = conn->channels[candidate->channel]; _debug("channel[%u] is %p", candidate->channel, call); - if (call && call->call_id == hdr->callNumber) { + if (call && call->call_id == sp->hdr.callNumber) { /* already set; must've been a duplicate packet */ _debug("extant call [%d]", call->state); ASSERTCMP(call->conn, ==, conn); @@ -488,7 +489,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, /* check the call number isn't duplicate */ _debug("check dup"); - call_id = hdr->callNumber; + call_id = sp->hdr.callNumber; p = &conn->calls.rb_node; parent = NULL; while (*p) { diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 6164373d6ce3..3b42fc4f4fe3 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -588,10 +588,10 @@ interrupted: * get a record of an incoming connection */ struct rxrpc_connection * -rxrpc_incoming_connection(struct rxrpc_transport *trans, - struct rxrpc_host_header *hdr) +rxrpc_incoming_connection(struct rxrpc_transport *trans, struct sk_buff *skb) { struct rxrpc_connection *conn, *candidate = NULL; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rb_node *p, **pp; const char *new = "old"; __be32 epoch; @@ -599,10 +599,10 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans, _enter(""); - ASSERT(hdr->flags & RXRPC_CLIENT_INITIATED); + ASSERT(sp->hdr.flags & RXRPC_CLIENT_INITIATED); - epoch = hdr->epoch; - cid = hdr->cid & RXRPC_CIDMASK; + epoch = sp->hdr.epoch; + cid = sp->hdr.cid & RXRPC_CIDMASK; /* search the connection list first */ read_lock_bh(&trans->conn_lock); @@ -634,19 +634,19 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans, return ERR_PTR(-ENOMEM); } - candidate->trans = trans; - candidate->proto.local = trans->local; - candidate->proto.epoch = hdr->epoch; - candidate->proto.cid = hdr->cid & RXRPC_CIDMASK; - candidate->proto.in_clientflag = RXRPC_CLIENT_INITIATED; - candidate->params.local = trans->local; - candidate->params.peer = trans->peer; - candidate->params.service_id = hdr->serviceId; - candidate->security_ix = hdr->securityIndex; - candidate->out_clientflag = 0; - candidate->state = RXRPC_CONN_SERVER; + candidate->trans = trans; + candidate->proto.local = trans->local; + candidate->proto.epoch = sp->hdr.epoch; + candidate->proto.cid = sp->hdr.cid & RXRPC_CIDMASK; + candidate->proto.in_clientflag = RXRPC_CLIENT_INITIATED; + candidate->params.local = trans->local; + candidate->params.peer = trans->peer; + candidate->params.service_id = sp->hdr.serviceId; + candidate->security_ix = sp->hdr.securityIndex; + candidate->out_clientflag = 0; + candidate->state = RXRPC_CONN_SERVER; if (candidate->params.service_id) - candidate->state = RXRPC_CONN_SERVER_UNSECURED; + candidate->state = RXRPC_CONN_SERVER_UNSECURED; write_lock_bh(&trans->conn_lock); @@ -691,7 +691,7 @@ success: /* we found the connection in the list immediately */ found_extant_connection: - if (hdr->securityIndex != conn->security_ix) { + if (sp->hdr.securityIndex != conn->security_ix) { read_unlock_bh(&trans->conn_lock); goto security_mismatch; } @@ -701,7 +701,7 @@ found_extant_connection: /* we found the connection on the second time through the list */ found_extant_second: - if (hdr->securityIndex != conn->security_ix) { + if (sp->hdr.securityIndex != conn->security_ix) { write_unlock_bh(&trans->conn_lock); goto security_mismatch; } @@ -721,20 +721,21 @@ security_mismatch: * packet */ struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *trans, - struct rxrpc_host_header *hdr) + struct sk_buff *skb) { struct rxrpc_connection *conn; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rb_node *p; u32 epoch, cid; - _enter(",{%x,%x}", hdr->cid, hdr->flags); + _enter(",{%x,%x}", sp->hdr.cid, sp->hdr.flags); read_lock_bh(&trans->conn_lock); - cid = hdr->cid & RXRPC_CIDMASK; - epoch = hdr->epoch; + cid = sp->hdr.cid & RXRPC_CIDMASK; + epoch = sp->hdr.epoch; - if (hdr->flags & RXRPC_CLIENT_INITIATED) + if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) p = trans->server_conns.rb_node; else p = trans->client_conns.rb_node; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 6af7f40c5030..cf540efa9c17 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -628,8 +628,7 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) } static struct rxrpc_connection *rxrpc_conn_from_local(struct rxrpc_local *local, - struct sk_buff *skb, - struct rxrpc_skb_priv *sp) + struct sk_buff *skb) { struct rxrpc_peer *peer; struct rxrpc_transport *trans; @@ -647,7 +646,7 @@ static struct rxrpc_connection *rxrpc_conn_from_local(struct rxrpc_local *local, if (!trans) goto cant_find_conn; - conn = rxrpc_find_connection(trans, &sp->hdr); + conn = rxrpc_find_connection(trans, skb); rxrpc_put_transport(trans); if (!conn) goto cant_find_conn; @@ -739,7 +738,7 @@ void rxrpc_data_ready(struct sock *sk) * old-fashioned way doesn't really hurt */ struct rxrpc_connection *conn; - conn = rxrpc_conn_from_local(local, skb, sp); + conn = rxrpc_conn_from_local(local, skb); if (!conn) goto cant_route_call; -- cgit v1.2.3 From b3f575043fcd2926616a794db3f22280740fea6d Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 21 Jun 2016 16:10:03 +0100 Subject: rxrpc: rxrpc_connection_lock shouldn't be a BH lock, but conn_lock is rxrpc_connection_lock shouldn't be accessed as a BH-excluding lock. It's only accessed in a few places and none of those are in BH-context. rxrpc_transport::conn_lock, however, *is* a BH-excluding lock and should be accessed so consistently. Signed-off-by: David Howells --- net/rxrpc/conn_object.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 3b42fc4f4fe3..cab2f6dbc5a1 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -356,9 +356,9 @@ static int rxrpc_connect_exclusive(struct rxrpc_sock *rx, return ret; } - write_lock_bh(&rxrpc_connection_lock); + write_lock(&rxrpc_connection_lock); list_add_tail(&conn->link, &rxrpc_connections); - write_unlock_bh(&rxrpc_connection_lock); + write_unlock(&rxrpc_connection_lock); spin_lock(&trans->client_lock); atomic_inc(&trans->usage); @@ -677,9 +677,9 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans, struct sk_buff *skb) write_unlock_bh(&trans->conn_lock); - write_lock_bh(&rxrpc_connection_lock); + write_lock(&rxrpc_connection_lock); list_add_tail(&conn->link, &rxrpc_connections); - write_unlock_bh(&rxrpc_connection_lock); + write_unlock(&rxrpc_connection_lock); new = "new"; @@ -828,7 +828,7 @@ static void rxrpc_connection_reaper(struct work_struct *work) now = ktime_get_seconds(); earliest = ULONG_MAX; - write_lock_bh(&rxrpc_connection_lock); + write_lock(&rxrpc_connection_lock); list_for_each_entry_safe(conn, _p, &rxrpc_connections, link) { _debug("reap CONN %d { u=%d,t=%ld }", conn->debug_id, atomic_read(&conn->usage), @@ -838,7 +838,7 @@ static void rxrpc_connection_reaper(struct work_struct *work) continue; spin_lock(&conn->trans->client_lock); - write_lock(&conn->trans->conn_lock); + write_lock_bh(&conn->trans->conn_lock); reap_time = conn->put_time + rxrpc_connection_expiry; if (atomic_read(&conn->usage) > 0) { @@ -860,10 +860,10 @@ static void rxrpc_connection_reaper(struct work_struct *work) earliest = reap_time; } - write_unlock(&conn->trans->conn_lock); + write_unlock_bh(&conn->trans->conn_lock); spin_unlock(&conn->trans->client_lock); } - write_unlock_bh(&rxrpc_connection_lock); + write_unlock(&rxrpc_connection_lock); if (earliest != ULONG_MAX) { _debug("reschedule reaper %ld", (long) earliest - now); -- cgit v1.2.3 From 4a3388c8033e4ea00f06a341d5ed4a20a7da89de Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 4 Apr 2016 14:00:37 +0100 Subject: rxrpc: Use IDR to allocate client conn IDs on a machine-wide basis Use the IDR facility to allocate client connection IDs on a machine-wide basis so that each client connection has a unique identifier. When the connection ID space wraps, we advance the epoch by 1, thereby effectively having a 62-bit ID space. The IDR facility is then used to look up client connections during incoming packet routing instead of using an rbtree rooted on the transport. This change allows for the removal of the transport in the future and also means that client connections can be looked up directly in the data-ready handler by connection ID. The ID management code is placed in a new file, conn-client.c, to which all the client connection-specific code will eventually move. Note that the IDR tree gets very expensive on memory if the connection IDs are widely scattered throughout the number space, so we shall need to retire connections that have, say, an ID more than four times the maximum number of client conns away from the current allocation point to try and keep the IDs concentrated. We will also need to retire connections from an old epoch. Also note that, for the moment, a pointer to the transport has to be passed through into the ID allocation function so that we can take a BH lock to prevent a locking issue against in-BH lookup of client connections. This will go away later when RCU is used for server connections also. Signed-off-by: David Howells --- net/rxrpc/Makefile | 1 + net/rxrpc/af_rxrpc.c | 2 + net/rxrpc/ar-internal.h | 13 ++- net/rxrpc/conn_client.c | 99 +++++++++++++++++++++ net/rxrpc/conn_object.c | 231 +++++++++++++++++------------------------------- net/rxrpc/transport.c | 2 - 6 files changed, 196 insertions(+), 152 deletions(-) create mode 100644 net/rxrpc/conn_client.c (limited to 'net') diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index b005027f80cf..cfa221536f33 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -7,6 +7,7 @@ af-rxrpc-y := \ call_accept.o \ call_event.o \ call_object.o \ + conn_client.o \ conn_event.o \ conn_object.o \ input.o \ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 73f5c553eef4..408bd024125b 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -858,6 +858,8 @@ static void __exit af_rxrpc_exit(void) _debug("synchronise RCU"); rcu_barrier(); _debug("destroy locals"); + ASSERT(idr_is_empty(&rxrpc_client_conn_ids)); + idr_destroy(&rxrpc_client_conn_ids); rxrpc_destroy_all_locals(); remove_proc_entry("rxrpc_conns", init_net.proc_net); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 60ba22f56957..89966508b26c 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -233,7 +233,6 @@ struct rxrpc_transport { struct rxrpc_local *local; /* local transport endpoint */ struct rxrpc_peer *peer; /* remote transport endpoint */ struct rb_root bundles; /* client connection bundles on this transport */ - struct rb_root client_conns; /* client connections on this transport */ struct rb_root server_conns; /* server connections on this transport */ struct list_head link; /* link in master session list */ unsigned long put_time; /* time at which to reap */ @@ -241,7 +240,6 @@ struct rxrpc_transport { rwlock_t conn_lock; /* lock for active/dead connections */ atomic_t usage; int debug_id; /* debug ID for printks */ - unsigned int conn_idcounter; /* connection ID counter (client) */ }; /* @@ -312,6 +310,8 @@ struct rxrpc_connection { struct key *server_key; /* security for this service */ struct crypto_skcipher *cipher; /* encryption handle */ struct rxrpc_crypt csum_iv; /* packet checksum base */ + unsigned long flags; +#define RXRPC_CONN_HAS_IDR 0 /* - Has a client conn ID assigned */ unsigned long events; #define RXRPC_CONN_CHALLENGE 0 /* send challenge packet */ unsigned long put_time; /* time at which to reap */ @@ -550,6 +550,15 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *); void __rxrpc_put_call(struct rxrpc_call *); void __exit rxrpc_destroy_all_calls(void); +/* + * conn_client.c + */ +extern struct idr rxrpc_client_conn_ids; + +int rxrpc_get_client_connection_id(struct rxrpc_connection *, + struct rxrpc_transport *, gfp_t); +void rxrpc_put_client_connection_id(struct rxrpc_connection *); + /* * conn_event.c */ diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c new file mode 100644 index 000000000000..2cccb4be289d --- /dev/null +++ b/net/rxrpc/conn_client.c @@ -0,0 +1,99 @@ +/* Client connection-specific management code. + * + * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include "ar-internal.h" + +/* + * We use machine-unique IDs for our client connections. + */ +DEFINE_IDR(rxrpc_client_conn_ids); +static DEFINE_SPINLOCK(rxrpc_conn_id_lock); + +/* + * Get a connection ID and epoch for a client connection from the global pool. + * The connection struct pointer is then recorded in the idr radix tree. The + * epoch is changed if this wraps. + * + * TODO: The IDR tree gets very expensive on memory if the connection IDs are + * widely scattered throughout the number space, so we shall need to retire + * connections that have, say, an ID more than four times the maximum number of + * client conns away from the current allocation point to try and keep the IDs + * concentrated. We will also need to retire connections from an old epoch. + */ +int rxrpc_get_client_connection_id(struct rxrpc_connection *conn, + struct rxrpc_transport *trans, + gfp_t gfp) +{ + u32 epoch; + int id; + + _enter(""); + + idr_preload(gfp); + write_lock_bh(&trans->conn_lock); + spin_lock(&rxrpc_conn_id_lock); + + epoch = rxrpc_epoch; + + /* We could use idr_alloc_cyclic() here, but we really need to know + * when the thing wraps so that we can advance the epoch. + */ + if (rxrpc_client_conn_ids.cur == 0) + rxrpc_client_conn_ids.cur = 1; + id = idr_alloc(&rxrpc_client_conn_ids, conn, + rxrpc_client_conn_ids.cur, 0x40000000, GFP_NOWAIT); + if (id < 0) { + if (id != -ENOSPC) + goto error; + id = idr_alloc(&rxrpc_client_conn_ids, conn, + 1, 0x40000000, GFP_NOWAIT); + if (id < 0) + goto error; + epoch++; + rxrpc_epoch = epoch; + } + rxrpc_client_conn_ids.cur = id + 1; + + spin_unlock(&rxrpc_conn_id_lock); + write_unlock_bh(&trans->conn_lock); + idr_preload_end(); + + conn->proto.epoch = epoch; + conn->proto.cid = id << RXRPC_CIDSHIFT; + set_bit(RXRPC_CONN_HAS_IDR, &conn->flags); + _leave(" [CID %x:%x]", epoch, conn->proto.cid); + return 0; + +error: + spin_unlock(&rxrpc_conn_id_lock); + write_unlock_bh(&trans->conn_lock); + idr_preload_end(); + _leave(" = %d", id); + return id; +} + +/* + * Release a connection ID for a client connection from the global pool. + */ +void rxrpc_put_client_connection_id(struct rxrpc_connection *conn) +{ + if (test_bit(RXRPC_CONN_HAS_IDR, &conn->flags)) { + spin_lock(&rxrpc_conn_id_lock); + idr_remove(&rxrpc_client_conn_ids, + conn->proto.cid >> RXRPC_CIDSHIFT); + spin_unlock(&rxrpc_conn_id_lock); + } +} diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index cab2f6dbc5a1..312b75091d29 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -206,81 +206,6 @@ static struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) return conn; } -/* - * assign a connection ID to a connection and add it to the transport's - * connection lookup tree - * - called with transport client lock held - */ -static void rxrpc_assign_connection_id(struct rxrpc_connection *conn) -{ - struct rxrpc_connection *xconn; - struct rb_node *parent, **p; - __be32 epoch; - u32 cid; - - _enter(""); - - epoch = conn->proto.epoch; - - write_lock_bh(&conn->trans->conn_lock); - - conn->trans->conn_idcounter += RXRPC_CID_INC; - if (conn->trans->conn_idcounter < RXRPC_CID_INC) - conn->trans->conn_idcounter = RXRPC_CID_INC; - cid = conn->trans->conn_idcounter; - -attempt_insertion: - parent = NULL; - p = &conn->trans->client_conns.rb_node; - - while (*p) { - parent = *p; - xconn = rb_entry(parent, struct rxrpc_connection, node); - - if (epoch < xconn->proto.epoch) - p = &(*p)->rb_left; - else if (epoch > xconn->proto.epoch) - p = &(*p)->rb_right; - else if (cid < xconn->proto.cid) - p = &(*p)->rb_left; - else if (cid > xconn->proto.cid) - p = &(*p)->rb_right; - else - goto id_exists; - } - - /* we've found a suitable hole - arrange for this connection to occupy - * it */ - rb_link_node(&conn->node, parent, p); - rb_insert_color(&conn->node, &conn->trans->client_conns); - - conn->proto.cid = cid; - write_unlock_bh(&conn->trans->conn_lock); - _leave(" [CID %x]", cid); - return; - - /* we found a connection with the proposed ID - walk the tree from that - * point looking for the next unused ID */ -id_exists: - for (;;) { - cid += RXRPC_CID_INC; - if (cid < RXRPC_CID_INC) { - cid = RXRPC_CID_INC; - conn->trans->conn_idcounter = cid; - goto attempt_insertion; - } - - parent = rb_next(parent); - if (!parent) - goto attempt_insertion; - - xconn = rb_entry(parent, struct rxrpc_connection, node); - if (epoch < xconn->proto.epoch || - cid < xconn->proto.cid) - goto attempt_insertion; - } -} - /* * add a call to a connection's call-by-ID tree */ @@ -315,27 +240,24 @@ static void rxrpc_add_call_ID_to_conn(struct rxrpc_connection *conn, } /* - * connect a call on an exclusive connection + * Allocate a client connection. */ -static int rxrpc_connect_exclusive(struct rxrpc_sock *rx, - struct rxrpc_conn_parameters *cp, - struct rxrpc_transport *trans, - struct rxrpc_call *call, - gfp_t gfp) +static struct rxrpc_connection * +rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, + struct rxrpc_transport *trans, + gfp_t gfp) { struct rxrpc_connection *conn; - int chan, ret; + int ret; _enter(""); conn = rxrpc_alloc_connection(gfp); if (!conn) { _leave(" = -ENOMEM"); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } - conn->trans = trans; - conn->bundle = NULL; conn->params = *cp; conn->proto.local = cp->local; conn->proto.epoch = rxrpc_epoch; @@ -344,35 +266,75 @@ static int rxrpc_connect_exclusive(struct rxrpc_sock *rx, conn->proto.family = cp->peer->srx.transport.family; conn->out_clientflag = RXRPC_CLIENT_INITIATED; conn->state = RXRPC_CONN_CLIENT; - conn->avail_calls = RXRPC_MAXCALLS - 1; - key_get(conn->params.key); + switch (conn->proto.family) { + case AF_INET: + conn->proto.addr_size = sizeof(conn->proto.ipv4_addr); + conn->proto.ipv4_addr = cp->peer->srx.transport.sin.sin_addr; + conn->proto.port = cp->peer->srx.transport.sin.sin_port; + break; + } + + ret = rxrpc_get_client_connection_id(conn, trans, gfp); + if (ret < 0) + goto error_0; ret = rxrpc_init_client_conn_security(conn); - if (ret < 0) { - key_put(conn->params.key); - kfree(conn); - _leave(" = %d [key]", ret); - return ret; - } + if (ret < 0) + goto error_1; + + conn->security->prime_packet_security(conn); write_lock(&rxrpc_connection_lock); list_add_tail(&conn->link, &rxrpc_connections); write_unlock(&rxrpc_connection_lock); - spin_lock(&trans->client_lock); + key_get(conn->params.key); + + _leave(" = %p", conn); + return conn; + +error_1: + rxrpc_put_client_connection_id(conn); +error_0: + kfree(conn); + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * connect a call on an exclusive connection + */ +static int rxrpc_connect_exclusive(struct rxrpc_sock *rx, + struct rxrpc_conn_parameters *cp, + struct rxrpc_transport *trans, + struct rxrpc_call *call, + gfp_t gfp) +{ + struct rxrpc_connection *conn; + int chan; + + _enter(""); + + conn = rxrpc_alloc_client_connection(cp, trans, gfp); + if (IS_ERR(conn)) { + _leave(" = %ld", PTR_ERR(conn)); + return PTR_ERR(conn); + } + atomic_inc(&trans->usage); + conn->trans = trans; + conn->bundle = NULL; _net("CONNECT EXCL new %d on TRANS %d", conn->debug_id, conn->trans->debug_id); - rxrpc_assign_connection_id(conn); - /* Since no one else can use the connection, we just use the first * channel. */ chan = 0; atomic_inc(&conn->usage); + conn->avail_calls = RXRPC_MAXCALLS - 1; conn->channels[chan] = call; conn->call_counter = 1; call->conn = conn; @@ -383,8 +345,6 @@ static int rxrpc_connect_exclusive(struct rxrpc_sock *rx, _net("CONNECT client on conn %d chan %d as call %x", conn->debug_id, chan, call->call_id); - spin_unlock(&trans->client_lock); - rxrpc_add_call_ID_to_conn(conn, call); _leave(" = 0"); return 0; @@ -402,7 +362,7 @@ int rxrpc_connect_call(struct rxrpc_sock *rx, gfp_t gfp) { struct rxrpc_connection *conn, *candidate; - int chan, ret; + int chan; DECLARE_WAITQUEUE(myself, current); @@ -492,51 +452,25 @@ int rxrpc_connect_call(struct rxrpc_sock *rx, /* not yet present - create a candidate for a new connection and then * redo the check */ - candidate = rxrpc_alloc_connection(gfp); + candidate = rxrpc_alloc_client_connection(cp, trans, gfp); if (!candidate) { _leave(" = -ENOMEM"); return -ENOMEM; } + atomic_inc(&bundle->usage); + atomic_inc(&trans->usage); candidate->trans = trans; candidate->bundle = bundle; - candidate->params = *cp; - candidate->proto.local = cp->local; - candidate->proto.epoch = rxrpc_epoch; - candidate->proto.cid = 0; - candidate->proto.in_clientflag = 0; - candidate->proto.family = cp->peer->srx.transport.family; - candidate->out_clientflag = RXRPC_CLIENT_INITIATED; - candidate->state = RXRPC_CONN_CLIENT; - candidate->avail_calls = RXRPC_MAXCALLS; - - key_get(candidate->params.key); - - ret = rxrpc_init_client_conn_security(candidate); - if (ret < 0) { - key_put(candidate->params.key); - kfree(candidate); - _leave(" = %d [key]", ret); - return ret; - } - - write_lock_bh(&rxrpc_connection_lock); - list_add_tail(&candidate->link, &rxrpc_connections); - write_unlock_bh(&rxrpc_connection_lock); spin_lock(&trans->client_lock); list_add(&candidate->bundle_link, &bundle->unused_conns); bundle->num_conns++; - atomic_inc(&bundle->usage); - atomic_inc(&trans->usage); _net("CONNECT new %d on TRANS %d", candidate->debug_id, candidate->trans->debug_id); - rxrpc_assign_connection_id(candidate); - candidate->security->prime_packet_security(candidate); - /* leave the candidate lurking in zombie mode attached to the * bundle until we're ready for it */ rxrpc_put_connection(candidate); @@ -735,25 +669,27 @@ struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *trans, cid = sp->hdr.cid & RXRPC_CIDMASK; epoch = sp->hdr.epoch; - if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) + if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) { p = trans->server_conns.rb_node; - else - p = trans->client_conns.rb_node; - - while (p) { - conn = rb_entry(p, struct rxrpc_connection, node); - - _debug("maybe %x", conn->proto.cid); - - if (epoch < conn->proto.epoch) - p = p->rb_left; - else if (epoch > conn->proto.epoch) - p = p->rb_right; - else if (cid < conn->proto.cid) - p = p->rb_left; - else if (cid > conn->proto.cid) - p = p->rb_right; - else + while (p) { + conn = rb_entry(p, struct rxrpc_connection, node); + + _debug("maybe %x", conn->proto.cid); + + if (epoch < conn->proto.epoch) + p = p->rb_left; + else if (epoch > conn->proto.epoch) + p = p->rb_right; + else if (cid < conn->proto.cid) + p = p->rb_left; + else if (cid > conn->proto.cid) + p = p->rb_right; + else + goto found; + } + } else { + conn = idr_find(&rxrpc_client_conn_ids, cid >> RXRPC_CIDSHIFT); + if (conn && conn->proto.epoch == epoch) goto found; } @@ -846,8 +782,7 @@ static void rxrpc_connection_reaper(struct work_struct *work) } else if (reap_time <= now) { list_move_tail(&conn->link, &graveyard); if (conn->out_clientflag) - rb_erase(&conn->node, - &conn->trans->client_conns); + rxrpc_put_client_connection_id(conn); else rb_erase(&conn->node, &conn->trans->server_conns); diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c index 24c71218a6f8..140628d94bb0 100644 --- a/net/rxrpc/transport.c +++ b/net/rxrpc/transport.c @@ -47,12 +47,10 @@ static struct rxrpc_transport *rxrpc_alloc_transport(struct rxrpc_local *local, trans->peer = peer; INIT_LIST_HEAD(&trans->link); trans->bundles = RB_ROOT; - trans->client_conns = RB_ROOT; trans->server_conns = RB_ROOT; spin_lock_init(&trans->client_lock); rwlock_init(&trans->conn_lock); atomic_set(&trans->usage, 1); - trans->conn_idcounter = peer->srx.srx_service << 16; trans->debug_id = atomic_inc_return(&rxrpc_debug_id); } -- cgit v1.2.3 From f4552c2d248e9d9f6f728ea32eb25f600d3d6cd6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 17 Jun 2016 11:00:48 +0100 Subject: rxrpc: Validate the net address given to rxrpc_kernel_begin_call() Validate the net address given to rxrpc_kernel_begin_call() before using it. Whilst this should be mostly unnecessary for in-kernel users, it does clear the tail of the address struct in case we want to hash or compare the whole thing. Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 408bd024125b..b29bb50af5de 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -280,9 +280,14 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, struct rxrpc_transport *trans; struct rxrpc_call *call; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + int ret; _enter(",,%x,%lx", key_serial(key), user_call_ID); + ret = rxrpc_validate_address(rx, srx, sizeof(*srx)); + if (ret < 0) + return ERR_PTR(ret); + lock_sock(&rx->sk); if (!key) -- cgit v1.2.3 From f4e7da8cde87d0f7e9fb806918f7ec283912b694 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 17 Jun 2016 11:07:55 +0100 Subject: rxrpc: Calls displayed in /proc may in future lack a connection Allocated rxrpc calls displayed in /proc/net/rxrpc_calls may in future be on the proc list before they're connected or after they've been disconnected - in which case they may not have a pointer to a connection struct that can be used to get data from there. Deal with this by using stuff from the call struct in preference where possible and printing "no_connection" rather than a peer address if no connection is assigned. This change also has the added bonus that the service ID is now taken from the call rather the connection which will allow per-call service upgrades to be shown - something required for AuriStor server compatibility. Signed-off-by: David Howells --- net/rxrpc/proc.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 9863270691d7..500cdcdc843c 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -59,25 +59,28 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) } call = list_entry(v, struct rxrpc_call, link); - conn = call->conn; sprintf(lbuff, "%pI4:%u", - &conn->params.local->srx.transport.sin.sin_addr, - ntohs(conn->params.local->srx.transport.sin.sin_port)); + &call->local->srx.transport.sin.sin_addr, + ntohs(call->local->srx.transport.sin.sin_port)); - sprintf(rbuff, "%pI4:%u", - &conn->params.peer->srx.transport.sin.sin_addr, - ntohs(conn->params.peer->srx.transport.sin.sin_port)); + conn = call->conn; + if (conn) + sprintf(rbuff, "%pI4:%u", + &conn->params.peer->srx.transport.sin.sin_addr, + ntohs(conn->params.peer->srx.transport.sin.sin_port)); + else + strcpy(rbuff, "no_connection"); seq_printf(seq, "UDP %-22.22s %-22.22s %4x %08x %08x %s %3u" " %-8.8s %08x %lx\n", lbuff, rbuff, - call->conn->params.service_id, + call->service_id, call->cid, call->call_id, - rxrpc_conn_is_service(call->conn) ? "Svc" : "Clt", + call->in_clientflag ? "Svc" : "Clt", atomic_read(&call->usage), rxrpc_call_states[call->state], call->remote_abort ?: call->local_abort, -- cgit v1.2.3 From 985a5c824a52e9f7cae59c850e2db98954f21c7c Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 17 Jun 2016 11:53:37 +0100 Subject: rxrpc: Make rxrpc_send_packet() take a connection not a transport Make rxrpc_send_packet() take a connection not a transport as part of the phasing out of the rxrpc_transport struct. Whilst we're at it, rename the function to rxrpc_send_data_packet() to differentiate it from the other packet sending functions. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 2 +- net/rxrpc/call_event.c | 2 +- net/rxrpc/output.c | 51 ++++++++++++++++++++++++++++--------------------- 3 files changed, 31 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 89966508b26c..cfbd028aa551 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -670,7 +670,7 @@ extern const char *rxrpc_acks(u8 reason); */ extern unsigned int rxrpc_resend_timeout; -int rxrpc_send_packet(struct rxrpc_transport *, struct sk_buff *); +int rxrpc_send_data_packet(struct rxrpc_connection *, struct sk_buff *); int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t); /* diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index b43faf573ed3..0ba84295f913 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -187,7 +187,7 @@ static void rxrpc_resend(struct rxrpc_call *call) _proto("Tx DATA %%%u { #%d }", sp->hdr.serial, sp->hdr.seq); - if (rxrpc_send_packet(call->conn->trans, txb) < 0) { + if (rxrpc_send_data_packet(call->conn, txb) < 0) { stop = true; sp->resend_at = jiffies + 3; } else { diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 6f8ab0ef839f..db3933cf6b97 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -338,7 +338,7 @@ EXPORT_SYMBOL(rxrpc_kernel_abort_call); /* * send a packet through the transport endpoint */ -int rxrpc_send_packet(struct rxrpc_transport *trans, struct sk_buff *skb) +int rxrpc_send_data_packet(struct rxrpc_connection *conn, struct sk_buff *skb) { struct kvec iov[1]; struct msghdr msg; @@ -349,30 +349,30 @@ int rxrpc_send_packet(struct rxrpc_transport *trans, struct sk_buff *skb) iov[0].iov_base = skb->head; iov[0].iov_len = skb->len; - msg.msg_name = &trans->peer->srx.transport.sin; - msg.msg_namelen = sizeof(trans->peer->srx.transport.sin); + msg.msg_name = &conn->params.peer->srx.transport; + msg.msg_namelen = conn->params.peer->srx.transport_len; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; /* send the packet with the don't fragment bit set if we currently * think it's small enough */ - if (skb->len - sizeof(struct rxrpc_wire_header) < trans->peer->maxdata) { - down_read(&trans->local->defrag_sem); + if (skb->len - sizeof(struct rxrpc_wire_header) < conn->params.peer->maxdata) { + down_read(&conn->params.local->defrag_sem); /* send the packet by UDP * - returns -EMSGSIZE if UDP would have to fragment the packet * to go out of the interface * - in which case, we'll have processed the ICMP error * message and update the peer record */ - ret = kernel_sendmsg(trans->local->socket, &msg, iov, 1, + ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1, iov[0].iov_len); - up_read(&trans->local->defrag_sem); + up_read(&conn->params.local->defrag_sem); if (ret == -EMSGSIZE) goto send_fragmentable; - _leave(" = %d [%u]", ret, trans->peer->maxdata); + _leave(" = %d [%u]", ret, conn->params.peer->maxdata); return ret; } @@ -380,21 +380,28 @@ send_fragmentable: /* attempt to send this message with fragmentation enabled */ _debug("send fragment"); - down_write(&trans->local->defrag_sem); - opt = IP_PMTUDISC_DONT; - ret = kernel_setsockopt(trans->local->socket, SOL_IP, IP_MTU_DISCOVER, - (char *) &opt, sizeof(opt)); - if (ret == 0) { - ret = kernel_sendmsg(trans->local->socket, &msg, iov, 1, - iov[0].iov_len); - - opt = IP_PMTUDISC_DO; - kernel_setsockopt(trans->local->socket, SOL_IP, - IP_MTU_DISCOVER, (char *) &opt, sizeof(opt)); + down_write(&conn->params.local->defrag_sem); + + switch (conn->params.local->srx.transport.family) { + case AF_INET: + opt = IP_PMTUDISC_DONT; + ret = kernel_setsockopt(conn->params.local->socket, + SOL_IP, IP_MTU_DISCOVER, + (char *)&opt, sizeof(opt)); + if (ret == 0) { + ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1, + iov[0].iov_len); + + opt = IP_PMTUDISC_DO; + kernel_setsockopt(conn->params.local->socket, SOL_IP, + IP_MTU_DISCOVER, + (char *)&opt, sizeof(opt)); + } + break; } - up_write(&trans->local->defrag_sem); - _leave(" = %d [frag %u]", ret, trans->peer->maxdata); + up_write(&conn->params.local->defrag_sem); + _leave(" = %d [frag %u]", ret, conn->params.peer->maxdata); return ret; } @@ -506,7 +513,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, if (try_to_del_timer_sync(&call->ack_timer) >= 0) { /* the packet may be freed by rxrpc_process_call() before this * returns */ - ret = rxrpc_send_packet(call->conn->trans, skb); + ret = rxrpc_send_data_packet(call->conn, skb); _net("sent skb %p", skb); } else { _debug("failed to delete ACK timer"); -- cgit v1.2.3 From 5627cc8b961e4b07d5d649d9bd01ac929dcc1a95 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 4 Apr 2016 14:00:38 +0100 Subject: rxrpc: Provide more refcount helper functions Provide refcount helper functions for connections so that the code doesn't touch local or connection usage counts directly. Also make it such that local and peer put functions can take a NULL pointer. Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 7 ++----- net/rxrpc/ar-internal.h | 15 +++++++++++++-- net/rxrpc/call_accept.c | 2 +- net/rxrpc/call_object.c | 2 +- net/rxrpc/conn_event.c | 2 +- net/rxrpc/conn_object.c | 12 ++++++------ net/rxrpc/input.c | 2 +- net/rxrpc/local_object.c | 2 +- 8 files changed, 26 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index b29bb50af5de..57dcbfc061e4 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -674,11 +674,8 @@ static int rxrpc_release_sock(struct sock *sk) flush_workqueue(rxrpc_workqueue); rxrpc_purge_queue(&sk->sk_receive_queue); - if (rx->local) { - rxrpc_put_local(rx->local); - rx->local = NULL; - } - + rxrpc_put_local(rx->local); + rx->local = NULL; key_put(rx->key); rx->key = NULL; key_put(rx->securities); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index cfbd028aa551..c0ed5e7f22ef 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -597,6 +597,17 @@ static inline bool rxrpc_conn_is_service(const struct rxrpc_connection *conn) return conn->proto.in_clientflag; } +static inline void rxrpc_get_connection(struct rxrpc_connection *conn) +{ + atomic_inc(&conn->usage); +} + +static inline +struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *conn) +{ + return atomic_inc_not_zero(&conn->usage) ? conn : NULL; +} + /* * input.c */ @@ -645,7 +656,7 @@ struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local) static inline void rxrpc_put_local(struct rxrpc_local *local) { - if (atomic_dec_and_test(&local->usage)) + if (local && atomic_dec_and_test(&local->usage)) __rxrpc_put_local(local); } @@ -702,7 +713,7 @@ struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer) extern void __rxrpc_put_peer(struct rxrpc_peer *peer); static inline void rxrpc_put_peer(struct rxrpc_peer *peer) { - if (atomic_dec_and_test(&peer->usage)) + if (peer && atomic_dec_and_test(&peer->usage)) __rxrpc_put_peer(peer); } diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 5a70dc4e28c6..833ad0622b61 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -141,7 +141,7 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local, _debug("await conn sec"); list_add_tail(&call->accept_link, &rx->secureq); call->conn->state = RXRPC_CONN_SERVER_CHALLENGING; - atomic_inc(&call->conn->usage); + rxrpc_get_connection(call->conn); set_bit(RXRPC_CONN_CHALLENGE, &call->conn->events); rxrpc_queue_conn(call->conn); } else { diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index d83f2cbb80a9..45849a66bc56 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -515,7 +515,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, rb_insert_color(&call->conn_node, &conn->calls); conn->channels[call->channel] = call; sock_hold(&rx->sk); - atomic_inc(&conn->usage); + rxrpc_get_connection(conn); write_unlock_bh(&conn->lock); spin_lock(&conn->params.peer->lock); diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index a022439f6f5a..bf6971555eac 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -263,7 +263,7 @@ void rxrpc_process_connection(struct work_struct *work) _enter("{%d}", conn->debug_id); - atomic_inc(&conn->usage); + rxrpc_get_connection(conn); if (test_and_clear_bit(RXRPC_CONN_CHALLENGE, &conn->events)) { rxrpc_secure_connection(conn); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 312b75091d29..1754f2e2e16b 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -333,7 +333,7 @@ static int rxrpc_connect_exclusive(struct rxrpc_sock *rx, * channel. */ chan = 0; - atomic_inc(&conn->usage); + rxrpc_get_connection(conn); conn->avail_calls = RXRPC_MAXCALLS - 1; conn->channels[chan] = call; conn->call_counter = 1; @@ -392,7 +392,7 @@ int rxrpc_connect_call(struct rxrpc_sock *rx, conn->channels[1] == NULL || conn->channels[2] == NULL || conn->channels[3] == NULL); - atomic_inc(&conn->usage); + rxrpc_get_connection(conn); break; } @@ -412,7 +412,7 @@ int rxrpc_connect_call(struct rxrpc_sock *rx, conn->channels[1] == NULL && conn->channels[2] == NULL && conn->channels[3] == NULL); - atomic_inc(&conn->usage); + rxrpc_get_connection(conn); list_move(&conn->bundle_link, &bundle->avail_conns); break; } @@ -629,7 +629,7 @@ found_extant_connection: read_unlock_bh(&trans->conn_lock); goto security_mismatch; } - atomic_inc(&conn->usage); + rxrpc_get_connection(conn); read_unlock_bh(&trans->conn_lock); goto success; @@ -639,7 +639,7 @@ found_extant_second: write_unlock_bh(&trans->conn_lock); goto security_mismatch; } - atomic_inc(&conn->usage); + rxrpc_get_connection(conn); write_unlock_bh(&trans->conn_lock); kfree(candidate); goto success; @@ -698,7 +698,7 @@ struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *trans, return NULL; found: - atomic_inc(&conn->usage); + rxrpc_get_connection(conn); read_unlock_bh(&trans->conn_lock); _leave(" = %p", conn); return conn; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index cf540efa9c17..799aec18aa7b 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -580,7 +580,7 @@ static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, { _enter("%p,%p", conn, skb); - atomic_inc(&conn->usage); + rxrpc_get_connection(conn); skb_queue_tail(&conn->rx_queue, skb); rxrpc_queue_conn(conn); } diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 009b321712bc..5703b0d18ed4 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -209,7 +209,7 @@ struct rxrpc_local *rxrpc_lookup_local(const struct sockaddr_rxrpc *srx) * bind the transport socket may still fail if we're attempting * to use a local address that the dying object is still using. */ - if (!atomic_inc_not_zero(&local->usage)) { + if (!rxrpc_get_local_maybe(local)) { cursor = cursor->next; list_del_init(&local->link); break; -- cgit v1.2.3 From 999b69f89241c9384c104b84329c13350fd696ef Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 17 Jun 2016 15:42:35 +0100 Subject: rxrpc: Kill the client connection bundle concept Kill off the concept of maintaining a bundle of connections to a particular target service to increase the number of call slots available for any beyond four for that service (there are four call slots per connection). This will make cleaning up the connection handling code easier and facilitate removal of the rxrpc_transport struct. Bundling can be reintroduced later if necessary. Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 11 +- net/rxrpc/ar-internal.h | 57 ++--- net/rxrpc/call_object.c | 124 ++++++----- net/rxrpc/conn_client.c | 7 +- net/rxrpc/conn_object.c | 552 +++++++++++++++++------------------------------ net/rxrpc/local_object.c | 4 +- net/rxrpc/output.c | 11 +- net/rxrpc/transport.c | 2 - 8 files changed, 288 insertions(+), 480 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 57dcbfc061e4..f3b6ed8196c3 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -276,7 +276,6 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, gfp_t gfp) { struct rxrpc_conn_parameters cp; - struct rxrpc_conn_bundle *bundle; struct rxrpc_transport *trans; struct rxrpc_call *call; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); @@ -311,15 +310,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, } cp.peer = trans->peer; - bundle = rxrpc_get_bundle(rx, trans, key, srx->srx_service, gfp); - if (IS_ERR(bundle)) { - call = ERR_CAST(bundle); - goto out; - } - - call = rxrpc_new_client_call(rx, &cp, trans, bundle, user_call_ID, gfp); - rxrpc_put_bundle(trans, bundle); -out: + call = rxrpc_new_client_call(rx, &cp, trans, srx, user_call_ID, gfp); rxrpc_put_transport(trans); out_notrans: release_sock(&rx->sk); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index c0ed5e7f22ef..26fe137d62bb 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -186,7 +186,8 @@ struct rxrpc_local { struct sk_buff_head accept_queue; /* incoming calls awaiting acceptance */ struct sk_buff_head reject_queue; /* packets awaiting rejection */ struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */ - struct mutex conn_lock; /* Client connection creation lock */ + struct rb_root client_conns; /* Client connections by socket params */ + spinlock_t client_conns_lock; /* Lock for client_conns */ spinlock_t lock; /* access lock */ rwlock_t services_lock; /* lock for services list */ int debug_id; /* debug ID for printks */ @@ -232,34 +233,14 @@ struct rxrpc_peer { struct rxrpc_transport { struct rxrpc_local *local; /* local transport endpoint */ struct rxrpc_peer *peer; /* remote transport endpoint */ - struct rb_root bundles; /* client connection bundles on this transport */ struct rb_root server_conns; /* server connections on this transport */ struct list_head link; /* link in master session list */ unsigned long put_time; /* time at which to reap */ - spinlock_t client_lock; /* client connection allocation lock */ rwlock_t conn_lock; /* lock for active/dead connections */ atomic_t usage; int debug_id; /* debug ID for printks */ }; -/* - * RxRPC client connection bundle - * - matched by { transport, service_id, key } - */ -struct rxrpc_conn_bundle { - struct rb_node node; /* node in transport's lookup tree */ - struct list_head unused_conns; /* unused connections in this bundle */ - struct list_head avail_conns; /* available connections in this bundle */ - struct list_head busy_conns; /* busy connections in this bundle */ - struct key *key; /* security for this bundle */ - wait_queue_head_t chanwait; /* wait for channel to become available */ - atomic_t usage; - int debug_id; /* debug ID for printks */ - unsigned short num_conns; /* number of connections in this bundle */ - u16 service_id; /* Service ID for this bundle */ - u8 security_ix; /* security type */ -}; - /* * Keys for matching a connection. */ @@ -295,17 +276,21 @@ struct rxrpc_conn_parameters { */ struct rxrpc_connection { struct rxrpc_transport *trans; /* transport session */ - struct rxrpc_conn_bundle *bundle; /* connection bundle (client) */ struct rxrpc_conn_proto proto; struct rxrpc_conn_parameters params; + spinlock_t channel_lock; + struct rxrpc_call *channels[RXRPC_MAXCALLS]; /* active calls */ + wait_queue_head_t channel_wq; /* queue to wait for channel to become available */ + struct work_struct processor; /* connection event processor */ - struct rb_node node; /* node in transport's lookup tree */ + union { + struct rb_node client_node; /* Node in local->client_conns */ + struct rb_node service_node; /* Node in trans->server_conns */ + }; struct list_head link; /* link in master connection list */ - struct list_head bundle_link; /* link in bundle */ struct rb_root calls; /* calls on this connection */ struct sk_buff_head rx_queue; /* received conn-level packets */ - struct rxrpc_call *channels[RXRPC_MAXCALLS]; /* channels (active calls) */ const struct rxrpc_security *security; /* applied security module */ struct key *server_key; /* security for this service */ struct crypto_skcipher *cipher; /* encryption handle */ @@ -314,7 +299,7 @@ struct rxrpc_connection { #define RXRPC_CONN_HAS_IDR 0 /* - Has a client conn ID assigned */ unsigned long events; #define RXRPC_CONN_CHALLENGE 0 /* send challenge packet */ - unsigned long put_time; /* time at which to reap */ + unsigned long put_time; /* Time at which last put */ rwlock_t lock; /* access lock */ spinlock_t state_lock; /* state-change lock */ atomic_t usage; @@ -335,7 +320,7 @@ struct rxrpc_connection { unsigned int call_counter; /* call ID counter */ atomic_t serial; /* packet serial number counter */ atomic_t hi_serial; /* highest serial number received */ - u8 avail_calls; /* number of calls available */ + atomic_t avail_chans; /* number of channels available */ u8 size_align; /* data size alignment (for security) */ u8 header_size; /* rxrpc + security header size */ u8 security_size; /* security header size */ @@ -386,6 +371,8 @@ enum rxrpc_call_event { * The states that a call can be in. */ enum rxrpc_call_state { + RXRPC_CALL_UNINITIALISED, + RXRPC_CALL_CLIENT_AWAIT_CONN, /* - client waiting for connection to become available */ RXRPC_CALL_CLIENT_SEND_REQUEST, /* - client sending request phase */ RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */ RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ @@ -540,7 +527,7 @@ struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, struct rxrpc_transport *, - struct rxrpc_conn_bundle *, + struct sockaddr_rxrpc *, unsigned long, gfp_t); struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_connection *, @@ -555,8 +542,7 @@ void __exit rxrpc_destroy_all_calls(void); */ extern struct idr rxrpc_client_conn_ids; -int rxrpc_get_client_connection_id(struct rxrpc_connection *, - struct rxrpc_transport *, gfp_t); +int rxrpc_get_client_connection_id(struct rxrpc_connection *, gfp_t); void rxrpc_put_client_connection_id(struct rxrpc_connection *); /* @@ -573,13 +559,10 @@ extern unsigned int rxrpc_connection_expiry; extern struct list_head rxrpc_connections; extern rwlock_t rxrpc_connection_lock; -struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *, - struct rxrpc_transport *, - struct key *, u16, gfp_t); -void rxrpc_put_bundle(struct rxrpc_transport *, struct rxrpc_conn_bundle *); -int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, - struct rxrpc_transport *, struct rxrpc_conn_bundle *, - struct rxrpc_call *, gfp_t); +int rxrpc_connect_call(struct rxrpc_call *, struct rxrpc_conn_parameters *, + struct rxrpc_transport *, + struct sockaddr_rxrpc *, gfp_t); +void rxrpc_disconnect_call(struct rxrpc_call *); void rxrpc_put_connection(struct rxrpc_connection *); void __exit rxrpc_destroy_all_connections(void); struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *, diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 45849a66bc56..9b3b48abe12f 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -31,6 +31,8 @@ unsigned int rxrpc_max_call_lifetime = 60 * HZ; unsigned int rxrpc_dead_call_expiry = 2 * HZ; const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { + [RXRPC_CALL_UNINITIALISED] = "Uninit", + [RXRPC_CALL_CLIENT_AWAIT_CONN] = "ClWtConn", [RXRPC_CALL_CLIENT_SEND_REQUEST] = "ClSndReq", [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", @@ -261,6 +263,7 @@ static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) (unsigned long) call); INIT_WORK(&call->destroyer, &rxrpc_destroy_call); INIT_WORK(&call->processor, &rxrpc_process_call); + INIT_LIST_HEAD(&call->link); INIT_LIST_HEAD(&call->accept_link); skb_queue_head_init(&call->rx_queue); skb_queue_head_init(&call->rx_oos_queue); @@ -269,7 +272,6 @@ static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) rwlock_init(&call->state_lock); atomic_set(&call->usage, 1); call->debug_id = atomic_inc_return(&rxrpc_debug_id); - call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; memset(&call->sock_node, 0xed, sizeof(call->sock_node)); @@ -282,55 +284,70 @@ static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) } /* - * allocate a new client call and attempt to get a connection slot for it + * Allocate a new client call. */ static struct rxrpc_call *rxrpc_alloc_client_call( struct rxrpc_sock *rx, struct rxrpc_conn_parameters *cp, - struct rxrpc_transport *trans, - struct rxrpc_conn_bundle *bundle, + struct sockaddr_rxrpc *srx, gfp_t gfp) { struct rxrpc_call *call; - int ret; _enter(""); - ASSERT(rx != NULL); - ASSERT(trans != NULL); - ASSERT(bundle != NULL); + ASSERT(rx->local != NULL); call = rxrpc_alloc_call(gfp); if (!call) return ERR_PTR(-ENOMEM); + call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; sock_hold(&rx->sk); call->socket = rx; call->rx_data_post = 1; - ret = rxrpc_connect_call(rx, cp, trans, bundle, call, gfp); - if (ret < 0) { - kmem_cache_free(rxrpc_call_jar, call); - return ERR_PTR(ret); - } - /* Record copies of information for hashtable lookup */ call->family = rx->family; - call->local = call->conn->params.local; + call->local = rx->local; switch (call->family) { case AF_INET: - call->peer_ip.ipv4_addr = - call->conn->params.peer->srx.transport.sin.sin_addr.s_addr; + call->peer_ip.ipv4_addr = srx->transport.sin.sin_addr.s_addr; break; case AF_INET6: memcpy(call->peer_ip.ipv6_addr, - call->conn->params.peer->srx.transport.sin6.sin6_addr.in6_u.u6_addr8, + srx->transport.sin6.sin6_addr.in6_u.u6_addr8, sizeof(call->peer_ip.ipv6_addr)); break; } - call->epoch = call->conn->proto.epoch; - call->service_id = call->conn->params.service_id; - call->in_clientflag = call->conn->proto.in_clientflag; + + call->service_id = srx->srx_service; + call->in_clientflag = 0; + + _leave(" = %p", call); + return call; +} + +/* + * Begin client call. + */ +static int rxrpc_begin_client_call(struct rxrpc_call *call, + struct rxrpc_conn_parameters *cp, + struct rxrpc_transport *trans, + struct sockaddr_rxrpc *srx, + gfp_t gfp) +{ + int ret; + + /* Set up or get a connection record and set the protocol parameters, + * including channel number and call ID. + */ + ret = rxrpc_connect_call(call, cp, trans, srx, gfp); + if (ret < 0) + return ret; + + call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; + /* Add the new call to the hashtable */ rxrpc_call_hash_add(call); @@ -340,9 +357,7 @@ static struct rxrpc_call *rxrpc_alloc_client_call( call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime; add_timer(&call->lifetimer); - - _leave(" = %p", call); - return call; + return 0; } /* @@ -352,23 +367,23 @@ static struct rxrpc_call *rxrpc_alloc_client_call( struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, struct rxrpc_conn_parameters *cp, struct rxrpc_transport *trans, - struct rxrpc_conn_bundle *bundle, + struct sockaddr_rxrpc *srx, unsigned long user_call_ID, gfp_t gfp) { struct rxrpc_call *call, *xcall; struct rb_node *parent, **pp; + int ret; - _enter("%p,%d,%d,%lx", - rx, trans->debug_id, bundle ? bundle->debug_id : -1, - user_call_ID); + _enter("%p,%lx", rx, user_call_ID); - call = rxrpc_alloc_client_call(rx, cp, trans, bundle, gfp); + call = rxrpc_alloc_client_call(rx, cp, srx, gfp); if (IS_ERR(call)) { _leave(" = %ld", PTR_ERR(call)); return call; } + /* Publish the call, even though it is incompletely set up as yet */ call->user_call_ID = user_call_ID; __set_bit(RXRPC_CALL_HAS_USERID, &call->flags); @@ -398,11 +413,29 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, list_add_tail(&call->link, &rxrpc_calls); write_unlock_bh(&rxrpc_call_lock); + ret = rxrpc_begin_client_call(call, cp, trans, srx, gfp); + if (ret < 0) + goto error; + _net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id); _leave(" = %p [new]", call); return call; +error: + write_lock(&rx->call_lock); + rb_erase(&call->sock_node, &rx->calls); + write_unlock(&rx->call_lock); + rxrpc_put_call(call); + + write_lock_bh(&rxrpc_call_lock); + list_del(&call->link); + write_unlock_bh(&rxrpc_call_lock); + + rxrpc_put_call(call); + _leave(" = %d", ret); + return ERR_PTR(ret); + /* We unexpectedly found the user ID in the list after taking * the call_lock. This shouldn't happen unless the user races * with itself and tries to add the same user ID twice at the @@ -612,40 +645,13 @@ void rxrpc_release_call(struct rxrpc_call *call) write_unlock_bh(&rx->call_lock); /* free up the channel for reuse */ - spin_lock(&conn->trans->client_lock); + spin_lock(&conn->channel_lock); write_lock_bh(&conn->lock); write_lock(&call->state_lock); - if (conn->channels[call->channel] == call) - conn->channels[call->channel] = NULL; - - if (conn->out_clientflag && conn->bundle) { - conn->avail_calls++; - switch (conn->avail_calls) { - case 1: - list_move_tail(&conn->bundle_link, - &conn->bundle->avail_conns); - case 2 ... RXRPC_MAXCALLS - 1: - ASSERT(conn->channels[0] == NULL || - conn->channels[1] == NULL || - conn->channels[2] == NULL || - conn->channels[3] == NULL); - break; - case RXRPC_MAXCALLS: - list_move_tail(&conn->bundle_link, - &conn->bundle->unused_conns); - ASSERT(conn->channels[0] == NULL && - conn->channels[1] == NULL && - conn->channels[2] == NULL && - conn->channels[3] == NULL); - break; - default: - pr_err("conn->avail_calls=%d\n", conn->avail_calls); - BUG(); - } - } + rxrpc_disconnect_call(call); - spin_unlock(&conn->trans->client_lock); + spin_unlock(&conn->channel_lock); if (call->state < RXRPC_CALL_COMPLETE && call->state != RXRPC_CALL_CLIENT_FINAL_ACK) { diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 2cccb4be289d..82488d6adb83 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -33,9 +33,7 @@ static DEFINE_SPINLOCK(rxrpc_conn_id_lock); * client conns away from the current allocation point to try and keep the IDs * concentrated. We will also need to retire connections from an old epoch. */ -int rxrpc_get_client_connection_id(struct rxrpc_connection *conn, - struct rxrpc_transport *trans, - gfp_t gfp) +int rxrpc_get_client_connection_id(struct rxrpc_connection *conn, gfp_t gfp) { u32 epoch; int id; @@ -43,7 +41,6 @@ int rxrpc_get_client_connection_id(struct rxrpc_connection *conn, _enter(""); idr_preload(gfp); - write_lock_bh(&trans->conn_lock); spin_lock(&rxrpc_conn_id_lock); epoch = rxrpc_epoch; @@ -68,7 +65,6 @@ int rxrpc_get_client_connection_id(struct rxrpc_connection *conn, rxrpc_client_conn_ids.cur = id + 1; spin_unlock(&rxrpc_conn_id_lock); - write_unlock_bh(&trans->conn_lock); idr_preload_end(); conn->proto.epoch = epoch; @@ -79,7 +75,6 @@ int rxrpc_get_client_connection_id(struct rxrpc_connection *conn, error: spin_unlock(&rxrpc_conn_id_lock); - write_unlock_bh(&trans->conn_lock); idr_preload_end(); _leave(" = %d", id); return id; diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 1754f2e2e16b..276ff505394f 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -31,152 +31,6 @@ LIST_HEAD(rxrpc_connections); DEFINE_RWLOCK(rxrpc_connection_lock); static DECLARE_DELAYED_WORK(rxrpc_connection_reap, rxrpc_connection_reaper); -/* - * allocate a new client connection bundle - */ -static struct rxrpc_conn_bundle *rxrpc_alloc_bundle(gfp_t gfp) -{ - struct rxrpc_conn_bundle *bundle; - - _enter(""); - - bundle = kzalloc(sizeof(struct rxrpc_conn_bundle), gfp); - if (bundle) { - INIT_LIST_HEAD(&bundle->unused_conns); - INIT_LIST_HEAD(&bundle->avail_conns); - INIT_LIST_HEAD(&bundle->busy_conns); - init_waitqueue_head(&bundle->chanwait); - atomic_set(&bundle->usage, 1); - } - - _leave(" = %p", bundle); - return bundle; -} - -/* - * compare bundle parameters with what we're looking for - * - return -ve, 0 or +ve - */ -static inline -int rxrpc_cmp_bundle(const struct rxrpc_conn_bundle *bundle, - struct key *key, u16 service_id) -{ - return (bundle->service_id - service_id) ?: - ((unsigned long)bundle->key - (unsigned long)key); -} - -/* - * get bundle of client connections that a client socket can make use of - */ -struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *rx, - struct rxrpc_transport *trans, - struct key *key, - u16 service_id, - gfp_t gfp) -{ - struct rxrpc_conn_bundle *bundle, *candidate; - struct rb_node *p, *parent, **pp; - - _enter("%p{%x},%x,%hx,", - rx, key_serial(key), trans->debug_id, service_id); - - /* search the extant bundles first for one that matches the specified - * user ID */ - spin_lock(&trans->client_lock); - - p = trans->bundles.rb_node; - while (p) { - bundle = rb_entry(p, struct rxrpc_conn_bundle, node); - - if (rxrpc_cmp_bundle(bundle, key, service_id) < 0) - p = p->rb_left; - else if (rxrpc_cmp_bundle(bundle, key, service_id) > 0) - p = p->rb_right; - else - goto found_extant_bundle; - } - - spin_unlock(&trans->client_lock); - - /* not yet present - create a candidate for a new record and then - * redo the search */ - candidate = rxrpc_alloc_bundle(gfp); - if (!candidate) { - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); - } - - candidate->key = key_get(key); - candidate->service_id = service_id; - - spin_lock(&trans->client_lock); - - pp = &trans->bundles.rb_node; - parent = NULL; - while (*pp) { - parent = *pp; - bundle = rb_entry(parent, struct rxrpc_conn_bundle, node); - - if (rxrpc_cmp_bundle(bundle, key, service_id) < 0) - pp = &(*pp)->rb_left; - else if (rxrpc_cmp_bundle(bundle, key, service_id) > 0) - pp = &(*pp)->rb_right; - else - goto found_extant_second; - } - - /* second search also failed; add the new bundle */ - bundle = candidate; - candidate = NULL; - - rb_link_node(&bundle->node, parent, pp); - rb_insert_color(&bundle->node, &trans->bundles); - spin_unlock(&trans->client_lock); - _net("BUNDLE new on trans %d", trans->debug_id); - _leave(" = %p [new]", bundle); - return bundle; - - /* we found the bundle in the list immediately */ -found_extant_bundle: - atomic_inc(&bundle->usage); - spin_unlock(&trans->client_lock); - _net("BUNDLE old on trans %d", trans->debug_id); - _leave(" = %p [extant %d]", bundle, atomic_read(&bundle->usage)); - return bundle; - - /* we found the bundle on the second time through the list */ -found_extant_second: - atomic_inc(&bundle->usage); - spin_unlock(&trans->client_lock); - kfree(candidate); - _net("BUNDLE old2 on trans %d", trans->debug_id); - _leave(" = %p [second %d]", bundle, atomic_read(&bundle->usage)); - return bundle; -} - -/* - * release a bundle - */ -void rxrpc_put_bundle(struct rxrpc_transport *trans, - struct rxrpc_conn_bundle *bundle) -{ - _enter("%p,%p{%d}",trans, bundle, atomic_read(&bundle->usage)); - - if (atomic_dec_and_lock(&bundle->usage, &trans->client_lock)) { - _debug("Destroy bundle"); - rb_erase(&bundle->node, &trans->bundles); - spin_unlock(&trans->client_lock); - ASSERT(list_empty(&bundle->unused_conns)); - ASSERT(list_empty(&bundle->avail_conns)); - ASSERT(list_empty(&bundle->busy_conns)); - ASSERTCMP(bundle->num_conns, ==, 0); - key_put(bundle->key); - kfree(bundle); - } - - _leave(""); -} - /* * allocate a new connection */ @@ -188,8 +42,10 @@ static struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) conn = kzalloc(sizeof(struct rxrpc_connection), gfp); if (conn) { + spin_lock_init(&conn->channel_lock); + init_waitqueue_head(&conn->channel_wq); INIT_WORK(&conn->processor, &rxrpc_process_connection); - INIT_LIST_HEAD(&conn->bundle_link); + INIT_LIST_HEAD(&conn->link); conn->calls = RB_ROOT; skb_queue_head_init(&conn->rx_queue); conn->security = &rxrpc_no_security; @@ -197,7 +53,7 @@ static struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) spin_lock_init(&conn->state_lock); atomic_set(&conn->usage, 1); conn->debug_id = atomic_inc_return(&rxrpc_debug_id); - conn->avail_calls = RXRPC_MAXCALLS; + atomic_set(&conn->avail_chans, RXRPC_MAXCALLS); conn->size_align = 4; conn->header_size = sizeof(struct rxrpc_wire_header); } @@ -240,7 +96,8 @@ static void rxrpc_add_call_ID_to_conn(struct rxrpc_connection *conn, } /* - * Allocate a client connection. + * Allocate a client connection. The caller must take care to clear any + * padding bytes in *cp. */ static struct rxrpc_connection * rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, @@ -275,7 +132,7 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, break; } - ret = rxrpc_get_client_connection_id(conn, trans, gfp); + ret = rxrpc_get_client_connection_id(conn, gfp); if (ret < 0) goto error_0; @@ -290,6 +147,8 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, write_unlock(&rxrpc_connection_lock); key_get(conn->params.key); + conn->trans = trans; + atomic_inc(&trans->usage); _leave(" = %p", conn); return conn; @@ -302,218 +161,174 @@ error_0: return ERR_PTR(ret); } -/* - * connect a call on an exclusive connection - */ -static int rxrpc_connect_exclusive(struct rxrpc_sock *rx, - struct rxrpc_conn_parameters *cp, - struct rxrpc_transport *trans, - struct rxrpc_call *call, - gfp_t gfp) -{ - struct rxrpc_connection *conn; - int chan; - - _enter(""); - - conn = rxrpc_alloc_client_connection(cp, trans, gfp); - if (IS_ERR(conn)) { - _leave(" = %ld", PTR_ERR(conn)); - return PTR_ERR(conn); - } - - atomic_inc(&trans->usage); - conn->trans = trans; - conn->bundle = NULL; - - _net("CONNECT EXCL new %d on TRANS %d", - conn->debug_id, conn->trans->debug_id); - - /* Since no one else can use the connection, we just use the first - * channel. - */ - chan = 0; - rxrpc_get_connection(conn); - conn->avail_calls = RXRPC_MAXCALLS - 1; - conn->channels[chan] = call; - conn->call_counter = 1; - call->conn = conn; - call->channel = chan; - call->cid = conn->proto.cid | chan; - call->call_id = 1; - - _net("CONNECT client on conn %d chan %d as call %x", - conn->debug_id, chan, call->call_id); - - rxrpc_add_call_ID_to_conn(conn, call); - _leave(" = 0"); - return 0; -} - /* * find a connection for a call * - called in process context with IRQs enabled */ -int rxrpc_connect_call(struct rxrpc_sock *rx, +int rxrpc_connect_call(struct rxrpc_call *call, struct rxrpc_conn_parameters *cp, struct rxrpc_transport *trans, - struct rxrpc_conn_bundle *bundle, - struct rxrpc_call *call, + struct sockaddr_rxrpc *srx, gfp_t gfp) { - struct rxrpc_connection *conn, *candidate; + struct rxrpc_connection *conn, *candidate = NULL; + struct rxrpc_local *local = cp->local; + struct rb_node *p, **pp, *parent; + long diff; int chan; DECLARE_WAITQUEUE(myself, current); - _enter("%p,%lx,", rx, call->user_call_ID); - - if (cp->exclusive) - return rxrpc_connect_exclusive(rx, cp, trans, call, gfp); - - spin_lock(&trans->client_lock); - for (;;) { - /* see if the bundle has a call slot available */ - if (!list_empty(&bundle->avail_conns)) { - _debug("avail"); - conn = list_entry(bundle->avail_conns.next, - struct rxrpc_connection, - bundle_link); - if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { - list_del_init(&conn->bundle_link); - bundle->num_conns--; - continue; - } - if (--conn->avail_calls == 0) - list_move(&conn->bundle_link, - &bundle->busy_conns); - ASSERTCMP(conn->avail_calls, <, RXRPC_MAXCALLS); - ASSERT(conn->channels[0] == NULL || - conn->channels[1] == NULL || - conn->channels[2] == NULL || - conn->channels[3] == NULL); - rxrpc_get_connection(conn); - break; - } + _enter("{%d,%lx},", call->debug_id, call->user_call_ID); - if (!list_empty(&bundle->unused_conns)) { - _debug("unused"); - conn = list_entry(bundle->unused_conns.next, - struct rxrpc_connection, - bundle_link); - if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { - list_del_init(&conn->bundle_link); - bundle->num_conns--; - continue; - } - ASSERTCMP(conn->avail_calls, ==, RXRPC_MAXCALLS); - conn->avail_calls = RXRPC_MAXCALLS - 1; - ASSERT(conn->channels[0] == NULL && - conn->channels[1] == NULL && - conn->channels[2] == NULL && - conn->channels[3] == NULL); - rxrpc_get_connection(conn); - list_move(&conn->bundle_link, &bundle->avail_conns); - break; + cp->peer = trans->peer; + rxrpc_get_peer(cp->peer); + + if (!cp->exclusive) { + /* Search for a existing client connection unless this is going + * to be a connection that's used exclusively for a single call. + */ + _debug("search 1"); + spin_lock(&local->client_conns_lock); + p = local->client_conns.rb_node; + while (p) { + conn = rb_entry(p, struct rxrpc_connection, client_node); + +#define cmp(X) ((long)conn->params.X - (long)cp->X) + diff = (cmp(peer) ?: + cmp(key) ?: + cmp(security_level)); + if (diff < 0) + p = p->rb_left; + else if (diff > 0) + p = p->rb_right; + else + goto found_extant_conn; } + spin_unlock(&local->client_conns_lock); + } - /* need to allocate a new connection */ - _debug("get new conn [%d]", bundle->num_conns); + /* We didn't find a connection or we want an exclusive one. */ + _debug("get new conn"); + candidate = rxrpc_alloc_client_connection(cp, trans, gfp); + if (!candidate) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } - spin_unlock(&trans->client_lock); + if (cp->exclusive) { + /* Assign the call on an exclusive connection to channel 0 and + * don't add the connection to the endpoint's shareable conn + * lookup tree. + */ + _debug("exclusive chan 0"); + conn = candidate; + atomic_set(&conn->avail_chans, RXRPC_MAXCALLS - 1); + spin_lock(&conn->channel_lock); + chan = 0; + goto found_channel; + } - if (signal_pending(current)) - goto interrupted; + /* We need to redo the search before attempting to add a new connection + * lest we race with someone else adding a conflicting instance. + */ + _debug("search 2"); + spin_lock(&local->client_conns_lock); - if (bundle->num_conns >= 20) { - _debug("too many conns"); + pp = &local->client_conns.rb_node; + parent = NULL; + while (*pp) { + parent = *pp; + conn = rb_entry(parent, struct rxrpc_connection, client_node); - if (!gfpflags_allow_blocking(gfp)) { - _leave(" = -EAGAIN"); - return -EAGAIN; - } + diff = (cmp(peer) ?: + cmp(key) ?: + cmp(security_level)); + if (diff < 0) + pp = &(*pp)->rb_left; + else if (diff > 0) + pp = &(*pp)->rb_right; + else + goto found_extant_conn; + } - add_wait_queue(&bundle->chanwait, &myself); - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (bundle->num_conns < 20 || - !list_empty(&bundle->unused_conns) || - !list_empty(&bundle->avail_conns)) - break; - if (signal_pending(current)) - goto interrupted_dequeue; - schedule(); - } - remove_wait_queue(&bundle->chanwait, &myself); - __set_current_state(TASK_RUNNING); - spin_lock(&trans->client_lock); - continue; - } + /* The second search also failed; simply add the new connection with + * the new call in channel 0. Note that we need to take the channel + * lock before dropping the client conn lock. + */ + _debug("new conn"); + conn = candidate; + candidate = NULL; - /* not yet present - create a candidate for a new connection and then - * redo the check */ - candidate = rxrpc_alloc_client_connection(cp, trans, gfp); - if (!candidate) { - _leave(" = -ENOMEM"); - return -ENOMEM; - } + rb_link_node(&conn->client_node, parent, pp); + rb_insert_color(&conn->client_node, &local->client_conns); + + atomic_set(&conn->avail_chans, RXRPC_MAXCALLS - 1); + spin_lock(&conn->channel_lock); + spin_unlock(&local->client_conns_lock); + chan = 0; + +found_channel: + _debug("found chan"); + call->conn = conn; + call->channel = chan; + call->epoch = conn->proto.epoch; + call->cid = conn->proto.cid | chan; + call->call_id = ++conn->call_counter; + rcu_assign_pointer(conn->channels[chan], call); - atomic_inc(&bundle->usage); - atomic_inc(&trans->usage); - candidate->trans = trans; - candidate->bundle = bundle; + _net("CONNECT call %d on conn %d", call->debug_id, conn->debug_id); - spin_lock(&trans->client_lock); + rxrpc_add_call_ID_to_conn(conn, call); + spin_unlock(&conn->channel_lock); + _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage)); + return 0; + + /* We found a suitable connection already in existence. Discard any + * candidate we may have allocated, and try to get a channel on this + * one. + */ +found_extant_conn: + _debug("found conn"); + rxrpc_get_connection(conn); + spin_unlock(&local->client_conns_lock); - list_add(&candidate->bundle_link, &bundle->unused_conns); - bundle->num_conns++; + rxrpc_put_connection(candidate); - _net("CONNECT new %d on TRANS %d", - candidate->debug_id, candidate->trans->debug_id); + if (!atomic_add_unless(&conn->avail_chans, -1, 0)) { + if (!gfpflags_allow_blocking(gfp)) { + rxrpc_put_connection(conn); + _leave(" = -EAGAIN"); + return -EAGAIN; + } - /* leave the candidate lurking in zombie mode attached to the - * bundle until we're ready for it */ - rxrpc_put_connection(candidate); - candidate = NULL; + add_wait_queue(&conn->channel_wq, &myself); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (atomic_add_unless(&conn->avail_chans, -1, 0)) + break; + if (signal_pending(current)) + goto interrupted; + schedule(); + } + remove_wait_queue(&conn->channel_wq, &myself); + __set_current_state(TASK_RUNNING); } - /* we've got a connection with a free channel and we can now attach the - * call to it - * - we're holding the transport's client lock - * - we're holding a reference on the connection - * - we're holding a reference on the bundle + /* The connection allegedly now has a free channel and we can now + * attach the call to it. */ + spin_lock(&conn->channel_lock); + for (chan = 0; chan < RXRPC_MAXCALLS; chan++) if (!conn->channels[chan]) goto found_channel; - ASSERT(conn->channels[0] == NULL || - conn->channels[1] == NULL || - conn->channels[2] == NULL || - conn->channels[3] == NULL); BUG(); -found_channel: - conn->channels[chan] = call; - call->conn = conn; - call->channel = chan; - call->cid = conn->proto.cid | chan; - call->call_id = ++conn->call_counter; - - _net("CONNECT client on conn %d chan %d as call %x", - conn->debug_id, chan, call->call_id); - - ASSERTCMP(conn->avail_calls, <, RXRPC_MAXCALLS); - spin_unlock(&trans->client_lock); - - rxrpc_add_call_ID_to_conn(conn, call); - - _leave(" = 0"); - return 0; - -interrupted_dequeue: - remove_wait_queue(&bundle->chanwait, &myself); - __set_current_state(TASK_RUNNING); interrupted: + remove_wait_queue(&conn->channel_wq, &myself); + __set_current_state(TASK_RUNNING); + rxrpc_put_connection(conn); _leave(" = -ERESTARTSYS"); return -ERESTARTSYS; } @@ -521,8 +336,8 @@ interrupted: /* * get a record of an incoming connection */ -struct rxrpc_connection * -rxrpc_incoming_connection(struct rxrpc_transport *trans, struct sk_buff *skb) +struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_transport *trans, + struct sk_buff *skb) { struct rxrpc_connection *conn, *candidate = NULL; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); @@ -543,7 +358,7 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans, struct sk_buff *skb) p = trans->server_conns.rb_node; while (p) { - conn = rb_entry(p, struct rxrpc_connection, node); + conn = rb_entry(p, struct rxrpc_connection, service_node); _debug("maybe %x", conn->proto.cid); @@ -588,7 +403,7 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans, struct sk_buff *skb) p = NULL; while (*pp) { p = *pp; - conn = rb_entry(p, struct rxrpc_connection, node); + conn = rb_entry(p, struct rxrpc_connection, service_node); if (epoch < conn->proto.epoch) pp = &(*pp)->rb_left; @@ -605,8 +420,8 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans, struct sk_buff *skb) /* we can now add the new candidate to the list */ conn = candidate; candidate = NULL; - rb_link_node(&conn->node, p, pp); - rb_insert_color(&conn->node, &trans->server_conns); + rb_link_node(&conn->service_node, p, pp); + rb_insert_color(&conn->service_node, &trans->server_conns); atomic_inc(&conn->trans->usage); write_unlock_bh(&trans->conn_lock); @@ -672,7 +487,7 @@ struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *trans, if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) { p = trans->server_conns.rb_node; while (p) { - conn = rb_entry(p, struct rxrpc_connection, node); + conn = rb_entry(p, struct rxrpc_connection, service_node); _debug("maybe %x", conn->proto.cid); @@ -704,11 +519,32 @@ found: return conn; } +/* + * Disconnect a call and clear any channel it occupies when that call + * terminates. + */ +void rxrpc_disconnect_call(struct rxrpc_call *call) +{ + struct rxrpc_connection *conn = call->conn; + unsigned chan = call->channel; + + _enter("%d,%d", conn->debug_id, call->channel); + + if (conn->channels[chan] == call) { + rcu_assign_pointer(conn->channels[chan], NULL); + atomic_inc(&conn->avail_chans); + wake_up(&conn->channel_wq); + } +} + /* * release a virtual connection */ void rxrpc_put_connection(struct rxrpc_connection *conn) { + if (!conn) + return; + _enter("%p{u=%d,d=%d}", conn, atomic_read(&conn->usage), conn->debug_id); @@ -734,9 +570,6 @@ static void rxrpc_destroy_connection(struct rxrpc_connection *conn) _net("DESTROY CONN %d", conn->debug_id); - if (conn->bundle) - rxrpc_put_bundle(conn->trans, conn->bundle); - ASSERT(RB_EMPTY_ROOT(&conn->calls)); rxrpc_purge_queue(&conn->rx_queue); @@ -773,30 +606,39 @@ static void rxrpc_connection_reaper(struct work_struct *work) if (likely(atomic_read(&conn->usage) > 0)) continue; - spin_lock(&conn->trans->client_lock); - write_lock_bh(&conn->trans->conn_lock); - reap_time = conn->put_time + rxrpc_connection_expiry; + if (rxrpc_conn_is_client(conn)) { + struct rxrpc_local *local = conn->params.local; + spin_lock(&local->client_conns_lock); + reap_time = conn->put_time + rxrpc_connection_expiry; - if (atomic_read(&conn->usage) > 0) { - ; - } else if (reap_time <= now) { - list_move_tail(&conn->link, &graveyard); - if (conn->out_clientflag) + if (atomic_read(&conn->usage) > 0) { + ; + } else if (reap_time <= now) { + list_move_tail(&conn->link, &graveyard); rxrpc_put_client_connection_id(conn); - else - rb_erase(&conn->node, + rb_erase(&conn->client_node, + &local->client_conns); + } else if (reap_time < earliest) { + earliest = reap_time; + } + + spin_unlock(&local->client_conns_lock); + } else { + write_lock_bh(&conn->trans->conn_lock); + reap_time = conn->put_time + rxrpc_connection_expiry; + + if (atomic_read(&conn->usage) > 0) { + ; + } else if (reap_time <= now) { + list_move_tail(&conn->link, &graveyard); + rb_erase(&conn->service_node, &conn->trans->server_conns); - if (conn->bundle) { - list_del_init(&conn->bundle_link); - conn->bundle->num_conns--; + } else if (reap_time < earliest) { + earliest = reap_time; } - } else if (reap_time < earliest) { - earliest = reap_time; + write_unlock_bh(&conn->trans->conn_lock); } - - write_unlock_bh(&conn->trans->conn_lock); - spin_unlock(&conn->trans->client_lock); } write_unlock(&rxrpc_connection_lock); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 5703b0d18ed4..3ab7764f7cd8 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -80,7 +80,8 @@ static struct rxrpc_local *rxrpc_alloc_local(const struct sockaddr_rxrpc *srx) skb_queue_head_init(&local->accept_queue); skb_queue_head_init(&local->reject_queue); skb_queue_head_init(&local->event_queue); - mutex_init(&local->conn_lock); + local->client_conns = RB_ROOT; + spin_lock_init(&local->client_conns_lock); spin_lock_init(&local->lock); rwlock_init(&local->services_lock); local->debug_id = atomic_inc_return(&rxrpc_debug_id); @@ -294,6 +295,7 @@ static void rxrpc_local_destroyer(struct rxrpc_local *local) list_del_init(&local->link); mutex_unlock(&rxrpc_local_mutex); + ASSERT(RB_EMPTY_ROOT(&local->client_conns)); ASSERT(list_empty(&local->services)); if (socket) { diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index db3933cf6b97..8e24939aeac8 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -140,7 +140,6 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, unsigned long user_call_ID, bool exclusive) { struct rxrpc_conn_parameters cp; - struct rxrpc_conn_bundle *bundle; struct rxrpc_transport *trans; struct rxrpc_call *call; struct key *key; @@ -171,16 +170,8 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, } cp.peer = trans->peer; - bundle = rxrpc_get_bundle(rx, trans, cp.key, srx->srx_service, - GFP_KERNEL); - if (IS_ERR(bundle)) { - ret = PTR_ERR(bundle); - goto out_trans; - } - - call = rxrpc_new_client_call(rx, &cp, trans, bundle, user_call_ID, + call = rxrpc_new_client_call(rx, &cp, trans, srx, user_call_ID, GFP_KERNEL); - rxrpc_put_bundle(trans, bundle); rxrpc_put_transport(trans); if (IS_ERR(call)) { ret = PTR_ERR(call); diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c index 140628d94bb0..71947402d071 100644 --- a/net/rxrpc/transport.c +++ b/net/rxrpc/transport.c @@ -46,9 +46,7 @@ static struct rxrpc_transport *rxrpc_alloc_transport(struct rxrpc_local *local, trans->local = local; trans->peer = peer; INIT_LIST_HEAD(&trans->link); - trans->bundles = RB_ROOT; trans->server_conns = RB_ROOT; - spin_lock_init(&trans->client_lock); rwlock_init(&trans->conn_lock); atomic_set(&trans->usage, 1); trans->debug_id = atomic_inc_return(&rxrpc_debug_id); -- cgit v1.2.3 From aa390bbe2113dd0de99cf35c39d7701d4412b744 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 17 Jun 2016 10:06:56 +0100 Subject: rxrpc: Kill off the rxrpc_transport struct The rxrpc_transport struct is now redundant, given that the rxrpc_peer struct is now per peer port rather than per peer host, so get rid of it. Service connection lists are transferred to the rxrpc_peer struct, as is the conn_lock. Previous patches moved the client connection handling out of the rxrpc_transport struct and discarded the connection bundling code. Signed-off-by: David Howells --- net/rxrpc/Makefile | 1 - net/rxrpc/af_rxrpc.c | 46 +-------- net/rxrpc/ar-internal.h | 57 ++--------- net/rxrpc/call_accept.c | 11 +- net/rxrpc/call_object.c | 16 ++- net/rxrpc/conn_object.c | 78 +++++++------- net/rxrpc/input.c | 8 +- net/rxrpc/output.c | 24 +---- net/rxrpc/peer_object.c | 2 + net/rxrpc/sysctl.c | 8 -- net/rxrpc/transport.c | 265 ------------------------------------------------ 11 files changed, 65 insertions(+), 451 deletions(-) delete mode 100644 net/rxrpc/transport.c (limited to 'net') diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index cfa221536f33..6522e50fb750 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -22,7 +22,6 @@ af-rxrpc-y := \ recvmsg.o \ security.o \ skbuff.o \ - transport.o \ utils.o af-rxrpc-$(CONFIG_PROC_FS) += proc.o diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index f3b6ed8196c3..5d3e795a7c48 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -224,37 +224,6 @@ static int rxrpc_listen(struct socket *sock, int backlog) return ret; } -/* - * find a transport by address - */ -struct rxrpc_transport * -rxrpc_name_to_transport(struct rxrpc_conn_parameters *cp, - struct sockaddr *addr, - int addr_len, - gfp_t gfp) -{ - struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *) addr; - struct rxrpc_transport *trans; - - _enter("%p,%d", addr, addr_len); - - if (cp->local->srx.transport_type != srx->transport_type) - return ERR_PTR(-ESOCKTNOSUPPORT); - if (cp->local->srx.transport.family != srx->transport.family) - return ERR_PTR(-EAFNOSUPPORT); - - /* find a remote transport endpoint from the local one */ - cp->peer = rxrpc_lookup_peer(cp->local, srx, gfp); - if (!cp->peer) - return ERR_PTR(-ENOMEM); - - /* find a transport */ - trans = rxrpc_get_transport(cp->local, cp->peer, gfp); - rxrpc_put_peer(cp->peer); - _leave(" = %p", trans); - return trans; -} - /** * rxrpc_kernel_begin_call - Allow a kernel service to begin a call * @sock: The socket on which to make the call @@ -276,7 +245,6 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, gfp_t gfp) { struct rxrpc_conn_parameters cp; - struct rxrpc_transport *trans; struct rxrpc_call *call; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); int ret; @@ -300,19 +268,8 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, cp.security_level = 0; cp.exclusive = false; cp.service_id = srx->srx_service; + call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, gfp); - trans = rxrpc_name_to_transport(&cp, (struct sockaddr *)srx, - sizeof(*srx), gfp); - if (IS_ERR(trans)) { - call = ERR_CAST(trans); - trans = NULL; - goto out_notrans; - } - cp.peer = trans->peer; - - call = rxrpc_new_client_call(rx, &cp, trans, srx, user_call_ID, gfp); - rxrpc_put_transport(trans); -out_notrans: release_sock(&rx->sk); _leave(" = %p", call); return call; @@ -831,7 +788,6 @@ static void __exit af_rxrpc_exit(void) proto_unregister(&rxrpc_proto); rxrpc_destroy_all_calls(); rxrpc_destroy_all_connections(); - rxrpc_destroy_all_transports(); ASSERTCMP(atomic_read(&rxrpc_n_skbs), ==, 0); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 26fe137d62bb..702db72196fb 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -207,6 +207,8 @@ struct rxrpc_peer { struct rxrpc_local *local; struct hlist_head error_targets; /* targets for net error distribution */ struct work_struct error_distributor; + struct rb_root service_conns; /* Service connections */ + rwlock_t conn_lock; spinlock_t lock; /* access lock */ unsigned int if_mtu; /* interface MTU for this peer */ unsigned int mtu; /* network MTU for this peer */ @@ -225,22 +227,6 @@ struct rxrpc_peer { suseconds_t rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* calculated RTT cache */ }; -/* - * RxRPC point-to-point transport / connection manager definition - * - handles a bundle of connections between two endpoints - * - matched by { local, peer } - */ -struct rxrpc_transport { - struct rxrpc_local *local; /* local transport endpoint */ - struct rxrpc_peer *peer; /* remote transport endpoint */ - struct rb_root server_conns; /* server connections on this transport */ - struct list_head link; /* link in master session list */ - unsigned long put_time; /* time at which to reap */ - rwlock_t conn_lock; /* lock for active/dead connections */ - atomic_t usage; - int debug_id; /* debug ID for printks */ -}; - /* * Keys for matching a connection. */ @@ -271,11 +257,10 @@ struct rxrpc_conn_parameters { /* * RxRPC connection definition - * - matched by { transport, service_id, conn_id, direction, key } + * - matched by { local, peer, epoch, conn_id, direction } * - each connection can only handle four simultaneous calls */ struct rxrpc_connection { - struct rxrpc_transport *trans; /* transport session */ struct rxrpc_conn_proto proto; struct rxrpc_conn_parameters params; @@ -286,7 +271,7 @@ struct rxrpc_connection { struct work_struct processor; /* connection event processor */ union { struct rb_node client_node; /* Node in local->client_conns */ - struct rb_node service_node; /* Node in trans->server_conns */ + struct rb_node service_node; /* Node in peer->service_conns */ }; struct list_head link; /* link in master connection list */ struct rb_root calls; /* calls on this connection */ @@ -494,10 +479,6 @@ extern u32 rxrpc_epoch; extern atomic_t rxrpc_debug_id; extern struct workqueue_struct *rxrpc_workqueue; -extern struct rxrpc_transport *rxrpc_name_to_transport(struct rxrpc_conn_parameters *, - struct sockaddr *, - int, gfp_t); - /* * call_accept.c */ @@ -526,7 +507,6 @@ struct rxrpc_call *rxrpc_find_call_hash(struct rxrpc_host_header *, struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long); struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, - struct rxrpc_transport *, struct sockaddr_rxrpc *, unsigned long, gfp_t); struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *, @@ -560,15 +540,16 @@ extern struct list_head rxrpc_connections; extern rwlock_t rxrpc_connection_lock; int rxrpc_connect_call(struct rxrpc_call *, struct rxrpc_conn_parameters *, - struct rxrpc_transport *, struct sockaddr_rxrpc *, gfp_t); +struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_local *, + struct rxrpc_peer *, + struct sk_buff *); void rxrpc_disconnect_call(struct rxrpc_call *); void rxrpc_put_connection(struct rxrpc_connection *); void __exit rxrpc_destroy_all_connections(void); -struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *, - struct sk_buff *); -extern struct rxrpc_connection * -rxrpc_incoming_connection(struct rxrpc_transport *, struct sk_buff *); +struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *, + struct rxrpc_peer *, + struct sk_buff *); static inline bool rxrpc_conn_is_client(const struct rxrpc_connection *conn) { @@ -585,12 +566,6 @@ static inline void rxrpc_get_connection(struct rxrpc_connection *conn) atomic_inc(&conn->usage); } -static inline -struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *conn) -{ - return atomic_inc_not_zero(&conn->usage) ? conn : NULL; -} - /* * input.c */ @@ -744,18 +719,6 @@ static inline int __init rxrpc_sysctl_init(void) { return 0; } static inline void rxrpc_sysctl_exit(void) {} #endif -/* - * transport.c - */ -extern unsigned int rxrpc_transport_expiry; - -struct rxrpc_transport *rxrpc_get_transport(struct rxrpc_local *, - struct rxrpc_peer *, gfp_t); -void rxrpc_put_transport(struct rxrpc_transport *); -void __exit rxrpc_destroy_all_transports(void); -struct rxrpc_transport *rxrpc_find_transport(struct rxrpc_local *, - struct rxrpc_peer *); - /* * utils.c */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 833ad0622b61..202e053a3c6d 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -74,7 +74,6 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local, struct sockaddr_rxrpc *srx) { struct rxrpc_connection *conn; - struct rxrpc_transport *trans; struct rxrpc_skb_priv *sp, *nsp; struct rxrpc_peer *peer; struct rxrpc_call *call; @@ -102,16 +101,8 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local, goto error; } - trans = rxrpc_get_transport(local, peer, GFP_NOIO); + conn = rxrpc_incoming_connection(local, peer, skb); rxrpc_put_peer(peer); - if (IS_ERR(trans)) { - _debug("no trans"); - ret = -EBUSY; - goto error; - } - - conn = rxrpc_incoming_connection(trans, skb); - rxrpc_put_transport(trans); if (IS_ERR(conn)) { _debug("no conn"); ret = PTR_ERR(conn); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 9b3b48abe12f..ad933daae13b 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -286,11 +286,9 @@ static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) /* * Allocate a new client call. */ -static struct rxrpc_call *rxrpc_alloc_client_call( - struct rxrpc_sock *rx, - struct rxrpc_conn_parameters *cp, - struct sockaddr_rxrpc *srx, - gfp_t gfp) +static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, + struct sockaddr_rxrpc *srx, + gfp_t gfp) { struct rxrpc_call *call; @@ -333,7 +331,6 @@ static struct rxrpc_call *rxrpc_alloc_client_call( */ static int rxrpc_begin_client_call(struct rxrpc_call *call, struct rxrpc_conn_parameters *cp, - struct rxrpc_transport *trans, struct sockaddr_rxrpc *srx, gfp_t gfp) { @@ -342,7 +339,7 @@ static int rxrpc_begin_client_call(struct rxrpc_call *call, /* Set up or get a connection record and set the protocol parameters, * including channel number and call ID. */ - ret = rxrpc_connect_call(call, cp, trans, srx, gfp); + ret = rxrpc_connect_call(call, cp, srx, gfp); if (ret < 0) return ret; @@ -366,7 +363,6 @@ static int rxrpc_begin_client_call(struct rxrpc_call *call, */ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, struct rxrpc_conn_parameters *cp, - struct rxrpc_transport *trans, struct sockaddr_rxrpc *srx, unsigned long user_call_ID, gfp_t gfp) @@ -377,7 +373,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, _enter("%p,%lx", rx, user_call_ID); - call = rxrpc_alloc_client_call(rx, cp, srx, gfp); + call = rxrpc_alloc_client_call(rx, srx, gfp); if (IS_ERR(call)) { _leave(" = %ld", PTR_ERR(call)); return call; @@ -413,7 +409,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, list_add_tail(&call->link, &rxrpc_calls); write_unlock_bh(&rxrpc_call_lock); - ret = rxrpc_begin_client_call(call, cp, trans, srx, gfp); + ret = rxrpc_begin_client_call(call, cp, srx, gfp); if (ret < 0) goto error; diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 276ff505394f..4bfad7cf96cb 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -100,9 +100,7 @@ static void rxrpc_add_call_ID_to_conn(struct rxrpc_connection *conn, * padding bytes in *cp. */ static struct rxrpc_connection * -rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, - struct rxrpc_transport *trans, - gfp_t gfp) +rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) { struct rxrpc_connection *conn; int ret; @@ -146,9 +144,10 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, list_add_tail(&conn->link, &rxrpc_connections); write_unlock(&rxrpc_connection_lock); + /* We steal the caller's peer ref. */ + cp->peer = NULL; + rxrpc_get_local(conn->params.local); key_get(conn->params.key); - conn->trans = trans; - atomic_inc(&trans->usage); _leave(" = %p", conn); return conn; @@ -167,7 +166,6 @@ error_0: */ int rxrpc_connect_call(struct rxrpc_call *call, struct rxrpc_conn_parameters *cp, - struct rxrpc_transport *trans, struct sockaddr_rxrpc *srx, gfp_t gfp) { @@ -181,8 +179,9 @@ int rxrpc_connect_call(struct rxrpc_call *call, _enter("{%d,%lx},", call->debug_id, call->user_call_ID); - cp->peer = trans->peer; - rxrpc_get_peer(cp->peer); + cp->peer = rxrpc_lookup_peer(cp->local, srx, gfp); + if (!cp->peer) + return -ENOMEM; if (!cp->exclusive) { /* Search for a existing client connection unless this is going @@ -210,7 +209,7 @@ int rxrpc_connect_call(struct rxrpc_call *call, /* We didn't find a connection or we want an exclusive one. */ _debug("get new conn"); - candidate = rxrpc_alloc_client_connection(cp, trans, gfp); + candidate = rxrpc_alloc_client_connection(cp, gfp); if (!candidate) { _leave(" = -ENOMEM"); return -ENOMEM; @@ -281,6 +280,8 @@ found_channel: rxrpc_add_call_ID_to_conn(conn, call); spin_unlock(&conn->channel_lock); + rxrpc_put_peer(cp->peer); + cp->peer = NULL; _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage)); return 0; @@ -329,6 +330,8 @@ interrupted: remove_wait_queue(&conn->channel_wq, &myself); __set_current_state(TASK_RUNNING); rxrpc_put_connection(conn); + rxrpc_put_peer(cp->peer); + cp->peer = NULL; _leave(" = -ERESTARTSYS"); return -ERESTARTSYS; } @@ -336,7 +339,8 @@ interrupted: /* * get a record of an incoming connection */ -struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_transport *trans, +struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *local, + struct rxrpc_peer *peer, struct sk_buff *skb) { struct rxrpc_connection *conn, *candidate = NULL; @@ -354,9 +358,9 @@ struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_transport *trans cid = sp->hdr.cid & RXRPC_CIDMASK; /* search the connection list first */ - read_lock_bh(&trans->conn_lock); + read_lock_bh(&peer->conn_lock); - p = trans->server_conns.rb_node; + p = peer->service_conns.rb_node; while (p) { conn = rb_entry(p, struct rxrpc_connection, service_node); @@ -373,7 +377,7 @@ struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_transport *trans else goto found_extant_connection; } - read_unlock_bh(&trans->conn_lock); + read_unlock_bh(&peer->conn_lock); /* not yet present - create a candidate for a new record and then * redo the search */ @@ -383,13 +387,12 @@ struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_transport *trans return ERR_PTR(-ENOMEM); } - candidate->trans = trans; - candidate->proto.local = trans->local; + candidate->proto.local = local; candidate->proto.epoch = sp->hdr.epoch; candidate->proto.cid = sp->hdr.cid & RXRPC_CIDMASK; candidate->proto.in_clientflag = RXRPC_CLIENT_INITIATED; - candidate->params.local = trans->local; - candidate->params.peer = trans->peer; + candidate->params.local = local; + candidate->params.peer = peer; candidate->params.service_id = sp->hdr.serviceId; candidate->security_ix = sp->hdr.securityIndex; candidate->out_clientflag = 0; @@ -397,9 +400,9 @@ struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_transport *trans if (candidate->params.service_id) candidate->state = RXRPC_CONN_SERVER_UNSECURED; - write_lock_bh(&trans->conn_lock); + write_lock_bh(&peer->conn_lock); - pp = &trans->server_conns.rb_node; + pp = &peer->service_conns.rb_node; p = NULL; while (*pp) { p = *pp; @@ -421,10 +424,11 @@ struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_transport *trans conn = candidate; candidate = NULL; rb_link_node(&conn->service_node, p, pp); - rb_insert_color(&conn->service_node, &trans->server_conns); - atomic_inc(&conn->trans->usage); + rb_insert_color(&conn->service_node, &peer->service_conns); + rxrpc_get_peer(peer); + rxrpc_get_local(local); - write_unlock_bh(&trans->conn_lock); + write_unlock_bh(&peer->conn_lock); write_lock(&rxrpc_connection_lock); list_add_tail(&conn->link, &rxrpc_connections); @@ -441,21 +445,21 @@ success: /* we found the connection in the list immediately */ found_extant_connection: if (sp->hdr.securityIndex != conn->security_ix) { - read_unlock_bh(&trans->conn_lock); + read_unlock_bh(&peer->conn_lock); goto security_mismatch; } rxrpc_get_connection(conn); - read_unlock_bh(&trans->conn_lock); + read_unlock_bh(&peer->conn_lock); goto success; /* we found the connection on the second time through the list */ found_extant_second: if (sp->hdr.securityIndex != conn->security_ix) { - write_unlock_bh(&trans->conn_lock); + write_unlock_bh(&peer->conn_lock); goto security_mismatch; } rxrpc_get_connection(conn); - write_unlock_bh(&trans->conn_lock); + write_unlock_bh(&peer->conn_lock); kfree(candidate); goto success; @@ -469,7 +473,8 @@ security_mismatch: * find a connection based on transport and RxRPC connection ID for an incoming * packet */ -struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *trans, +struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_local *local, + struct rxrpc_peer *peer, struct sk_buff *skb) { struct rxrpc_connection *conn; @@ -479,13 +484,13 @@ struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *trans, _enter(",{%x,%x}", sp->hdr.cid, sp->hdr.flags); - read_lock_bh(&trans->conn_lock); + read_lock_bh(&peer->conn_lock); cid = sp->hdr.cid & RXRPC_CIDMASK; epoch = sp->hdr.epoch; if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) { - p = trans->server_conns.rb_node; + p = peer->service_conns.rb_node; while (p) { conn = rb_entry(p, struct rxrpc_connection, service_node); @@ -508,13 +513,13 @@ struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *trans, goto found; } - read_unlock_bh(&trans->conn_lock); + read_unlock_bh(&peer->conn_lock); _leave(" = NULL"); return NULL; found: rxrpc_get_connection(conn); - read_unlock_bh(&trans->conn_lock); + read_unlock_bh(&peer->conn_lock); _leave(" = %p", conn); return conn; } @@ -576,8 +581,9 @@ static void rxrpc_destroy_connection(struct rxrpc_connection *conn) conn->security->clear(conn); key_put(conn->params.key); key_put(conn->server_key); + rxrpc_put_peer(conn->params.peer); + rxrpc_put_local(conn->params.local); - rxrpc_put_transport(conn->trans); kfree(conn); _leave(""); } @@ -588,6 +594,7 @@ static void rxrpc_destroy_connection(struct rxrpc_connection *conn) static void rxrpc_connection_reaper(struct work_struct *work) { struct rxrpc_connection *conn, *_p; + struct rxrpc_peer *peer; unsigned long now, earliest, reap_time; LIST_HEAD(graveyard); @@ -624,7 +631,8 @@ static void rxrpc_connection_reaper(struct work_struct *work) spin_unlock(&local->client_conns_lock); } else { - write_lock_bh(&conn->trans->conn_lock); + peer = conn->params.peer; + write_lock_bh(&peer->conn_lock); reap_time = conn->put_time + rxrpc_connection_expiry; if (atomic_read(&conn->usage) > 0) { @@ -632,12 +640,12 @@ static void rxrpc_connection_reaper(struct work_struct *work) } else if (reap_time <= now) { list_move_tail(&conn->link, &graveyard); rb_erase(&conn->service_node, - &conn->trans->server_conns); + &peer->service_conns); } else if (reap_time < earliest) { earliest = reap_time; } - write_unlock_bh(&conn->trans->conn_lock); + write_unlock_bh(&peer->conn_lock); } } write_unlock(&rxrpc_connection_lock); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 799aec18aa7b..f4bd57b77b93 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -631,7 +631,6 @@ static struct rxrpc_connection *rxrpc_conn_from_local(struct rxrpc_local *local, struct sk_buff *skb) { struct rxrpc_peer *peer; - struct rxrpc_transport *trans; struct rxrpc_connection *conn; struct sockaddr_rxrpc srx; @@ -641,13 +640,8 @@ static struct rxrpc_connection *rxrpc_conn_from_local(struct rxrpc_local *local, if (!peer) goto cant_find_peer; - trans = rxrpc_find_transport(local, peer); + conn = rxrpc_find_connection(local, peer, skb); rcu_read_unlock(); - if (!trans) - goto cant_find_conn; - - conn = rxrpc_find_connection(trans, skb); - rxrpc_put_transport(trans); if (!conn) goto cant_find_conn; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 8e24939aeac8..f4bda06b7d2d 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -140,10 +140,8 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, unsigned long user_call_ID, bool exclusive) { struct rxrpc_conn_parameters cp; - struct rxrpc_transport *trans; struct rxrpc_call *call; struct key *key; - long ret; DECLARE_SOCKADDR(struct sockaddr_rxrpc *, srx, msg->msg_name); @@ -162,30 +160,10 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, cp.security_level = rx->min_sec_level; cp.exclusive = rx->exclusive | exclusive; cp.service_id = srx->srx_service; - trans = rxrpc_name_to_transport(&cp, msg->msg_name, msg->msg_namelen, - GFP_KERNEL); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } - cp.peer = trans->peer; - - call = rxrpc_new_client_call(rx, &cp, trans, srx, user_call_ID, - GFP_KERNEL); - rxrpc_put_transport(trans); - if (IS_ERR(call)) { - ret = PTR_ERR(call); - goto out_trans; - } + call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, GFP_KERNEL); _leave(" = %p\n", call); return call; - -out_trans: - rxrpc_put_transport(trans); -out: - _leave(" = %ld", ret); - return ERR_PTR(ret); } /* diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 6baad708f3b1..01d4930a11f7 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -188,6 +188,8 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) INIT_HLIST_HEAD(&peer->error_targets); INIT_WORK(&peer->error_distributor, &rxrpc_peer_error_distributor); + peer->service_conns = RB_ROOT; + rwlock_init(&peer->conn_lock); spin_lock_init(&peer->lock); peer->debug_id = atomic_inc_return(&rxrpc_debug_id); } diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index a99690a8a3da..03ad08774d4e 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -90,14 +90,6 @@ static struct ctl_table rxrpc_sysctl_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = (void *)&one, }, - { - .procname = "transport_expiry", - .data = &rxrpc_transport_expiry, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&one, - }, /* Non-time values */ { diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c deleted file mode 100644 index 71947402d071..000000000000 --- a/net/rxrpc/transport.c +++ /dev/null @@ -1,265 +0,0 @@ -/* RxRPC point-to-point transport session management - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include "ar-internal.h" - -/* - * Time after last use at which transport record is cleaned up. - */ -unsigned int rxrpc_transport_expiry = 3600 * 24; - -static void rxrpc_transport_reaper(struct work_struct *work); - -static LIST_HEAD(rxrpc_transports); -static DEFINE_RWLOCK(rxrpc_transport_lock); -static DECLARE_DELAYED_WORK(rxrpc_transport_reap, rxrpc_transport_reaper); - -/* - * allocate a new transport session manager - */ -static struct rxrpc_transport *rxrpc_alloc_transport(struct rxrpc_local *local, - struct rxrpc_peer *peer, - gfp_t gfp) -{ - struct rxrpc_transport *trans; - - _enter(""); - - trans = kzalloc(sizeof(struct rxrpc_transport), gfp); - if (trans) { - trans->local = local; - trans->peer = peer; - INIT_LIST_HEAD(&trans->link); - trans->server_conns = RB_ROOT; - rwlock_init(&trans->conn_lock); - atomic_set(&trans->usage, 1); - trans->debug_id = atomic_inc_return(&rxrpc_debug_id); - } - - _leave(" = %p", trans); - return trans; -} - -/* - * obtain a transport session for the nominated endpoints - */ -struct rxrpc_transport *rxrpc_get_transport(struct rxrpc_local *local, - struct rxrpc_peer *peer, - gfp_t gfp) -{ - struct rxrpc_transport *trans, *candidate; - const char *new = "old"; - int usage; - - _enter("{%pI4+%hu},{%pI4+%hu},", - &local->srx.transport.sin.sin_addr, - ntohs(local->srx.transport.sin.sin_port), - &peer->srx.transport.sin.sin_addr, - ntohs(peer->srx.transport.sin.sin_port)); - - /* search the transport list first */ - read_lock_bh(&rxrpc_transport_lock); - list_for_each_entry(trans, &rxrpc_transports, link) { - if (trans->local == local && trans->peer == peer) - goto found_extant_transport; - } - read_unlock_bh(&rxrpc_transport_lock); - - /* not yet present - create a candidate for a new record and then - * redo the search */ - candidate = rxrpc_alloc_transport(local, peer, gfp); - if (!candidate) { - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); - } - - write_lock_bh(&rxrpc_transport_lock); - - list_for_each_entry(trans, &rxrpc_transports, link) { - if (trans->local == local && trans->peer == peer) - goto found_extant_second; - } - - /* we can now add the new candidate to the list */ - trans = candidate; - candidate = NULL; - usage = atomic_read(&trans->usage); - - rxrpc_get_local(trans->local); - rxrpc_get_peer(trans->peer); - list_add_tail(&trans->link, &rxrpc_transports); - write_unlock_bh(&rxrpc_transport_lock); - new = "new"; - -success: - _net("TRANSPORT %s %d local %d -> peer %d", - new, - trans->debug_id, - trans->local->debug_id, - trans->peer->debug_id); - - _leave(" = %p {u=%d}", trans, usage); - return trans; - - /* we found the transport in the list immediately */ -found_extant_transport: - usage = atomic_inc_return(&trans->usage); - read_unlock_bh(&rxrpc_transport_lock); - goto success; - - /* we found the transport on the second time through the list */ -found_extant_second: - usage = atomic_inc_return(&trans->usage); - write_unlock_bh(&rxrpc_transport_lock); - kfree(candidate); - goto success; -} - -/* - * find the transport connecting two endpoints - */ -struct rxrpc_transport *rxrpc_find_transport(struct rxrpc_local *local, - struct rxrpc_peer *peer) -{ - struct rxrpc_transport *trans; - - _enter("{%pI4+%hu},{%pI4+%hu},", - &local->srx.transport.sin.sin_addr, - ntohs(local->srx.transport.sin.sin_port), - &peer->srx.transport.sin.sin_addr, - ntohs(peer->srx.transport.sin.sin_port)); - - /* search the transport list */ - read_lock_bh(&rxrpc_transport_lock); - - list_for_each_entry(trans, &rxrpc_transports, link) { - if (trans->local == local && trans->peer == peer) - goto found_extant_transport; - } - - read_unlock_bh(&rxrpc_transport_lock); - _leave(" = NULL"); - return NULL; - -found_extant_transport: - atomic_inc(&trans->usage); - read_unlock_bh(&rxrpc_transport_lock); - _leave(" = %p", trans); - return trans; -} - -/* - * release a transport session - */ -void rxrpc_put_transport(struct rxrpc_transport *trans) -{ - _enter("%p{u=%d}", trans, atomic_read(&trans->usage)); - - ASSERTCMP(atomic_read(&trans->usage), >, 0); - - trans->put_time = ktime_get_seconds(); - if (unlikely(atomic_dec_and_test(&trans->usage))) { - _debug("zombie"); - /* let the reaper determine the timeout to avoid a race with - * overextending the timeout if the reaper is running at the - * same time */ - rxrpc_queue_delayed_work(&rxrpc_transport_reap, 0); - } - _leave(""); -} - -/* - * clean up a transport session - */ -static void rxrpc_cleanup_transport(struct rxrpc_transport *trans) -{ - _net("DESTROY TRANS %d", trans->debug_id); - - rxrpc_put_local(trans->local); - rxrpc_put_peer(trans->peer); - kfree(trans); -} - -/* - * reap dead transports that have passed their expiry date - */ -static void rxrpc_transport_reaper(struct work_struct *work) -{ - struct rxrpc_transport *trans, *_p; - unsigned long now, earliest, reap_time; - - LIST_HEAD(graveyard); - - _enter(""); - - now = ktime_get_seconds(); - earliest = ULONG_MAX; - - /* extract all the transports that have been dead too long */ - write_lock_bh(&rxrpc_transport_lock); - list_for_each_entry_safe(trans, _p, &rxrpc_transports, link) { - _debug("reap TRANS %d { u=%d t=%ld }", - trans->debug_id, atomic_read(&trans->usage), - (long) now - (long) trans->put_time); - - if (likely(atomic_read(&trans->usage) > 0)) - continue; - - reap_time = trans->put_time + rxrpc_transport_expiry; - if (reap_time <= now) - list_move_tail(&trans->link, &graveyard); - else if (reap_time < earliest) - earliest = reap_time; - } - write_unlock_bh(&rxrpc_transport_lock); - - if (earliest != ULONG_MAX) { - _debug("reschedule reaper %ld", (long) earliest - now); - ASSERTCMP(earliest, >, now); - rxrpc_queue_delayed_work(&rxrpc_transport_reap, - (earliest - now) * HZ); - } - - /* then destroy all those pulled out */ - while (!list_empty(&graveyard)) { - trans = list_entry(graveyard.next, struct rxrpc_transport, - link); - list_del_init(&trans->link); - - ASSERTCMP(atomic_read(&trans->usage), ==, 0); - rxrpc_cleanup_transport(trans); - } - - _leave(""); -} - -/* - * preemptively destroy all the transport session records rather than waiting - * for them to time out - */ -void __exit rxrpc_destroy_all_transports(void) -{ - _enter(""); - - rxrpc_transport_expiry = 0; - cancel_delayed_work(&rxrpc_transport_reap); - rxrpc_queue_delayed_work(&rxrpc_transport_reap, 0); - - _leave(""); -} -- cgit v1.2.3 From b95e5928fcc76d156352570858abdea7b2628efd Mon Sep 17 00:00:00 2001 From: William Tu Date: Mon, 20 Jun 2016 07:26:17 -0700 Subject: openvswitch: Add packet len info to upcall. The commit f2a4d086ed4c ("openvswitch: Add packet truncation support.") introduces packet truncation before sending to userspace upcall receiver. This patch passes up the skb->len before truncation so that the upcall receiver knows the original packet size. Potentially this will be used by sFlow, where OVS translates sFlow config header=N to a sample action, truncating packet to N byte in kernel datapath. Thus, only N bytes instead of full-packet size is copied from kernel to userspace, saving the kernel-to-userspace bandwidth. Signed-off-by: William Tu Cc: Pravin Shelar Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 673934295333..524c0fd3078e 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -387,7 +387,8 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, { size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */ - + nla_total_size(ovs_key_attr_size()); /* OVS_PACKET_ATTR_KEY */ + + nla_total_size(ovs_key_attr_size()) /* OVS_PACKET_ATTR_KEY */ + + nla_total_size(sizeof(unsigned int)); /* OVS_PACKET_ATTR_LEN */ /* OVS_PACKET_ATTR_USERDATA */ if (upcall_info->userdata) @@ -514,6 +515,16 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, pad_packet(dp, user_skb); } + /* Add OVS_PACKET_ATTR_LEN when packet is truncated */ + if (cutlen > 0) { + if (nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN, + skb->len)) { + err = -ENOBUFS; + goto out; + } + pad_packet(dp, user_skb); + } + /* Only reserve room for attribute header, packet data is added * in skb_zerocopy() */ if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) { -- cgit v1.2.3 From 2781ff5c8fc7722e97503f96686bf6d7093069a9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 20 Jun 2016 17:51:52 +0200 Subject: can: only call can_stat_update with procfs The change to leave out procfs support in CAN when CONFIG_PROC_FS is not set was incomplete and leads to a build error: net/built-in.o: In function `can_init': :(.init.text+0x9858): undefined reference to `can_stat_update' ERROR: "can_stat_update" [net/can/can.ko] undefined! This tries a better approach, encapsulating all of the calls within IS_ENABLED(), so we also leave out the timer function from the object file. Signed-off-by: Arnd Bergmann Fixes: a20fadf85312 ("can: build proc support only if CONFIG_PROC_FS is activated") Signed-off-by: Marc Kleine-Budde --- net/can/af_can.c | 22 ++++++++++++---------- net/can/af_can.h | 11 ----------- 2 files changed, 12 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/can/af_can.c b/net/can/af_can.c index 166d436196c1..1108079d934f 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -911,14 +911,14 @@ static __init int can_init(void) if (!rcv_cache) return -ENOMEM; - if (stats_timer) { + if (IS_ENABLED(CONFIG_PROC_FS)) { + if (stats_timer) { /* the statistics are updated every second (timer triggered) */ - setup_timer(&can_stattimer, can_stat_update, 0); - mod_timer(&can_stattimer, round_jiffies(jiffies + HZ)); - } else - can_stattimer.function = NULL; - - can_init_proc(); + setup_timer(&can_stattimer, can_stat_update, 0); + mod_timer(&can_stattimer, round_jiffies(jiffies + HZ)); + } + can_init_proc(); + } /* protocol register */ sock_register(&can_family_ops); @@ -933,10 +933,12 @@ static __exit void can_exit(void) { struct net_device *dev; - if (stats_timer) - del_timer_sync(&can_stattimer); + if (IS_ENABLED(CONFIG_PROC_FS)) { + if (stats_timer) + del_timer_sync(&can_stattimer); - can_remove_proc(); + can_remove_proc(); + } /* protocol unregister */ dev_remove_pack(&canfd_packet); diff --git a/net/can/af_can.h b/net/can/af_can.h index 38a79ff20022..fca0fe9fc45a 100644 --- a/net/can/af_can.h +++ b/net/can/af_can.h @@ -113,19 +113,8 @@ struct s_pstats { extern struct dev_rcv_lists can_rx_alldev_list; /* function prototypes for the CAN networklayer procfs (proc.c) */ -#ifdef CONFIG_PROC_FS void can_init_proc(void); void can_remove_proc(void); -#else -static inline void can_init_proc(void) -{ - pr_info("can: Can't create /proc/net/can. CONFIG_PROC_FS missing!\n"); -} - -static inline void can_remove_proc(void) -{ -} -#endif void can_stat_update(unsigned long data); /* structures and variables from af_can.c needed in proc.c for reading */ -- cgit v1.2.3 From a6d0bae14858a43ab9d76d6332d7c3f2a618a6a2 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 2 Jun 2016 10:59:56 +0800 Subject: netfilter: x_tables: fix possible ZERO_SIZE_PTR pointer dereferencing error. Since we cannot make sure that the 'hook_mask' will always be none zero here. If it equals to zero, the num_hooks will be zero too, and then kmalloc() will return ZERO_SIZE_PTR, which is (void *)16. Then the following error check will fails: ops = kmalloc(sizeof(*ops) * num_hooks, GFP_KERNEL); if (ops == NULL) return ERR_PTR(-ENOMEM); So this patch will fix this with just doing the zero check before kmalloc() is called. Maybe the case above will never happen here, but in theory. Signed-off-by: Xiubo Li Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index c69c892231d7..8aff34e8737c 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1460,6 +1460,9 @@ xt_hook_ops_alloc(const struct xt_table *table, nf_hookfn *fn) uint8_t hooknum; struct nf_hook_ops *ops; + if (!num_hooks) + return ERR_PTR(-EINVAL); + ops = kmalloc(sizeof(*ops) * num_hooks, GFP_KERNEL); if (ops == NULL) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From f3bb53338e0965c3084c185020e821ac49015832 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Wed, 8 Jun 2016 20:43:17 +0800 Subject: netfilter: nf_log: handle NFPROTO_INET properly in nf_logger_[find_get|put] When we request NFPROTO_INET, it means both NFPROTO_IPV4 and NFPROTO_IPV6. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_log.c | 20 ++++++++++++++++++++ net/netfilter/nft_log.c | 21 +-------------------- 2 files changed, 21 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index a5d41dfa9f05..73b845d3cd33 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -159,6 +159,20 @@ int nf_logger_find_get(int pf, enum nf_log_type type) struct nf_logger *logger; int ret = -ENOENT; + if (pf == NFPROTO_INET) { + ret = nf_logger_find_get(NFPROTO_IPV4, type); + if (ret < 0) + return ret; + + ret = nf_logger_find_get(NFPROTO_IPV6, type); + if (ret < 0) { + nf_logger_put(NFPROTO_IPV4, type); + return ret; + } + + return 0; + } + if (rcu_access_pointer(loggers[pf][type]) == NULL) request_module("nf-logger-%u-%u", pf, type); @@ -179,6 +193,12 @@ void nf_logger_put(int pf, enum nf_log_type type) { struct nf_logger *logger; + if (pf == NFPROTO_INET) { + nf_logger_put(NFPROTO_IPV4, type); + nf_logger_put(NFPROTO_IPV6, type); + return; + } + BUG_ON(loggers[pf][type] == NULL); rcu_read_lock(); diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 319c22b4bca2..713d66837705 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -52,7 +52,6 @@ static int nft_log_init(const struct nft_ctx *ctx, struct nft_log *priv = nft_expr_priv(expr); struct nf_loginfo *li = &priv->loginfo; const struct nlattr *nla; - int ret; nla = tb[NFTA_LOG_PREFIX]; if (nla != NULL) { @@ -97,19 +96,6 @@ static int nft_log_init(const struct nft_ctx *ctx, break; } - if (ctx->afi->family == NFPROTO_INET) { - ret = nf_logger_find_get(NFPROTO_IPV4, li->type); - if (ret < 0) - return ret; - - ret = nf_logger_find_get(NFPROTO_IPV6, li->type); - if (ret < 0) { - nf_logger_put(NFPROTO_IPV4, li->type); - return ret; - } - return 0; - } - return nf_logger_find_get(ctx->afi->family, li->type); } @@ -122,12 +108,7 @@ static void nft_log_destroy(const struct nft_ctx *ctx, if (priv->prefix != nft_log_null_prefix) kfree(priv->prefix); - if (ctx->afi->family == NFPROTO_INET) { - nf_logger_put(NFPROTO_IPV4, li->type); - nf_logger_put(NFPROTO_IPV6, li->type); - } else { - nf_logger_put(ctx->afi->family, li->type); - } + nf_logger_put(ctx->afi->family, li->type); } static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr) -- cgit v1.2.3 From 36f959c491abc7e0acf94b631a6d7a3e2e3699b0 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Wed, 8 Jun 2016 20:43:19 +0800 Subject: netfilter: xt_TRACE: add explicitly nf_logger_find_get call Consider such situation, if nf_log_ipv4 kernel module is not installed, and the user add a following iptables rule: # iptables -t raw -I PREROUTING -j TRACE There will be no trace log generated until the user install nf_log_ipv4 module manully. So we should add request related nf_log module appropriately here. Signed-off-by: Liping Zhang Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_TRACE.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c index df48967af382..858d189a1303 100644 --- a/net/netfilter/xt_TRACE.c +++ b/net/netfilter/xt_TRACE.c @@ -4,12 +4,23 @@ #include #include +#include MODULE_DESCRIPTION("Xtables: packet flow tracing"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_TRACE"); MODULE_ALIAS("ip6t_TRACE"); +static int trace_tg_check(const struct xt_tgchk_param *par) +{ + return nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); +} + +static void trace_tg_destroy(const struct xt_tgdtor_param *par) +{ + nf_logger_put(par->family, NF_LOG_TYPE_LOG); +} + static unsigned int trace_tg(struct sk_buff *skb, const struct xt_action_param *par) { @@ -18,12 +29,14 @@ trace_tg(struct sk_buff *skb, const struct xt_action_param *par) } static struct xt_target trace_tg_reg __read_mostly = { - .name = "TRACE", - .revision = 0, - .family = NFPROTO_UNSPEC, - .table = "raw", - .target = trace_tg, - .me = THIS_MODULE, + .name = "TRACE", + .revision = 0, + .family = NFPROTO_UNSPEC, + .table = "raw", + .target = trace_tg, + .checkentry = trace_tg_check, + .destroy = trace_tg_destroy, + .me = THIS_MODULE, }; static int __init trace_tg_init(void) -- cgit v1.2.3 From 5a75cdebabc4576ca31f497a9272ac558421b119 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 10 Jun 2016 22:25:22 +0200 Subject: netfilter: conntrack: align nf_conn on cacheline boundary increases struct size by 32 bytes (288 -> 320), but it is the right thing, else any attempt to (re-)arrange nf_conn members by cacheline won't work. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index db2312eeb2a4..2903bb43547c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1731,7 +1731,7 @@ int nf_conntrack_init_start(void) nf_conntrack_cachep = kmem_cache_create("nf_conntrack", sizeof(struct nf_conn), 0, - SLAB_DESTROY_BY_RCU, NULL); + SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); if (!nf_conntrack_cachep) goto err_cachep; -- cgit v1.2.3 From 7e53e7f8ca24e01292d114373f35b2999301d879 Mon Sep 17 00:00:00 2001 From: Shivani Bhardwaj Date: Sun, 12 Jun 2016 00:26:10 +0530 Subject: netfilter: nf_log: Remove NULL check If 'logger' was NULL, there would be a direct jump to the label 'out', since it has already been checked for NULL, remove this unnecessary check. Signed-off-by: Shivani Bhardwaj Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 73b845d3cd33..18e325ce6542 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -181,7 +181,7 @@ int nf_logger_find_get(int pf, enum nf_log_type type) if (logger == NULL) goto out; - if (logger && try_module_get(logger->me)) + if (try_module_get(logger->me)) ret = 0; out: rcu_read_unlock(); -- cgit v1.2.3 From 6c8dee9842461e6ee6eb46081478999b3d5cb297 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 11 Jun 2016 21:57:35 +0200 Subject: netfilter: move zone info into struct nf_conn Curently we store zone information as a conntrack extension. This has one drawback: for every lookup we need to fetch the zone data from the extension area. This change place the zone data directly into the main conntrack object structure and then removes the zone conntrack extension. The zone data is just 4 bytes, it fits into a padding hole before the tuplehash info, so we do not even increase the nf_conn structure size. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 33 ++------------------------------- 1 file changed, 2 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 2903bb43547c..a459176c3253 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -327,16 +327,10 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, tmpl->status = IPS_TEMPLATE; write_pnet(&tmpl->ct_net, net); - - if (nf_ct_zone_add(tmpl, flags, zone) < 0) - goto out_free; - + nf_ct_zone_add(tmpl, zone); atomic_set(&tmpl->ct_general.use, 0); return tmpl; -out_free: - kfree(tmpl); - return NULL; } EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); @@ -929,16 +923,13 @@ __nf_conntrack_alloc(struct net *net, offsetof(struct nf_conn, proto) - offsetof(struct nf_conn, __nfct_init_offset[0])); - if (zone && nf_ct_zone_add(ct, GFP_ATOMIC, zone) < 0) - goto out_free; + nf_ct_zone_add(ct, zone); /* Because we use RCU lookups, we set ct_general.use to zero before * this is inserted in any list. */ atomic_set(&ct->ct_general.use, 0); return ct; -out_free: - kmem_cache_free(nf_conntrack_cachep, ct); out: atomic_dec(&net->ct.count); return ERR_PTR(-ENOMEM); @@ -1342,14 +1333,6 @@ bool __nf_ct_kill_acct(struct nf_conn *ct, } EXPORT_SYMBOL_GPL(__nf_ct_kill_acct); -#ifdef CONFIG_NF_CONNTRACK_ZONES -static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = { - .len = sizeof(struct nf_conntrack_zone), - .align = __alignof__(struct nf_conntrack_zone), - .id = NF_CT_EXT_ZONE, -}; -#endif - #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include @@ -1532,9 +1515,6 @@ void nf_conntrack_cleanup_end(void) nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); -#ifdef CONFIG_NF_CONNTRACK_ZONES - nf_ct_extend_unregister(&nf_ct_zone_extend); -#endif nf_conntrack_proto_fini(); nf_conntrack_seqadj_fini(); nf_conntrack_labels_fini(); @@ -1771,11 +1751,6 @@ int nf_conntrack_init_start(void) if (ret < 0) goto err_seqadj; -#ifdef CONFIG_NF_CONNTRACK_ZONES - ret = nf_ct_extend_register(&nf_ct_zone_extend); - if (ret < 0) - goto err_extend; -#endif ret = nf_conntrack_proto_init(); if (ret < 0) goto err_proto; @@ -1791,10 +1766,6 @@ int nf_conntrack_init_start(void) return 0; err_proto: -#ifdef CONFIG_NF_CONNTRACK_ZONES - nf_ct_extend_unregister(&nf_ct_zone_extend); -err_extend: -#endif nf_conntrack_seqadj_fini(); err_seqadj: nf_conntrack_labels_fini(); -- cgit v1.2.3 From 9847371a84b0be330f4bc4aaa98904101ee8573d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 14 Jun 2016 15:14:12 -0700 Subject: netfilter: Allow xt_owner in any user namespace Making this work is a little tricky as it really isn't kosher to change the xt_owner_match_info in a check function. Without changing xt_owner_match_info we need to know the user namespace the uids and gids are specified in. In the common case net->user_ns == current_user_ns(). Verify net->user_ns == current_user_ns() in owner_check so we can later assume it in owner_mt. In owner_check also verify that all of the uids and gids specified are in net->user_ns and that the expected min/max relationship exists between the uids and gids in xt_owner_match_info. In owner_mt get the network namespace from the outgoing socket, as this must be the same network namespace as the netfilter rules, and use that network namespace to find the user namespace the uids and gids in xt_match_owner_info are encoded in. Then convert from their encoded from into the kernel internal format for uids and gids and perform the owner match. Similar to ping_group_range, this code does not try to detect noncontiguous UID/GID ranges. Signed-off-by: "Eric W. Biederman" Signed-off-by: Kevin Cernekee Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_owner.c | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index 1302b475abcb..a20e731b5b6c 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -21,11 +21,39 @@ static int owner_check(const struct xt_mtchk_param *par) { struct xt_owner_match_info *info = par->matchinfo; + struct net *net = par->net; - /* For now only allow adding matches from the initial user namespace */ + /* Only allow the common case where the userns of the writer + * matches the userns of the network namespace. + */ if ((info->match & (XT_OWNER_UID|XT_OWNER_GID)) && - (current_user_ns() != &init_user_ns)) + (current_user_ns() != net->user_ns)) return -EINVAL; + + /* Ensure the uids are valid */ + if (info->match & XT_OWNER_UID) { + kuid_t uid_min = make_kuid(net->user_ns, info->uid_min); + kuid_t uid_max = make_kuid(net->user_ns, info->uid_max); + + if (!uid_valid(uid_min) || !uid_valid(uid_max) || + (info->uid_max < info->uid_min) || + uid_lt(uid_max, uid_min)) { + return -EINVAL; + } + } + + /* Ensure the gids are valid */ + if (info->match & XT_OWNER_GID) { + kgid_t gid_min = make_kgid(net->user_ns, info->gid_min); + kgid_t gid_max = make_kgid(net->user_ns, info->gid_max); + + if (!gid_valid(gid_min) || !gid_valid(gid_max) || + (info->gid_max < info->gid_min) || + gid_lt(gid_max, gid_min)) { + return -EINVAL; + } + } + return 0; } @@ -35,6 +63,7 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) const struct xt_owner_match_info *info = par->matchinfo; const struct file *filp; struct sock *sk = skb_to_full_sk(skb); + struct net *net = par->net; if (sk == NULL || sk->sk_socket == NULL) return (info->match ^ info->invert) == 0; @@ -51,8 +80,8 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) (XT_OWNER_UID | XT_OWNER_GID)) == 0; if (info->match & XT_OWNER_UID) { - kuid_t uid_min = make_kuid(&init_user_ns, info->uid_min); - kuid_t uid_max = make_kuid(&init_user_ns, info->uid_max); + kuid_t uid_min = make_kuid(net->user_ns, info->uid_min); + kuid_t uid_max = make_kuid(net->user_ns, info->uid_max); if ((uid_gte(filp->f_cred->fsuid, uid_min) && uid_lte(filp->f_cred->fsuid, uid_max)) ^ !(info->invert & XT_OWNER_UID)) @@ -60,8 +89,8 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) } if (info->match & XT_OWNER_GID) { - kgid_t gid_min = make_kgid(&init_user_ns, info->gid_min); - kgid_t gid_max = make_kgid(&init_user_ns, info->gid_max); + kgid_t gid_min = make_kgid(net->user_ns, info->gid_min); + kgid_t gid_max = make_kgid(net->user_ns, info->gid_max); if ((gid_gte(filp->f_cred->fsgid, gid_min) && gid_lte(filp->f_cred->fsgid, gid_max)) ^ !(info->invert & XT_OWNER_GID)) -- cgit v1.2.3 From e1dbbc5907b53d8d53c009b3cb3dd2a0366ce45c Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 20 Jun 2016 21:26:28 +0800 Subject: netfilter: nf_reject_ipv4: don't send tcp RST if the packet is non-TCP In iptables, if the user add a rule to send tcp RST and specify the non-TCP protocol, such as UDP, kernel will reject this request. But in nftables, this validity check only occurs in nft tool, i.e. only in userspace. This means that user can add such a rule like follows via nfnetlink: "nft add rule filter forward ip protocol udp reject with tcp reset" This will generate some confusing tcp RST packets. So we should send tcp RST only when it is TCP packet. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_reject_ipv4.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index b6ea57ec5e14..fd8220213afc 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -24,6 +24,9 @@ const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) return NULL; + if (ip_hdr(oldskb)->protocol != IPPROTO_TCP) + return NULL; + oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb), sizeof(struct tcphdr), _oth); if (oth == NULL) -- cgit v1.2.3 From 7643507fe8b5bd8ab7522f6a81058cc1209d2585 Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Tue, 21 Jun 2016 14:58:46 -0400 Subject: netfilter: xt_NFLOG: nflog-range does not truncate packets li->u.ulog.copy_len is currently ignored by the kernel, we should truncate the packet to either li->u.ulog.copy_len (if set) or copy_range before sending it to userspace. 0 is a valid input for copy_len, so add a new flag to indicate whether this was option was specified by the user or not. Add two flags to indicate whether nflog-size/copy_len was set or not. XT_NFLOG_F_COPY_LEN is for XT_NFLOG and NFLOG_F_COPY_LEN for nfnetlink_log On the userspace side, this was initially represented by the option nflog-range, this will be replaced by --nflog-size now. --nflog-range would still exist but does not do anything. Reported-by: Joe Dollard Reviewed-by: Josh Hunt Signed-off-by: Vishwanath Pai Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_log.c | 9 ++++++--- net/netfilter/xt_NFLOG.c | 3 +++ 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 11f81c8385fc..cbcfdfb586a6 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -700,10 +700,13 @@ nfulnl_log_packet(struct net *net, break; case NFULNL_COPY_PACKET: - if (inst->copy_range > skb->len) + data_len = inst->copy_range; + if ((li->u.ulog.flags & NF_LOG_F_COPY_LEN) && + (li->u.ulog.copy_len < data_len)) + data_len = li->u.ulog.copy_len; + + if (data_len > skb->len) data_len = skb->len; - else - data_len = inst->copy_range; size += nla_total_size(data_len); break; diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index a1fa2c800cb9..018eed7e1ff1 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -33,6 +33,9 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) li.u.ulog.group = info->group; li.u.ulog.qthreshold = info->threshold; + if (info->flags & XT_NFLOG_F_COPY_LEN) + li.u.ulog.flags |= NF_LOG_F_COPY_LEN; + nfulnl_log_packet(net, par->family, par->hooknum, skb, par->in, par->out, &li, info->prefix); return XT_CONTINUE; -- cgit v1.2.3 From 889f7ee7c6e84251215d43cbc856ea116c72d3f2 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 12 Jun 2016 18:07:07 +0200 Subject: netfilter: nf_tables: add generic macros to check for generation mask Thus, we can reuse these to check the genmask of any object type, not only rules. This is required now that tables, chain and sets will get a generation mask field too in follow up patches. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 46 ++++++++----------------------------------- 1 file changed, 8 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4d292b933b5c..d9f0f0797dec 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -234,42 +234,12 @@ static int nft_delchain(struct nft_ctx *ctx) return err; } -static inline bool -nft_rule_is_active(struct net *net, const struct nft_rule *rule) -{ - return (rule->genmask & nft_genmask_cur(net)) == 0; -} - -static inline int -nft_rule_is_active_next(struct net *net, const struct nft_rule *rule) -{ - return (rule->genmask & nft_genmask_next(net)) == 0; -} - -static inline void -nft_rule_activate_next(struct net *net, struct nft_rule *rule) -{ - /* Now inactive, will be active in the future */ - rule->genmask = nft_genmask_cur(net); -} - -static inline void -nft_rule_deactivate_next(struct net *net, struct nft_rule *rule) -{ - rule->genmask = nft_genmask_next(net); -} - -static inline void nft_rule_clear(struct net *net, struct nft_rule *rule) -{ - rule->genmask &= ~nft_genmask_next(net); -} - static int nf_tables_delrule_deactivate(struct nft_ctx *ctx, struct nft_rule *rule) { /* You cannot delete the same rule twice */ - if (nft_rule_is_active_next(ctx->net, rule)) { - nft_rule_deactivate_next(ctx->net, rule); + if (nft_is_active_next(ctx->net, rule)) { + nft_deactivate_next(ctx->net, rule); ctx->chain->use--; return 0; } @@ -1898,7 +1868,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb, list_for_each_entry_rcu(table, &afi->tables, list) { list_for_each_entry_rcu(chain, &table->chains, list) { list_for_each_entry_rcu(rule, &chain->rules, list) { - if (!nft_rule_is_active(net, rule)) + if (!nft_is_active(net, rule)) goto cont; if (idx < s_idx) goto cont; @@ -2102,7 +2072,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, if (rule == NULL) goto err1; - nft_rule_activate_next(net, rule); + nft_activate_next(net, rule); rule->handle = handle; rule->dlen = size; @@ -2124,14 +2094,14 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, } if (nlh->nlmsg_flags & NLM_F_REPLACE) { - if (nft_rule_is_active_next(net, old_rule)) { + if (nft_is_active_next(net, old_rule)) { trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE, old_rule); if (trans == NULL) { err = -ENOMEM; goto err2; } - nft_rule_deactivate_next(net, old_rule); + nft_deactivate_next(net, old_rule); chain->use--; list_add_tail_rcu(&rule->list, &old_rule->list); } else { @@ -3980,7 +3950,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) trans->ctx.afi->nops); break; case NFT_MSG_NEWRULE: - nft_rule_clear(trans->ctx.net, nft_trans_rule(trans)); + nft_clear(trans->ctx.net, nft_trans_rule(trans)); nf_tables_rule_notify(&trans->ctx, nft_trans_rule(trans), NFT_MSG_NEWRULE); @@ -4116,7 +4086,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) break; case NFT_MSG_DELRULE: trans->ctx.chain->use++; - nft_rule_clear(trans->ctx.net, nft_trans_rule(trans)); + nft_clear(trans->ctx.net, nft_trans_rule(trans)); nft_trans_destroy(trans); break; case NFT_MSG_NEWSET: -- cgit v1.2.3 From f2a6d766765d2794e26e25655d4ffcfe29c3ec2f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 14 Jun 2016 17:29:18 +0200 Subject: netfilter: nf_tables: add generation mask to tables This patch addresses two problems: 1) The netlink dump is inconsistent when interfering with an ongoing transaction update for several reasons: 1.a) We don't honor the internal NFT_TABLE_INACTIVE flag, and we should be skipping these inactive objects in the dump. 1.b) We perform speculative deletion during the preparation phase, that may result in skipping active objects. 1.c) The listing order changes, which generates noise when tracking incremental ruleset update via tools like git or our own testsuite. 2) We don't allow to add and to update the object in the same batch, eg. add table x; add table x { flags dormant\; }. In order to resolve these problems: 1) If the user requests a deletion, the object becomes inactive in the next generation. Then, ignore objects that scheduled to be deleted from the lookup path, as they will be effectively removed in the next generation. 2) From the get/dump path, if the object is not currently active, we skip it. 3) Support 'add X -> update X' sequence from a transaction. After this update, we obtain a consistent list as long as we stay in the same generation. The userspace side can detect interferences through the generation counter so it can restart the dumping. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 101 ++++++++++++++++++++++++------------------ 1 file changed, 57 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d9f0f0797dec..a4a77d60e631 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -175,9 +175,6 @@ static void nf_tables_unregister_hooks(const struct nft_table *table, nft_unregister_basechain(nft_base_chain(chain), hook_nops); } -/* Internal table flags */ -#define NFT_TABLE_INACTIVE (1 << 15) - static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) { struct nft_trans *trans; @@ -187,7 +184,7 @@ static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) return -ENOMEM; if (msg_type == NFT_MSG_NEWTABLE) - ctx->table->flags |= NFT_TABLE_INACTIVE; + nft_activate_next(ctx->net, ctx->table); list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; @@ -201,7 +198,7 @@ static int nft_deltable(struct nft_ctx *ctx) if (err < 0) return err; - list_del_rcu(&ctx->table->list); + nft_deactivate_next(ctx->net, ctx->table); return err; } @@ -334,26 +331,29 @@ static int nft_delset(struct nft_ctx *ctx, struct nft_set *set) */ static struct nft_table *nft_table_lookup(const struct nft_af_info *afi, - const struct nlattr *nla) + const struct nlattr *nla, + u8 genmask) { struct nft_table *table; list_for_each_entry(table, &afi->tables, list) { - if (!nla_strcmp(nla, table->name)) + if (!nla_strcmp(nla, table->name) && + nft_active_genmask(table, genmask)) return table; } return NULL; } static struct nft_table *nf_tables_table_lookup(const struct nft_af_info *afi, - const struct nlattr *nla) + const struct nlattr *nla, + u8 genmask) { struct nft_table *table; if (nla == NULL) return ERR_PTR(-EINVAL); - table = nft_table_lookup(afi, nla); + table = nft_table_lookup(afi, nla, genmask); if (table != NULL) return table; @@ -494,6 +494,8 @@ static int nf_tables_dump_tables(struct sk_buff *skb, if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); + if (!nft_is_active(net, table)) + continue; if (nf_tables_fill_table_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, @@ -518,6 +520,7 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_cur(net); const struct nft_af_info *afi; const struct nft_table *table; struct sk_buff *skb2; @@ -535,11 +538,9 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]); + table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], genmask); if (IS_ERR(table)) return PTR_ERR(table); - if (table->flags & NFT_TABLE_INACTIVE) - return -ENOENT; skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) @@ -648,6 +649,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_next(net); const struct nlattr *name; struct nft_af_info *afi; struct nft_table *table; @@ -661,7 +663,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, return PTR_ERR(afi); name = nla[NFTA_TABLE_NAME]; - table = nf_tables_table_lookup(afi, name); + table = nf_tables_table_lookup(afi, name, genmask); if (IS_ERR(table)) { if (PTR_ERR(table) != -ENOENT) return PTR_ERR(table); @@ -669,8 +671,6 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, } if (table != NULL) { - if (table->flags & NFT_TABLE_INACTIVE) - return -ENOENT; if (nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; if (nlh->nlmsg_flags & NLM_F_REPLACE) @@ -765,6 +765,9 @@ static int nft_flush(struct nft_ctx *ctx, int family) ctx->afi = afi; list_for_each_entry_safe(table, nt, &afi->tables, list) { + if (!nft_is_active_next(ctx->net, table)) + continue; + if (nla[NFTA_TABLE_NAME] && nla_strcmp(nla[NFTA_TABLE_NAME], table->name) != 0) continue; @@ -785,6 +788,7 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_next(net); struct nft_af_info *afi; struct nft_table *table; int family = nfmsg->nfgen_family; @@ -798,7 +802,7 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]); + table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -1074,6 +1078,7 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_cur(net); const struct nft_af_info *afi; const struct nft_table *table; const struct nft_chain *chain; @@ -1092,11 +1097,9 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], genmask); if (IS_ERR(table)) return PTR_ERR(table); - if (table->flags & NFT_TABLE_INACTIVE) - return -ENOENT; chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]); if (IS_ERR(chain)) @@ -1201,6 +1204,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, struct nft_chain *chain; struct nft_base_chain *basechain = NULL; struct nlattr *ha[NFTA_HOOK_MAX + 1]; + u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; struct net_device *dev = NULL; u8 policy = NF_ACCEPT; @@ -1217,7 +1221,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -1449,6 +1453,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_next(net); struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain; @@ -1459,7 +1464,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -1901,6 +1906,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_cur(net); const struct nft_af_info *afi; const struct nft_table *table; const struct nft_chain *chain; @@ -1920,11 +1926,9 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], genmask); if (IS_ERR(table)) return PTR_ERR(table); - if (table->flags & NFT_TABLE_INACTIVE) - return -ENOENT; chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); if (IS_ERR(chain)) @@ -1979,6 +1983,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_next(net); struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain; @@ -1999,7 +2004,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -2144,6 +2149,7 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_next(net); struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain = NULL; @@ -2155,7 +2161,7 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -2309,7 +2315,8 @@ static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + u8 genmask) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi = NULL; @@ -2325,7 +2332,8 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, if (afi == NULL) return -EAFNOSUPPORT; - table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE], + genmask); if (IS_ERR(table)) return PTR_ERR(table); } @@ -2586,6 +2594,7 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { + u8 genmask = nft_genmask_cur(net); const struct nft_set *set; struct nft_ctx ctx; struct sk_buff *skb2; @@ -2593,7 +2602,7 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, int err; /* Verify existence before starting dump */ - err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla); + err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, genmask); if (err < 0) return err; @@ -2661,6 +2670,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_next(net); const struct nft_set_ops *ops; struct nft_af_info *afi; struct nft_table *table; @@ -2758,7 +2768,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE], genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -2863,6 +2873,7 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_next(net); struct nft_set *set; struct nft_ctx ctx; int err; @@ -2872,7 +2883,7 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, if (nla[NFTA_SET_TABLE] == NULL) return -EINVAL; - err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla); + err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, genmask); if (err < 0) return err; @@ -3001,7 +3012,8 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + u8 genmask) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; @@ -3011,7 +3023,8 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE], + genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -3108,6 +3121,7 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); + u8 genmask = nft_genmask_cur(net); const struct nft_set *set; struct nft_set_dump_args args; struct nft_ctx ctx; @@ -3124,11 +3138,9 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) return err; err = nft_ctx_init_from_elemattr(&ctx, net, cb->skb, cb->nlh, - (void *)nla); + (void *)nla, genmask); if (err < 0) return err; - if (ctx.table->flags & NFT_TABLE_INACTIVE) - return -ENOENT; set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); if (IS_ERR(set)) @@ -3187,15 +3199,14 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { + u8 genmask = nft_genmask_cur(net); const struct nft_set *set; struct nft_ctx ctx; int err; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); if (err < 0) return err; - if (ctx.table->flags & NFT_TABLE_INACTIVE) - return -ENOENT; set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); if (IS_ERR(set)) @@ -3519,6 +3530,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { + u8 genmask = nft_genmask_next(net); const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; @@ -3527,7 +3539,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); if (err < 0) return err; @@ -3641,6 +3653,7 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { + u8 genmask = nft_genmask_next(net); const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; @@ -3649,7 +3662,7 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); if (err < 0) return err; @@ -3926,12 +3939,13 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; } } else { - trans->ctx.table->flags &= ~NFT_TABLE_INACTIVE; + nft_clear(net, trans->ctx.table); } nf_tables_table_notify(&trans->ctx, NFT_MSG_NEWTABLE); nft_trans_destroy(trans); break; case NFT_MSG_DELTABLE: + list_del_rcu(&trans->ctx.table->list); nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE); break; case NFT_MSG_NEWCHAIN: @@ -4057,8 +4071,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) } break; case NFT_MSG_DELTABLE: - list_add_tail_rcu(&trans->ctx.table->list, - &trans->ctx.afi->tables); + nft_clear(trans->ctx.net, trans->ctx.table); nft_trans_destroy(trans); break; case NFT_MSG_NEWCHAIN: -- cgit v1.2.3 From 664b0f8cd8c66d02d14168ee7ac6a957cc88177f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 12 Jun 2016 19:21:31 +0200 Subject: netfilter: nf_tables: add generation mask to chains Similar to ("netfilter: nf_tables: add generation mask to tables"). Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 89 ++++++++++++++++++++++++++++--------------- 1 file changed, 58 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a4a77d60e631..cae88f8a990e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -211,7 +211,7 @@ static int nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) return -ENOMEM; if (msg_type == NFT_MSG_NEWCHAIN) - ctx->chain->flags |= NFT_CHAIN_INACTIVE; + nft_activate_next(ctx->net, ctx->chain); list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; @@ -226,7 +226,7 @@ static int nft_delchain(struct nft_ctx *ctx) return err; ctx->table->use--; - list_del_rcu(&ctx->chain->list); + nft_deactivate_next(ctx->net, ctx->chain); return err; } @@ -559,13 +559,16 @@ err: return err; } -static int nf_tables_table_enable(const struct nft_af_info *afi, +static int nf_tables_table_enable(struct net *net, + const struct nft_af_info *afi, struct nft_table *table) { struct nft_chain *chain; int err, i = 0; list_for_each_entry(chain, &table->chains, list) { + if (!nft_is_active_next(net, chain)) + continue; if (!(chain->flags & NFT_BASE_CHAIN)) continue; @@ -578,6 +581,8 @@ static int nf_tables_table_enable(const struct nft_af_info *afi, return 0; err: list_for_each_entry(chain, &table->chains, list) { + if (!nft_is_active_next(net, chain)) + continue; if (!(chain->flags & NFT_BASE_CHAIN)) continue; @@ -589,12 +594,15 @@ err: return err; } -static void nf_tables_table_disable(const struct nft_af_info *afi, +static void nf_tables_table_disable(struct net *net, + const struct nft_af_info *afi, struct nft_table *table) { struct nft_chain *chain; list_for_each_entry(chain, &table->chains, list) { + if (!nft_is_active_next(net, chain)) + continue; if (chain->flags & NFT_BASE_CHAIN) nft_unregister_basechain(nft_base_chain(chain), afi->nops); @@ -627,7 +635,7 @@ static int nf_tables_updtable(struct nft_ctx *ctx) nft_trans_table_enable(trans) = false; } else if (!(flags & NFT_TABLE_F_DORMANT) && ctx->table->flags & NFT_TABLE_F_DORMANT) { - ret = nf_tables_table_enable(ctx->afi, ctx->table); + ret = nf_tables_table_enable(ctx->net, ctx->afi, ctx->table); if (ret >= 0) { ctx->table->flags &= ~NFT_TABLE_F_DORMANT; nft_trans_table_enable(trans) = true; @@ -722,6 +730,9 @@ static int nft_flush_table(struct nft_ctx *ctx) struct nft_set *set, *ns; list_for_each_entry(chain, &ctx->table->chains, list) { + if (!nft_is_active_next(ctx->net, chain)) + continue; + ctx->chain = chain; err = nft_delrule_by_chain(ctx); @@ -740,6 +751,9 @@ static int nft_flush_table(struct nft_ctx *ctx) } list_for_each_entry_safe(chain, nc, &ctx->table->chains, list) { + if (!nft_is_active_next(ctx->net, chain)) + continue; + ctx->chain = chain; err = nft_delchain(ctx); @@ -849,12 +863,14 @@ EXPORT_SYMBOL_GPL(nft_unregister_chain_type); */ static struct nft_chain * -nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle) +nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle, + u8 genmask) { struct nft_chain *chain; list_for_each_entry(chain, &table->chains, list) { - if (chain->handle == handle) + if (chain->handle == handle && + nft_active_genmask(chain, genmask)) return chain; } @@ -862,7 +878,8 @@ nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle) } static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table, - const struct nlattr *nla) + const struct nlattr *nla, + u8 genmask) { struct nft_chain *chain; @@ -870,7 +887,8 @@ static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table, return ERR_PTR(-EINVAL); list_for_each_entry(chain, &table->chains, list) { - if (!nla_strcmp(nla, chain->name)) + if (!nla_strcmp(nla, chain->name) && + nft_active_genmask(chain, genmask)) return chain; } @@ -1053,6 +1071,8 @@ static int nf_tables_dump_chains(struct sk_buff *skb, if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); + if (!nft_is_active(net, chain)) + continue; if (nf_tables_fill_chain_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, @@ -1101,11 +1121,9 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, if (IS_ERR(table)) return PTR_ERR(table); - chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]); + chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); - if (chain->flags & NFT_CHAIN_INACTIVE) - return -ENOENT; skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) @@ -1230,11 +1248,11 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, if (nla[NFTA_CHAIN_HANDLE]) { handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE])); - chain = nf_tables_chain_lookup_byhandle(table, handle); + chain = nf_tables_chain_lookup_byhandle(table, handle, genmask); if (IS_ERR(chain)) return PTR_ERR(chain); } else { - chain = nf_tables_chain_lookup(table, name); + chain = nf_tables_chain_lookup(table, name, genmask); if (IS_ERR(chain)) { if (PTR_ERR(chain) != -ENOENT) return PTR_ERR(chain); @@ -1265,16 +1283,20 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, struct nft_stats *stats = NULL; struct nft_trans *trans; - if (chain->flags & NFT_CHAIN_INACTIVE) - return -ENOENT; if (nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - if (nla[NFTA_CHAIN_HANDLE] && name && - !IS_ERR(nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]))) - return -EEXIST; + if (nla[NFTA_CHAIN_HANDLE] && name) { + struct nft_chain *chain2; + + chain2 = nf_tables_chain_lookup(table, + nla[NFTA_CHAIN_NAME], + genmask); + if (IS_ERR(chain2)) + return PTR_ERR(chain2); + } if (nla[NFTA_CHAIN_COUNTERS]) { if (!(chain->flags & NFT_BASE_CHAIN)) @@ -1468,7 +1490,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, if (IS_ERR(table)) return PTR_ERR(table); - chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]); + chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); if (chain->use > 0) @@ -1930,11 +1952,9 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, if (IS_ERR(table)) return PTR_ERR(table); - chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); + chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); - if (chain->flags & NFT_CHAIN_INACTIVE) - return -ENOENT; rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) @@ -2008,7 +2028,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, if (IS_ERR(table)) return PTR_ERR(table); - chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); + chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); @@ -2166,7 +2186,8 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, return PTR_ERR(table); if (nla[NFTA_RULE_CHAIN]) { - chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); + chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], + genmask); if (IS_ERR(chain)) return PTR_ERR(chain); } @@ -2186,6 +2207,9 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, } } else { list_for_each_entry(chain, &table->chains, list) { + if (!nft_is_active_next(net, chain)) + continue; + ctx.chain = chain; err = nft_delrule_by_chain(&ctx); if (err < 0) @@ -3934,7 +3958,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) case NFT_MSG_NEWTABLE: if (nft_trans_table_update(trans)) { if (!nft_trans_table_enable(trans)) { - nf_tables_table_disable(trans->ctx.afi, + nf_tables_table_disable(net, + trans->ctx.afi, trans->ctx.table); trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; } @@ -3952,12 +3977,13 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) if (nft_trans_chain_update(trans)) nft_chain_commit_update(trans); else - trans->ctx.chain->flags &= ~NFT_CHAIN_INACTIVE; + nft_clear(net, trans->ctx.chain); nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN); nft_trans_destroy(trans); break; case NFT_MSG_DELCHAIN: + list_del_rcu(&trans->ctx.chain->list); nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN); nf_tables_unregister_hooks(trans->ctx.table, trans->ctx.chain, @@ -4061,7 +4087,8 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) case NFT_MSG_NEWTABLE: if (nft_trans_table_update(trans)) { if (nft_trans_table_enable(trans)) { - nf_tables_table_disable(trans->ctx.afi, + nf_tables_table_disable(net, + trans->ctx.afi, trans->ctx.table); trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; } @@ -4089,8 +4116,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) break; case NFT_MSG_DELCHAIN: trans->ctx.table->use++; - list_add_tail_rcu(&trans->ctx.chain->list, - &trans->ctx.table->chains); + nft_clear(trans->ctx.net, trans->ctx.chain); nft_trans_destroy(trans); break; case NFT_MSG_NEWRULE: @@ -4413,6 +4439,7 @@ static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = { static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, struct nft_data_desc *desc, const struct nlattr *nla) { + u8 genmask = nft_genmask_next(ctx->net); struct nlattr *tb[NFTA_VERDICT_MAX + 1]; struct nft_chain *chain; int err; @@ -4445,7 +4472,7 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, if (!tb[NFTA_VERDICT_CHAIN]) return -EINVAL; chain = nf_tables_chain_lookup(ctx->table, - tb[NFTA_VERDICT_CHAIN]); + tb[NFTA_VERDICT_CHAIN], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); if (chain->flags & NFT_BASE_CHAIN) -- cgit v1.2.3 From 37a9cc52552579f22e18cca401cfc4351b6cbc72 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 12 Jun 2016 22:52:45 +0200 Subject: netfilter: nf_tables: add generation mask to sets Similar to ("netfilter: nf_tables: add generation mask to tables"). Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 68 +++++++++++++++++++++++++------------------ net/netfilter/nft_dynset.c | 7 +++-- net/netfilter/nft_lookup.c | 6 ++-- 3 files changed, 49 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index cae88f8a990e..3316bce0a878 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -289,9 +289,6 @@ static int nft_delrule_by_chain(struct nft_ctx *ctx) return 0; } -/* Internal set flag */ -#define NFT_SET_INACTIVE (1 << 15) - static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type, struct nft_set *set) { @@ -304,7 +301,7 @@ static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type, if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] != NULL) { nft_trans_set_id(trans) = ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID])); - set->flags |= NFT_SET_INACTIVE; + nft_activate_next(ctx->net, set); } nft_trans_set(trans) = set; list_add_tail(&trans->list, &ctx->net->nft.commit_list); @@ -320,7 +317,7 @@ static int nft_delset(struct nft_ctx *ctx, struct nft_set *set) if (err < 0) return err; - list_del_rcu(&set->list); + nft_deactivate_next(ctx->net, set); ctx->table->use--; return err; @@ -741,6 +738,9 @@ static int nft_flush_table(struct nft_ctx *ctx) } list_for_each_entry_safe(set, ns, &ctx->table->sets, list) { + if (!nft_is_active_next(ctx->net, set)) + continue; + if (set->flags & NFT_SET_ANONYMOUS && !list_empty(&set->bindings)) continue; @@ -2367,7 +2367,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, } struct nft_set *nf_tables_set_lookup(const struct nft_table *table, - const struct nlattr *nla) + const struct nlattr *nla, u8 genmask) { struct nft_set *set; @@ -2375,22 +2375,27 @@ struct nft_set *nf_tables_set_lookup(const struct nft_table *table, return ERR_PTR(-EINVAL); list_for_each_entry(set, &table->sets, list) { - if (!nla_strcmp(nla, set->name)) + if (!nla_strcmp(nla, set->name) && + nft_active_genmask(set, genmask)) return set; } return ERR_PTR(-ENOENT); } struct nft_set *nf_tables_set_lookup_byid(const struct net *net, - const struct nlattr *nla) + const struct nlattr *nla, + u8 genmask) { struct nft_trans *trans; u32 id = ntohl(nla_get_be32(nla)); list_for_each_entry(trans, &net->nft.commit_list, list) { + struct nft_set *set = nft_trans_set(trans); + if (trans->msg_type == NFT_MSG_NEWSET && - id == nft_trans_set_id(trans)) - return nft_trans_set(trans); + id == nft_trans_set_id(trans) && + nft_active_genmask(set, genmask)) + return set; } return ERR_PTR(-ENOENT); } @@ -2415,6 +2420,8 @@ cont: list_for_each_entry(i, &ctx->table->sets, list) { int tmp; + if (!nft_is_active_next(ctx->net, set)) + continue; if (!sscanf(i->name, name, &tmp)) continue; if (tmp < min || tmp >= min + BITS_PER_BYTE * PAGE_SIZE) @@ -2434,6 +2441,8 @@ cont: snprintf(set->name, sizeof(set->name), name, min + n); list_for_each_entry(i, &ctx->table->sets, list) { + if (!nft_is_active_next(ctx->net, i)) + continue; if (!strcmp(set->name, i->name)) return -ENFILE; } @@ -2582,6 +2591,8 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) list_for_each_entry_rcu(set, &table->sets, list) { if (idx < s_idx) goto cont; + if (!nft_is_active(net, set)) + goto cont; ctx_set = *ctx; ctx_set.table = table; @@ -2651,11 +2662,9 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, if (nfmsg->nfgen_family == NFPROTO_UNSPEC) return -EAFNOSUPPORT; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]); + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) return PTR_ERR(set); - if (set->flags & NFT_SET_INACTIVE) - return -ENOENT; skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (skb2 == NULL) @@ -2798,7 +2807,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); - set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME]); + set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) { if (PTR_ERR(set) != -ENOENT) return PTR_ERR(set); @@ -2911,7 +2920,7 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]); + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) return PTR_ERR(set); if (!list_empty(&set->bindings)) @@ -2980,7 +2989,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, list_del_rcu(&binding->list); if (list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS && - !(set->flags & NFT_SET_INACTIVE)) + nft_is_active(ctx->net, set)) nf_tables_set_destroy(ctx, set); } @@ -3166,11 +3175,10 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], + genmask); if (IS_ERR(set)) return PTR_ERR(set); - if (set->flags & NFT_SET_INACTIVE) - return -ENOENT; event = NFT_MSG_NEWSETELEM; event |= NFNL_SUBSYS_NFTABLES << 8; @@ -3232,11 +3240,10 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], + genmask); if (IS_ERR(set)) return PTR_ERR(set); - if (set->flags & NFT_SET_INACTIVE) - return -ENOENT; if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { @@ -3567,11 +3574,13 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], + genmask); if (IS_ERR(set)) { if (nla[NFTA_SET_ELEM_LIST_SET_ID]) { set = nf_tables_set_lookup_byid(net, - nla[NFTA_SET_ELEM_LIST_SET_ID]); + nla[NFTA_SET_ELEM_LIST_SET_ID], + genmask); } if (IS_ERR(set)) return PTR_ERR(set); @@ -3690,7 +3699,8 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], + genmask); if (IS_ERR(set)) return PTR_ERR(set); if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) @@ -4003,7 +4013,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) NFT_MSG_DELRULE); break; case NFT_MSG_NEWSET: - nft_trans_set(trans)->flags &= ~NFT_SET_INACTIVE; + nft_clear(net, nft_trans_set(trans)); /* This avoids hitting -EBUSY when deleting the table * from the transaction. */ @@ -4016,6 +4026,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELSET: + list_del_rcu(&nft_trans_set(trans)->list); nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), NFT_MSG_DELSET, GFP_KERNEL); break; @@ -4134,8 +4145,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) break; case NFT_MSG_DELSET: trans->ctx.table->use++; - list_add_tail_rcu(&nft_trans_set(trans)->list, - &trans->ctx.table->sets); + nft_clear(trans->ctx.net, nft_trans_set(trans)); nft_trans_destroy(trans); break; case NFT_MSG_NEWSETELEM: @@ -4282,6 +4292,8 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, } list_for_each_entry(set, &ctx->table->sets, list) { + if (!nft_is_active_next(ctx->net, set)) + continue; if (!(set->flags & NFT_SET_MAP) || set->dtype != NFT_DATA_VERDICT) continue; diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 78d4914fb39c..0af26699bf04 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -103,6 +103,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_dynset *priv = nft_expr_priv(expr); + u8 genmask = nft_genmask_next(ctx->net); struct nft_set *set; u64 timeout; int err; @@ -112,11 +113,13 @@ static int nft_dynset_init(const struct nft_ctx *ctx, tb[NFTA_DYNSET_SREG_KEY] == NULL) return -EINVAL; - set = nf_tables_set_lookup(ctx->table, tb[NFTA_DYNSET_SET_NAME]); + set = nf_tables_set_lookup(ctx->table, tb[NFTA_DYNSET_SET_NAME], + genmask); if (IS_ERR(set)) { if (tb[NFTA_DYNSET_SET_ID]) set = nf_tables_set_lookup_byid(ctx->net, - tb[NFTA_DYNSET_SET_ID]); + tb[NFTA_DYNSET_SET_ID], + genmask); if (IS_ERR(set)) return PTR_ERR(set); } diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index b3c31ef8015d..8a102cf855d0 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -54,6 +54,7 @@ static int nft_lookup_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_lookup *priv = nft_expr_priv(expr); + u8 genmask = nft_genmask_next(ctx->net); struct nft_set *set; int err; @@ -61,11 +62,12 @@ static int nft_lookup_init(const struct nft_ctx *ctx, tb[NFTA_LOOKUP_SREG] == NULL) return -EINVAL; - set = nf_tables_set_lookup(ctx->table, tb[NFTA_LOOKUP_SET]); + set = nf_tables_set_lookup(ctx->table, tb[NFTA_LOOKUP_SET], genmask); if (IS_ERR(set)) { if (tb[NFTA_LOOKUP_SET_ID]) { set = nf_tables_set_lookup_byid(ctx->net, - tb[NFTA_LOOKUP_SET_ID]); + tb[NFTA_LOOKUP_SET_ID], + genmask); } if (IS_ERR(set)) return PTR_ERR(set); -- cgit v1.2.3 From 4e5001651f5e488eac378ebabc5bde2a8f1ea861 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 21 Jun 2016 00:12:15 +0200 Subject: netfilter: nft_rbtree: check for next generation when deactivating elements set->ops->deactivate() is invoked from nft_del_setelem() that happens from the transaction path, so we have to check if the object is active in the next generation, not the current. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_rbtree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c index f762094af7c1..86fbe5e68d28 100644 --- a/net/netfilter/nft_rbtree.c +++ b/net/netfilter/nft_rbtree.c @@ -170,7 +170,7 @@ static void *nft_rbtree_deactivate(const struct nft_set *set, const struct nft_rbtree *priv = nft_set_priv(set); const struct rb_node *parent = priv->root.rb_node; struct nft_rbtree_elem *rbe, *this = elem->priv; - u8 genmask = nft_genmask_cur(read_pnet(&set->pnet)); + u8 genmask = nft_genmask_next(read_pnet(&set->pnet)); int d; while (parent != NULL) { -- cgit v1.2.3 From 8eee54be73f4b938dbf48e95c0dbecb5f19b08ee Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 21 Jun 2016 00:12:26 +0200 Subject: netfilter: nft_hash: support deletion of inactive elements New elements are inactive in the preparation phase, and its NFT_SET_ELEM_BUSY_MASK flag is set on. This busy flag doesn't allow us to delete it from the same transaction, following a sequence like: begin transaction add element X delete element X end transaction This sequence is valid and may be triggered by robots. To resolve this problem, allow deactivating elements that are active in the current generation (ie. those that has been just added in this batch). Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_hash.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 6fa016564f90..d3a507d3f192 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -153,9 +153,10 @@ static void *nft_hash_deactivate(const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_hash *priv = nft_set_priv(set); + struct net *net = read_pnet(&set->pnet); struct nft_hash_elem *he; struct nft_hash_cmp_arg arg = { - .genmask = nft_genmask_next(read_pnet(&set->pnet)), + .genmask = nft_genmask_next(net), .set = set, .key = elem->key.val.data, }; @@ -163,7 +164,8 @@ static void *nft_hash_deactivate(const struct nft_set *set, rcu_read_lock(); he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params); if (he != NULL) { - if (!nft_set_elem_mark_busy(&he->ext)) + if (!nft_set_elem_mark_busy(&he->ext) || + !nft_is_active(net, &he->ext)) nft_set_elem_change_active(set, &he->ext); else he = NULL; -- cgit v1.2.3 From 3183ab8997a477c8d9ad175a1cef70dff77c6dbc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 22 Jun 2016 13:26:10 +0200 Subject: netfilter: conntrack: allow increasing bucket size via sysctl too No need to restrict this to module parameter. We export a copy of the real hash size -- when user alters the value we allocate the new table, copy entries etc before we update the real size to the requested one. This is also needed because the real size is used by concurrent readers and cannot be changed without synchronizing the conntrack generation seqcnt. We only allow changing this value from the initial net namespace. Tested using http-client-benchmark vs. httpterm with concurrent while true;do echo $RANDOM > /proc/sys/net/netfilter/nf_conntrack_buckets done Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 41 ++++++++++++++++++++++----------- net/netfilter/nf_conntrack_standalone.c | 36 +++++++++++++++++++++++++---- 2 files changed, 59 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index a459176c3253..e17d5c7faca0 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1595,24 +1595,14 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) } EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable); -int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) +int nf_conntrack_hash_resize(unsigned int hashsize) { - int i, bucket, rc; - unsigned int hashsize, old_size; + int i, bucket; + unsigned int old_size; struct hlist_nulls_head *hash, *old_hash; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; - if (current->nsproxy->net_ns != &init_net) - return -EOPNOTSUPP; - - /* On boot, we can set this without any fancy locking. */ - if (!nf_conntrack_htable_size) - return param_set_uint(val, kp); - - rc = kstrtouint(val, 0, &hashsize); - if (rc) - return rc; if (!hashsize) return -EINVAL; @@ -1620,6 +1610,12 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) if (!hash) return -ENOMEM; + old_size = nf_conntrack_htable_size; + if (old_size == hashsize) { + nf_ct_free_hashtable(hash, hashsize); + return 0; + } + local_bh_disable(); nf_conntrack_all_lock(); write_seqcount_begin(&nf_conntrack_generation); @@ -1655,6 +1651,25 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) nf_ct_free_hashtable(old_hash, old_size); return 0; } + +int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) +{ + unsigned int hashsize; + int rc; + + if (current->nsproxy->net_ns != &init_net) + return -EOPNOTSUPP; + + /* On boot, we can set this without any fancy locking. */ + if (!nf_conntrack_htable_size) + return param_set_uint(val, kp); + + rc = kstrtouint(val, 0, &hashsize); + if (rc) + return rc; + + return nf_conntrack_hash_resize(hashsize); +} EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index f87e84ebcec3..a0cc1919f081 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -434,8 +434,29 @@ static void nf_conntrack_standalone_fini_proc(struct net *net) #ifdef CONFIG_SYSCTL /* Log invalid packets of a given protocol */ -static int log_invalid_proto_min = 0; -static int log_invalid_proto_max = 255; +static int log_invalid_proto_min __read_mostly; +static int log_invalid_proto_max __read_mostly = 255; + +/* size the user *wants to set */ +static unsigned int nf_conntrack_htable_size_user __read_mostly; + +static int +nf_conntrack_hash_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret < 0 || !write) + return ret; + + /* update ret, we might not be able to satisfy request */ + ret = nf_conntrack_hash_resize(nf_conntrack_htable_size_user); + + /* update it to the actual value used by conntrack */ + nf_conntrack_htable_size_user = nf_conntrack_htable_size; + return ret; +} static struct ctl_table_header *nf_ct_netfilter_header; @@ -456,10 +477,10 @@ static struct ctl_table nf_ct_sysctl_table[] = { }, { .procname = "nf_conntrack_buckets", - .data = &nf_conntrack_htable_size, + .data = &nf_conntrack_htable_size_user, .maxlen = sizeof(unsigned int), - .mode = 0444, - .proc_handler = proc_dointvec, + .mode = 0644, + .proc_handler = nf_conntrack_hash_sysctl, }, { .procname = "nf_conntrack_checksum", @@ -517,6 +538,9 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) if (net->user_ns != &init_user_ns) table[0].procname = NULL; + if (!net_eq(&init_net, net)) + table[2].mode = 0444; + net->ct.sysctl_header = register_net_sysctl(net, "net/netfilter", table); if (!net->ct.sysctl_header) goto out_unregister_netfilter; @@ -606,6 +630,8 @@ static int __init nf_conntrack_standalone_init(void) ret = -ENOMEM; goto out_sysctl; } + + nf_conntrack_htable_size_user = nf_conntrack_htable_size; #endif ret = register_pernet_subsys(&nf_conntrack_net_ops); -- cgit v1.2.3 From 82bec71d46b83f39860e2838ff8394e4fcd6efab Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 22 Jun 2016 14:26:33 +0200 Subject: netfilter: nf_tables: get rid of NFT_BASECHAIN_DISABLED This flag was introduced to restore rulesets from the new netdev family, but since 5ebe0b0eec9d6f7 ("netfilter: nf_tables: destroy basechain and rules on netdevice removal") the ruleset is released once the netdev is gone. This also removes nft_register_basechain() and nft_unregister_basechain() since they have no clients anymore after this rework. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 62 +++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3316bce0a878..92c9faeb2bf8 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -131,29 +131,8 @@ static void nft_trans_destroy(struct nft_trans *trans) kfree(trans); } -static int nft_register_basechain(struct nft_base_chain *basechain, - unsigned int hook_nops) -{ - struct net *net = read_pnet(&basechain->pnet); - - if (basechain->flags & NFT_BASECHAIN_DISABLED) - return 0; - - return nf_register_net_hooks(net, basechain->ops, hook_nops); -} - -static void nft_unregister_basechain(struct nft_base_chain *basechain, - unsigned int hook_nops) -{ - struct net *net = read_pnet(&basechain->pnet); - - if (basechain->flags & NFT_BASECHAIN_DISABLED) - return; - - nf_unregister_net_hooks(net, basechain->ops, hook_nops); -} - -static int nf_tables_register_hooks(const struct nft_table *table, +static int nf_tables_register_hooks(struct net *net, + const struct nft_table *table, struct nft_chain *chain, unsigned int hook_nops) { @@ -161,10 +140,12 @@ static int nf_tables_register_hooks(const struct nft_table *table, !(chain->flags & NFT_BASE_CHAIN)) return 0; - return nft_register_basechain(nft_base_chain(chain), hook_nops); + return nf_register_net_hooks(net, nft_base_chain(chain)->ops, + hook_nops); } -static void nf_tables_unregister_hooks(const struct nft_table *table, +static void nf_tables_unregister_hooks(struct net *net, + const struct nft_table *table, struct nft_chain *chain, unsigned int hook_nops) { @@ -172,7 +153,7 @@ static void nf_tables_unregister_hooks(const struct nft_table *table, !(chain->flags & NFT_BASE_CHAIN)) return; - nft_unregister_basechain(nft_base_chain(chain), hook_nops); + nf_unregister_net_hooks(net, nft_base_chain(chain)->ops, hook_nops); } static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) @@ -569,7 +550,8 @@ static int nf_tables_table_enable(struct net *net, if (!(chain->flags & NFT_BASE_CHAIN)) continue; - err = nft_register_basechain(nft_base_chain(chain), afi->nops); + err = nf_register_net_hooks(net, nft_base_chain(chain)->ops, + afi->nops); if (err < 0) goto err; @@ -586,7 +568,8 @@ err: if (i-- <= 0) break; - nft_unregister_basechain(nft_base_chain(chain), afi->nops); + nf_unregister_net_hooks(net, nft_base_chain(chain)->ops, + afi->nops); } return err; } @@ -600,9 +583,11 @@ static void nf_tables_table_disable(struct net *net, list_for_each_entry(chain, &table->chains, list) { if (!nft_is_active_next(net, chain)) continue; - if (chain->flags & NFT_BASE_CHAIN) - nft_unregister_basechain(nft_base_chain(chain), - afi->nops); + if (!(chain->flags & NFT_BASE_CHAIN)) + continue; + + nf_unregister_net_hooks(net, nft_base_chain(chain)->ops, + afi->nops); } } @@ -1451,7 +1436,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, chain->table = table; nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN); - err = nf_tables_register_hooks(table, chain, afi->nops); + err = nf_tables_register_hooks(net, table, chain, afi->nops); if (err < 0) goto err1; @@ -1464,7 +1449,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, list_add_tail_rcu(&chain->list, &table->chains); return 0; err2: - nf_tables_unregister_hooks(table, chain, afi->nops); + nf_tables_unregister_hooks(net, table, chain, afi->nops); err1: nf_tables_chain_destroy(chain); return err; @@ -3995,7 +3980,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) case NFT_MSG_DELCHAIN: list_del_rcu(&trans->ctx.chain->list); nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN); - nf_tables_unregister_hooks(trans->ctx.table, + nf_tables_unregister_hooks(trans->ctx.net, + trans->ctx.table, trans->ctx.chain, trans->ctx.afi->nops); break; @@ -4120,7 +4106,8 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) } else { trans->ctx.table->use--; list_del_rcu(&trans->ctx.chain->list); - nf_tables_unregister_hooks(trans->ctx.table, + nf_tables_unregister_hooks(trans->ctx.net, + trans->ctx.table, trans->ctx.chain, trans->ctx.afi->nops); } @@ -4662,7 +4649,7 @@ int __nft_release_basechain(struct nft_ctx *ctx) BUG_ON(!(ctx->chain->flags & NFT_BASE_CHAIN)); - nf_tables_unregister_hooks(ctx->chain->table, ctx->chain, + nf_tables_unregister_hooks(ctx->net, ctx->chain->table, ctx->chain, ctx->afi->nops); list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) { list_del(&rule->list); @@ -4691,7 +4678,8 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi) list_for_each_entry_safe(table, nt, &afi->tables, list) { list_for_each_entry(chain, &table->chains, list) - nf_tables_unregister_hooks(table, chain, afi->nops); + nf_tables_unregister_hooks(net, table, chain, + afi->nops); /* No packets are walking on these chains anymore. */ ctx.table = table; list_for_each_entry(chain, &table->chains, list) { -- cgit v1.2.3 From 0071e184a535e40ce487528cb04f4690cb0da881 Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Thu, 23 Jun 2016 12:24:08 +0200 Subject: netfilter: nf_tables: add support for inverted logic in nft_lookup Introduce a new configuration option for this expression, which allows users to invert the logic of set lookups. In _init() we will now return EINVAL if NFT_LOOKUP_F_INV is in anyway related to a map lookup. The code in the _eval() function has been untangled and updated to sopport the XOR of options, as we should consider 4 cases: * lookup false, invert false -> NFT_BREAK * lookup false, invert true -> return w/o NFT_BREAK * lookup true, invert false -> return w/o NFT_BREAK * lookup true, invert true -> NFT_BREAK Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_lookup.c | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 8a102cf855d0..b8d18f598569 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -22,6 +22,7 @@ struct nft_lookup { struct nft_set *set; enum nft_registers sreg:8; enum nft_registers dreg:8; + bool invert; struct nft_set_binding binding; }; @@ -32,14 +33,20 @@ static void nft_lookup_eval(const struct nft_expr *expr, const struct nft_lookup *priv = nft_expr_priv(expr); const struct nft_set *set = priv->set; const struct nft_set_ext *ext; + bool found; - if (set->ops->lookup(set, ®s->data[priv->sreg], &ext)) { - if (set->flags & NFT_SET_MAP) - nft_data_copy(®s->data[priv->dreg], - nft_set_ext_data(ext), set->dlen); + found = set->ops->lookup(set, ®s->data[priv->sreg], &ext) ^ + priv->invert; + + if (!found) { + regs->verdict.code = NFT_BREAK; return; } - regs->verdict.code = NFT_BREAK; + + if (found && set->flags & NFT_SET_MAP) + nft_data_copy(®s->data[priv->dreg], + nft_set_ext_data(ext), set->dlen); + } static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = { @@ -47,6 +54,7 @@ static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = { [NFTA_LOOKUP_SET_ID] = { .type = NLA_U32 }, [NFTA_LOOKUP_SREG] = { .type = NLA_U32 }, [NFTA_LOOKUP_DREG] = { .type = NLA_U32 }, + [NFTA_LOOKUP_FLAGS] = { .type = NLA_U32 }, }; static int nft_lookup_init(const struct nft_ctx *ctx, @@ -56,6 +64,7 @@ static int nft_lookup_init(const struct nft_ctx *ctx, struct nft_lookup *priv = nft_expr_priv(expr); u8 genmask = nft_genmask_next(ctx->net); struct nft_set *set; + u32 flags; int err; if (tb[NFTA_LOOKUP_SET] == NULL || @@ -81,7 +90,22 @@ static int nft_lookup_init(const struct nft_ctx *ctx, if (err < 0) return err; + if (tb[NFTA_LOOKUP_FLAGS]) { + flags = ntohl(nla_get_be32(tb[NFTA_LOOKUP_FLAGS])); + + if (flags & ~NFT_LOOKUP_F_INV) + return -EINVAL; + + if (flags & NFT_LOOKUP_F_INV) { + if (set->flags & NFT_SET_MAP) + return -EINVAL; + priv->invert = true; + } + } + if (tb[NFTA_LOOKUP_DREG] != NULL) { + if (priv->invert) + return -EINVAL; if (!(set->flags & NFT_SET_MAP)) return -EINVAL; @@ -114,6 +138,7 @@ static void nft_lookup_destroy(const struct nft_ctx *ctx, static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_lookup *priv = nft_expr_priv(expr); + u32 flags = priv->invert ? NFT_LOOKUP_F_INV : 0; if (nla_put_string(skb, NFTA_LOOKUP_SET, priv->set->name)) goto nla_put_failure; @@ -122,6 +147,8 @@ static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr) if (priv->set->flags & NFT_SET_MAP) if (nft_dump_register(skb, NFTA_LOOKUP_DREG, priv->dreg)) goto nla_put_failure; + if (nla_put_be32(skb, NFTA_LOOKUP_FLAGS, htonl(flags))) + goto nla_put_failure; return 0; nla_put_failure: -- cgit v1.2.3 From 1c1779fa54b2a9d4e1de990095d790d64b9e00a1 Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Tue, 21 Jun 2016 14:59:37 -0700 Subject: openvswitch: Set mark and labels before confirming. Set conntrack mark and labels right before committing so that the initial conntrack NEW event has the mark and labels. Signed-off-by: Jarno Rajahalme Acked-by: Joe Stringer Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 3d5feede962d..23fd4fbd11e2 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -824,23 +824,6 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, return 0; } -/* Lookup connection and confirm if unconfirmed. */ -static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, - const struct ovs_conntrack_info *info, - struct sk_buff *skb) -{ - int err; - - err = __ovs_ct_lookup(net, key, info, skb); - if (err) - return err; - /* This is a no-op if the connection has already been confirmed. */ - if (nf_conntrack_confirm(skb) != NF_ACCEPT) - return -EINVAL; - - return 0; -} - static bool labels_nonzero(const struct ovs_key_ct_labels *labels) { size_t i; @@ -873,21 +856,33 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, } if (info->commit) - err = ovs_ct_commit(net, key, info, skb); + err = __ovs_ct_lookup(net, key, info, skb); else err = ovs_ct_lookup(net, key, info, skb); if (err) goto err; + /* Apply changes before confirming the connection so that the initial + * conntrack NEW netlink event carries the values given in the CT + * action. + */ if (info->mark.mask) { err = ovs_ct_set_mark(skb, key, info->mark.value, info->mark.mask); if (err) goto err; } - if (labels_nonzero(&info->labels.mask)) + if (labels_nonzero(&info->labels.mask)) { err = ovs_ct_set_labels(skb, key, &info->labels.value, &info->labels.mask); + if (err) + goto err; + } + /* This will take care of sending queued events even if the connection + * is already confirmed. + */ + if (info->commit && nf_conntrack_confirm(skb) != NF_ACCEPT) + err = -EINVAL; err: skb_push(skb, nh_ofs); if (err) -- cgit v1.2.3 From 7d904c7bcd51f72579c0c3134a50896c5a3efb9f Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Tue, 21 Jun 2016 14:59:38 -0700 Subject: openvswitch: Only set mark and labels with a commit flag. Only set conntrack mark or labels when the commit flag is specified. This makes sure we can not set them before the connection has been persisted, as in that case the mark and labels would be lost in an event of an userspace upcall. OVS userspace already requires the commit flag to accept setting ct_mark and/or ct_labels. Validate for this in the kernel API. Signed-off-by: Jarno Rajahalme Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 76 ++++++++++++++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 23fd4fbd11e2..52f3b9b89e97 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -835,6 +835,42 @@ static bool labels_nonzero(const struct ovs_key_ct_labels *labels) return false; } +/* Lookup connection and confirm if unconfirmed. */ +static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + int err; + + err = __ovs_ct_lookup(net, key, info, skb); + if (err) + return err; + + /* Apply changes before confirming the connection so that the initial + * conntrack NEW netlink event carries the values given in the CT + * action. + */ + if (info->mark.mask) { + err = ovs_ct_set_mark(skb, key, info->mark.value, + info->mark.mask); + if (err) + return err; + } + if (labels_nonzero(&info->labels.mask)) { + err = ovs_ct_set_labels(skb, key, &info->labels.value, + &info->labels.mask); + if (err) + return err; + } + /* This will take care of sending queued events even if the connection + * is already confirmed. + */ + if (nf_conntrack_confirm(skb) != NF_ACCEPT) + return -EINVAL; + + return 0; +} + /* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero * value if 'skb' is freed. */ @@ -856,34 +892,10 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, } if (info->commit) - err = __ovs_ct_lookup(net, key, info, skb); + err = ovs_ct_commit(net, key, info, skb); else err = ovs_ct_lookup(net, key, info, skb); - if (err) - goto err; - /* Apply changes before confirming the connection so that the initial - * conntrack NEW netlink event carries the values given in the CT - * action. - */ - if (info->mark.mask) { - err = ovs_ct_set_mark(skb, key, info->mark.value, - info->mark.mask); - if (err) - goto err; - } - if (labels_nonzero(&info->labels.mask)) { - err = ovs_ct_set_labels(skb, key, &info->labels.value, - &info->labels.mask); - if (err) - goto err; - } - /* This will take care of sending queued events even if the connection - * is already confirmed. - */ - if (info->commit && nf_conntrack_confirm(skb) != NF_ACCEPT) - err = -EINVAL; -err: skb_push(skb, nh_ofs); if (err) kfree_skb(skb); @@ -1140,6 +1152,20 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, } } +#ifdef CONFIG_NF_CONNTRACK_MARK + if (!info->commit && info->mark.mask) { + OVS_NLERR(log, + "Setting conntrack mark requires 'commit' flag."); + return -EINVAL; + } +#endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + if (!info->commit && labels_nonzero(&info->labels.mask)) { + OVS_NLERR(log, + "Setting conntrack labels requires 'commit' flag."); + return -EINVAL; + } +#endif if (rem > 0) { OVS_NLERR(log, "Conntrack attr has %d unknown bytes", rem); return -EINVAL; -- cgit v1.2.3 From 520ac30f45519b0a82dd92117c181d1d6144677b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Jun 2016 23:16:49 -0700 Subject: net_sched: drop packets after root qdisc lock is released Qdisc performance suffers when packets are dropped at enqueue() time because drops (kfree_skb()) are done while qdisc lock is held, delaying a dequeue() draining the queue. Nominal throughput can be reduced by 50 % when this happens, at a time we would like the dequeue() to proceed as fast as possible. Even FQ is vulnerable to this problem, while one of FQ goals was to provide some flow isolation. This patch adds a 'struct sk_buff **to_free' parameter to all qdisc->enqueue(), and in qdisc_drop() helper. I measured a performance increase of up to 12 %, but this patch is a prereq so that future batches in enqueue() can fly. Signed-off-by: Eric Dumazet Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/dev.c | 7 +++++-- net/sched/sch_atm.c | 9 +++++---- net/sched/sch_blackhole.c | 5 +++-- net/sched/sch_cbq.c | 7 ++++--- net/sched/sch_choke.c | 16 +++++++++------- net/sched/sch_codel.c | 8 +++++--- net/sched/sch_drr.c | 7 ++++--- net/sched/sch_dsmark.c | 9 +++++---- net/sched/sch_fifo.c | 15 +++++++++------ net/sched/sch_fq.c | 7 ++++--- net/sched/sch_fq_codel.c | 15 +++++++++------ net/sched/sch_generic.c | 10 ++++++---- net/sched/sch_gred.c | 7 ++++--- net/sched/sch_hfsc.c | 6 +++--- net/sched/sch_hhf.c | 10 +++++----- net/sched/sch_htb.c | 10 ++++++---- net/sched/sch_multiq.c | 7 ++++--- net/sched/sch_netem.c | 25 +++++++++++++++---------- net/sched/sch_pie.c | 5 +++-- net/sched/sch_plug.c | 5 +++-- net/sched/sch_prio.c | 4 ++-- net/sched/sch_qfq.c | 7 ++++--- net/sched/sch_red.c | 7 ++++--- net/sched/sch_sfb.c | 7 ++++--- net/sched/sch_sfq.c | 8 ++++---- net/sched/sch_tbf.c | 16 +++++++++------- net/sched/sch_teql.c | 4 ++-- 27 files changed, 140 insertions(+), 103 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index d40593b3b9fb..aba10d2a8bc3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3070,6 +3070,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct netdev_queue *txq) { spinlock_t *root_lock = qdisc_lock(q); + struct sk_buff *to_free = NULL; bool contended; int rc; @@ -3086,7 +3087,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, spin_lock(root_lock); if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { - kfree_skb(skb); + __qdisc_drop(skb, &to_free); rc = NET_XMIT_DROP; } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && qdisc_run_begin(q)) { @@ -3109,7 +3110,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, rc = NET_XMIT_SUCCESS; } else { - rc = q->enqueue(skb, q) & NET_XMIT_MASK; + rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; if (qdisc_run_begin(q)) { if (unlikely(contended)) { spin_unlock(&q->busylock); @@ -3119,6 +3120,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, } } spin_unlock(root_lock); + if (unlikely(to_free)) + kfree_skb_list(to_free); if (unlikely(contended)) spin_unlock(&q->busylock); return rc; diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index e04ea6994d1c..481e4f12aeb4 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -357,7 +357,8 @@ static struct tcf_proto __rcu **atm_tc_find_tcf(struct Qdisc *sch, /* --------------------------- Qdisc operations ---------------------------- */ -static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow; @@ -398,10 +399,10 @@ done: switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: - kfree_skb(skb); + __qdisc_drop(skb, to_free); return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: - kfree_skb(skb); + __qdisc_drop(skb, to_free); goto drop; case TC_ACT_RECLASSIFY: if (flow->excess) @@ -413,7 +414,7 @@ done: #endif } - ret = qdisc_enqueue(skb, flow->q); + ret = qdisc_enqueue(skb, flow->q, to_free); if (ret != NET_XMIT_SUCCESS) { drop: __maybe_unused if (net_xmit_drop_count(ret)) { diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c index 3fee70d9814f..c98a61e980ba 100644 --- a/net/sched/sch_blackhole.c +++ b/net/sched/sch_blackhole.c @@ -17,9 +17,10 @@ #include #include -static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { - qdisc_drop(skb, sch); + qdisc_drop(skb, sch, to_free); return NET_XMIT_SUCCESS; } diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index a29fd811d7b9..beb554aa8cfb 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -358,7 +358,8 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) } static int -cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct cbq_sched_data *q = qdisc_priv(sch); int uninitialized_var(ret); @@ -370,11 +371,11 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (cl == NULL) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return ret; } - ret = qdisc_enqueue(skb, cl->q); + ret = qdisc_enqueue(skb, cl->q, to_free); if (ret == NET_XMIT_SUCCESS) { sch->q.qlen++; cbq_mark_toplevel(q, cl); diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 789b69ee9e51..3b6d5bd69101 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -115,7 +115,8 @@ static void choke_zap_tail_holes(struct choke_sched_data *q) } /* Drop packet from queue array by creating a "hole" */ -static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx) +static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx, + struct sk_buff **to_free) { struct choke_sched_data *q = qdisc_priv(sch); struct sk_buff *skb = q->tab[idx]; @@ -129,7 +130,7 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx) qdisc_qstats_backlog_dec(sch, skb); qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); - qdisc_drop(skb, sch); + qdisc_drop(skb, sch, to_free); --sch->q.qlen; } @@ -261,7 +262,8 @@ static bool choke_match_random(const struct choke_sched_data *q, return choke_match_flow(oskb, nskb); } -static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; struct choke_sched_data *q = qdisc_priv(sch); @@ -288,7 +290,7 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) /* Draw a packet at random from queue and compare flow */ if (choke_match_random(q, skb, &idx)) { q->stats.matched++; - choke_drop_by_idx(sch, idx); + choke_drop_by_idx(sch, idx, to_free); goto congestion_drop; } @@ -331,16 +333,16 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) } q->stats.pdrop++; - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); congestion_drop: - qdisc_drop(skb, sch); + qdisc_drop(skb, sch, to_free); return NET_XMIT_CN; other_drop: if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return ret; } diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index c5bc424e3b3c..4002df3c7d9f 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -82,7 +82,8 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - qdisc_drop(skb, sch); + kfree_skb(skb); + qdisc_qstats_drop(sch); } static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch) @@ -107,7 +108,8 @@ static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch) return skb; } -static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct codel_sched_data *q; @@ -117,7 +119,7 @@ static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) } q = qdisc_priv(sch); q->drop_overlimit++; - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); } static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 22609e4e845f..8af5c59eef84 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -350,7 +350,8 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, return NULL; } -static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; @@ -360,11 +361,11 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (cl == NULL) { if (err & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return err; } - err = qdisc_enqueue(skb, cl->qdisc); + err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { cl->qstats.drops++; diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index b9ba5f658528..1308bbf460f7 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -191,7 +191,8 @@ static inline struct tcf_proto __rcu **dsmark_find_tcf(struct Qdisc *sch, /* --------------------------- Qdisc operations ---------------------------- */ -static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct dsmark_qdisc_data *p = qdisc_priv(sch); int err; @@ -234,7 +235,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) #ifdef CONFIG_NET_CLS_ACT case TC_ACT_QUEUED: case TC_ACT_STOLEN: - kfree_skb(skb); + __qdisc_drop(skb, to_free); return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: @@ -251,7 +252,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) } } - err = qdisc_enqueue(skb, p->q); + err = qdisc_enqueue(skb, p->q, to_free); if (err != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(err)) qdisc_qstats_drop(sch); @@ -264,7 +265,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_SUCCESS; drop: - qdisc_drop(skb, sch); + qdisc_drop(skb, sch, to_free); return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index dea70e3ef0ba..6ea0db427f91 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -19,29 +19,32 @@ /* 1 band FIFO pseudo-"scheduler" */ -static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit)) return qdisc_enqueue_tail(skb, sch); - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); } -static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { if (likely(skb_queue_len(&sch->q) < sch->limit)) return qdisc_enqueue_tail(skb, sch); - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); } -static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { if (likely(skb_queue_len(&sch->q) < sch->limit)) return qdisc_enqueue_tail(skb, sch); /* queue full, remove one skb to fulfill the limit */ - __qdisc_queue_drop_head(sch, &sch->q); + __qdisc_queue_drop_head(sch, &sch->q, to_free); qdisc_qstats_drop(sch); qdisc_enqueue_tail(skb, sch); diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 6eb06674f778..e5458b99e09c 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -368,18 +368,19 @@ static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) } } -static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct fq_sched_data *q = qdisc_priv(sch); struct fq_flow *f; if (unlikely(sch->q.qlen >= sch->limit)) - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); f = fq_classify(skb, q); if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) { q->stat_flows_plimit++; - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); } f->qlen++; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 2dc0a849515a..f715195459c9 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -139,7 +139,8 @@ static inline void flow_queue_add(struct fq_codel_flow *flow, skb->next = NULL; } -static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets) +static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets, + struct sk_buff **to_free) { struct fq_codel_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; @@ -172,7 +173,7 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets) skb = dequeue_head(flow); len += qdisc_pkt_len(skb); mem += skb->truesize; - kfree_skb(skb); + __qdisc_drop(skb, to_free); } while (++i < max_packets && len < threshold); flow->dropped += i; @@ -184,7 +185,8 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets) return idx; } -static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct fq_codel_sched_data *q = qdisc_priv(sch); unsigned int idx, prev_backlog, prev_qlen; @@ -197,7 +199,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (idx == 0) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return ret; } idx--; @@ -229,7 +231,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) * So instead of dropping a single packet, drop half of its backlog * with a 64 packets limit to not add a too big cpu spike here. */ - ret = fq_codel_drop(sch, q->drop_batch_size); + ret = fq_codel_drop(sch, q->drop_batch_size, to_free); prev_qlen -= sch->q.qlen; prev_backlog -= sch->qstats.backlog; @@ -276,7 +278,8 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - qdisc_drop(skb, sch); + kfree_skb(skb); + qdisc_qstats_drop(sch); } static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 773b632e1e33..ff86606954f2 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -348,9 +348,10 @@ EXPORT_SYMBOL(netif_carrier_off); cheaper. */ -static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc) +static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, + struct sk_buff **to_free) { - kfree_skb(skb); + __qdisc_drop(skb, to_free); return NET_XMIT_CN; } @@ -439,7 +440,8 @@ static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv, return priv->q + band; } -static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc) +static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, + struct sk_buff **to_free) { if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) { int band = prio2band[skb->priority & TC_PRIO_MAX]; @@ -451,7 +453,7 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc) return __qdisc_enqueue_tail(skb, qdisc, list); } - return qdisc_drop(skb, qdisc); + return qdisc_drop(skb, qdisc, to_free); } static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index b5fb63c7be02..c78a093c551a 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -149,7 +149,8 @@ static inline int gred_use_harddrop(struct gred_sched *t) return t->red_flags & TC_RED_HARDDROP; } -static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct gred_sched_data *q = NULL; struct gred_sched *t = qdisc_priv(sch); @@ -237,10 +238,10 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) q->stats.pdrop++; drop: - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); congestion_drop: - qdisc_drop(skb, sch); + qdisc_drop(skb, sch, to_free); return NET_XMIT_CN; } diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index bd08c363a26d..8cb5eff7b79c 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1572,7 +1572,7 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) } static int -hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct hfsc_class *cl; int uninitialized_var(err); @@ -1581,11 +1581,11 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (cl == NULL) { if (err & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return err; } - err = qdisc_enqueue(skb, cl->qdisc); + err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { cl->qstats.drops++; diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index c44593b8e65a..e3d0458af17b 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -345,7 +345,7 @@ static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb) skb->next = NULL; } -static unsigned int hhf_drop(struct Qdisc *sch) +static unsigned int hhf_drop(struct Qdisc *sch, struct sk_buff **to_free) { struct hhf_sched_data *q = qdisc_priv(sch); struct wdrr_bucket *bucket; @@ -359,16 +359,16 @@ static unsigned int hhf_drop(struct Qdisc *sch) struct sk_buff *skb = dequeue_head(bucket); sch->q.qlen--; - qdisc_qstats_drop(sch); qdisc_qstats_backlog_dec(sch, skb); - kfree_skb(skb); + qdisc_drop(skb, sch, to_free); } /* Return id of the bucket from which the packet was dropped. */ return bucket - q->buckets; } -static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct hhf_sched_data *q = qdisc_priv(sch); enum wdrr_bucket_idx idx; @@ -406,7 +406,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) /* Return Congestion Notification only if we dropped a packet from this * bucket. */ - if (hhf_drop(sch) == idx) + if (hhf_drop(sch, to_free) == idx) return NET_XMIT_CN; /* As we dropped a packet, better let upper stack know this. */ diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index a454605ab5cb..f3882259c385 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -569,7 +569,8 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) list_del_init(&cl->un.leaf.drop_list); } -static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { int uninitialized_var(ret); struct htb_sched *q = qdisc_priv(sch); @@ -581,16 +582,17 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) __skb_queue_tail(&q->direct_queue, skb); q->direct_pkts++; } else { - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); } #ifdef CONFIG_NET_CLS_ACT } else if (!cl) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return ret; #endif - } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q)) != NET_XMIT_SUCCESS) { + } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q, + to_free)) != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) { qdisc_qstats_drop(sch); cl->qstats.drops++; diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 5ea93305d705..9ffbb025b37e 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -65,7 +65,8 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) } static int -multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct Qdisc *qdisc; int ret; @@ -76,12 +77,12 @@ multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return ret; } #endif - ret = qdisc_enqueue(skb, qdisc); + ret = qdisc_enqueue(skb, qdisc, to_free); if (ret == NET_XMIT_SUCCESS) { sch->q.qlen++; return NET_XMIT_SUCCESS; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index e271967439bf..ccca8ca4c722 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -397,7 +397,8 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) * when we statistically choose to corrupt one, we instead segment it, returning * the first packet to be corrupted, and re-enqueue the remaining frames */ -static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch) +static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct sk_buff *segs; netdev_features_t features = netif_skb_features(skb); @@ -405,7 +406,7 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch) segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { - qdisc_drop(skb, sch); + qdisc_drop(skb, sch, to_free); return NULL; } consume_skb(skb); @@ -418,7 +419,8 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch) * NET_XMIT_DROP: queue length didn't change. * NET_XMIT_SUCCESS: one skb was queued. */ -static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct netem_sched_data *q = qdisc_priv(sch); /* We don't fill cb now as skb_unshare() may invalidate it */ @@ -443,7 +445,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) } if (count == 0) { qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } @@ -463,7 +465,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ q->duplicate = 0; - rootq->enqueue(skb2, rootq); + rootq->enqueue(skb2, rootq, to_free); q->duplicate = dupsave; } @@ -475,7 +477,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) */ if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) { if (skb_is_gso(skb)) { - segs = netem_segment(skb, sch); + segs = netem_segment(skb, sch, to_free); if (!segs) return NET_XMIT_DROP; } else { @@ -488,7 +490,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (!(skb = skb_unshare(skb, GFP_ATOMIC)) || (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))) { - rc = qdisc_drop(skb, sch); + rc = qdisc_drop(skb, sch, to_free); goto finish_segs; } @@ -497,7 +499,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) } if (unlikely(skb_queue_len(&sch->q) >= sch->limit)) - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); qdisc_qstats_backlog_inc(sch, skb); @@ -557,7 +559,7 @@ finish_segs: segs->next = NULL; qdisc_skb_cb(segs)->pkt_len = segs->len; last_len = segs->len; - rc = qdisc_enqueue(segs, sch); + rc = qdisc_enqueue(segs, sch, to_free); if (rc != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(rc)) qdisc_qstats_drop(sch); @@ -615,8 +617,11 @@ deliver: #endif if (q->qdisc) { - int err = qdisc_enqueue(skb, q->qdisc); + struct sk_buff *to_free = NULL; + int err; + err = qdisc_enqueue(skb, q->qdisc, &to_free); + kfree_skb_list(to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { qdisc_qstats_drop(sch); diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 912a46a5d02e..a570b0bb254c 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -134,7 +134,8 @@ static bool drop_early(struct Qdisc *sch, u32 packet_size) return false; } -static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct pie_sched_data *q = qdisc_priv(sch); bool enqueue = false; @@ -166,7 +167,7 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) out: q->stats.dropped++; - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); } static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c index a12cd37680f8..1c6cbab3e7b9 100644 --- a/net/sched/sch_plug.c +++ b/net/sched/sch_plug.c @@ -88,7 +88,8 @@ struct plug_sched_data { u32 pkts_to_release; }; -static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct plug_sched_data *q = qdisc_priv(sch); @@ -98,7 +99,7 @@ static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch) return qdisc_enqueue_tail(skb, sch); } - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); } static struct sk_buff *plug_dequeue(struct Qdisc *sch) diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index de492682caee..f4d443aeae54 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -67,7 +67,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) } static int -prio_enqueue(struct sk_buff *skb, struct Qdisc *sch) +prio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct Qdisc *qdisc; int ret; @@ -83,7 +83,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch) } #endif - ret = qdisc_enqueue(skb, qdisc); + ret = qdisc_enqueue(skb, qdisc, to_free); if (ret == NET_XMIT_SUCCESS) { qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 0427fa8b23f2..f27ffee106f6 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1217,7 +1217,8 @@ static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q) return agg; } -static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; @@ -1240,11 +1241,11 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) qdisc_pkt_len(skb)); if (err) { cl->qstats.drops++; - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); } } - err = qdisc_enqueue(skb, cl->qdisc); + err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { pr_debug("qfq_enqueue: enqueue failed %d\n", err); if (net_xmit_drop_count(err)) { diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index a0d57530335e..249b2a18acbd 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -56,7 +56,8 @@ static inline int red_use_harddrop(struct red_sched_data *q) return q->flags & TC_RED_HARDDROP; } -static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct red_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; @@ -95,7 +96,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch) break; } - ret = qdisc_enqueue(skb, child); + ret = qdisc_enqueue(skb, child, to_free); if (likely(ret == NET_XMIT_SUCCESS)) { qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; @@ -106,7 +107,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch) return ret; congestion_drop: - qdisc_drop(skb, sch); + qdisc_drop(skb, sch, to_free); return NET_XMIT_CN; } diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index c69611640fa5..add3cc7d37ec 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -275,7 +275,8 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl, return false; } -static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct sfb_sched_data *q = qdisc_priv(sch); @@ -397,7 +398,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) } enqueue: - ret = qdisc_enqueue(skb, child); + ret = qdisc_enqueue(skb, child, to_free); if (likely(ret == NET_XMIT_SUCCESS)) { sch->q.qlen++; increment_qlen(skb, q); @@ -408,7 +409,7 @@ enqueue: return ret; drop: - qdisc_drop(skb, sch); + qdisc_drop(skb, sch, to_free); return NET_XMIT_CN; other_drop: if (ret & __NET_XMIT_BYPASS) diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 57d118b41cad..7f195ed4d568 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -343,7 +343,7 @@ static int sfq_headdrop(const struct sfq_sched_data *q) } static int -sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct sfq_sched_data *q = qdisc_priv(sch); unsigned int hash, dropped; @@ -367,7 +367,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (x == SFQ_EMPTY_SLOT) { x = q->dep[0].next; /* get a free slot */ if (x >= SFQ_MAX_FLOWS) - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); q->ht[hash] = x; slot = &q->slots[x]; slot->hash = hash; @@ -424,14 +424,14 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (slot->qlen >= q->maxdepth) { congestion_drop: if (!sfq_headdrop(q)) - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); /* We know we have at least one packet in queue */ head = slot_dequeue_head(slot); delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb); sch->qstats.backlog -= delta; slot->backlog -= delta; - qdisc_drop(head, sch); + qdisc_drop(head, sch, to_free); slot_queue_add(slot, skb); return NET_XMIT_CN; diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index c12df84d1078..303355c449ab 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -155,7 +155,8 @@ static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) /* GSO packet is too big, segment it so that tbf can transmit * each segment in time */ -static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) +static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct tbf_sched_data *q = qdisc_priv(sch); struct sk_buff *segs, *nskb; @@ -166,7 +167,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); nb = 0; while (segs) { @@ -174,7 +175,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) segs->next = NULL; qdisc_skb_cb(segs)->pkt_len = segs->len; len += segs->len; - ret = qdisc_enqueue(segs, q->qdisc); + ret = qdisc_enqueue(segs, q->qdisc, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); @@ -190,17 +191,18 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; } -static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct tbf_sched_data *q = qdisc_priv(sch); int ret; if (qdisc_pkt_len(skb) > q->max_size) { if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size) - return tbf_segment(skb, sch); - return qdisc_drop(skb, sch); + return tbf_segment(skb, sch, to_free); + return qdisc_drop(skb, sch, to_free); } - ret = qdisc_enqueue(skb, q->qdisc); + ret = qdisc_enqueue(skb, q->qdisc, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index e02687185a59..2cd9b4478b92 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -77,7 +77,7 @@ struct teql_sched_data { /* "teql*" qdisc routines */ static int -teql_enqueue(struct sk_buff *skb, struct Qdisc *sch) +teql_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct net_device *dev = qdisc_dev(sch); struct teql_sched_data *q = qdisc_priv(sch); @@ -87,7 +87,7 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_SUCCESS; } - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); } static struct sk_buff * -- cgit v1.2.3 From 008830bc321c0fc22c0db8d5b0b56f854ed90a5c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Jun 2016 23:16:50 -0700 Subject: net_sched: fq_codel: cache skb->truesize into skb->cb Now we defer skb drops, it makes sense to keep a copy of skb->truesize in struct codel_skb_cb to avoid one cache line miss per dropped skb in fq_codel_drop(), to reduce latencies a bit further. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_fq_codel.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index f715195459c9..a5ea0e9b6be4 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -172,7 +172,7 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets, do { skb = dequeue_head(flow); len += qdisc_pkt_len(skb); - mem += skb->truesize; + mem += get_codel_cb(skb)->mem_usage; __qdisc_drop(skb, to_free); } while (++i < max_packets && len < threshold); @@ -216,7 +216,8 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch, flow->deficit = q->quantum; flow->dropped = 0; } - q->memory_usage += skb->truesize; + get_codel_cb(skb)->mem_usage = skb->truesize; + q->memory_usage += get_codel_cb(skb)->mem_usage; memory_limited = q->memory_usage > q->memory_limit; if (++sch->q.qlen <= sch->limit && !memory_limited) return NET_XMIT_SUCCESS; @@ -267,7 +268,7 @@ static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx) if (flow->head) { skb = dequeue_head(flow); q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb); - q->memory_usage -= skb->truesize; + q->memory_usage -= get_codel_cb(skb)->mem_usage; sch->q.qlen--; sch->qstats.backlog -= qdisc_pkt_len(skb); } -- cgit v1.2.3 From 338ed9b4de57c4b7965b96364593e27de0d89582 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Jun 2016 23:16:51 -0700 Subject: net_sched: sch_htb: export class backlog in dumps We already get child qdisc qlen, we also can get its backlog so that class dumps can report it. Also replace qstats by a single drop counter, but move it in a separate cache line so that drops do not dirty useful cache lines. Tested: $ tc -s cl sh dev eth0 class htb 1:1 root leaf 3: prio 0 rate 1Gbit ceil 1Gbit burst 500000b cburst 500000b Sent 2183346912 bytes 9021815 pkt (dropped 2340774, overlimits 0 requeues 0) rate 1001Mbit 517543pps backlog 120758b 499p requeues 0 lended: 9021770 borrowed: 0 giants: 0 tokens: 9 ctokens: 9 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index f3882259c385..ba098f2654b4 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -117,7 +117,6 @@ struct htb_class { * Written often fields */ struct gnet_stats_basic_packed bstats; - struct gnet_stats_queue qstats; struct tc_htb_xstats xstats; /* our special stats */ /* token bucket parameters */ @@ -140,6 +139,8 @@ struct htb_class { enum htb_cmode cmode; /* current mode of the class */ struct rb_node pq_node; /* node for event queue */ struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */ + + unsigned int drops ____cacheline_aligned_in_smp; }; struct htb_level { @@ -595,7 +596,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, to_free)) != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) { qdisc_qstats_drop(sch); - cl->qstats.drops++; + cl->drops++; } return ret; } else { @@ -1110,17 +1111,22 @@ static int htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) { struct htb_class *cl = (struct htb_class *)arg; + struct gnet_stats_queue qs = { + .drops = cl->drops, + }; __u32 qlen = 0; - if (!cl->level && cl->un.leaf.q) + if (!cl->level && cl->un.leaf.q) { qlen = cl->un.leaf.q->q.qlen; + qs.backlog = cl->un.leaf.q->qstats.backlog; + } cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens); cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens); if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) + gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0) return -1; return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); -- cgit v1.2.3 From 4d202a0d31b96ab3324b21e7500d9a2da9ef57dd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Jun 2016 23:16:52 -0700 Subject: net_sched: generalize bulk dequeue When qdisc bulk dequeue was added in linux-3.18 (commit 5772e9a3463b "qdisc: bulk dequeue support for qdiscs with TCQ_F_ONETXQUEUE"), it was constrained to some specific qdiscs. With some extra care, we can extend this to all qdiscs, so that typical traffic shaping solutions can benefit from small batches (8 packets in this patch). For example, HTB is often used on some multi queue device. And bonding/team are multi queue devices... Idea is to bulk-dequeue packets mapping to the same transmit queue. This brings between 35 and 80 % performance increase in HTB setup under pressure on a bonding setup : 1) NUMA node contention : 610,000 pps -> 1,110,000 pps 2) No node contention : 1,380,000 pps -> 1,930,000 pps Now we should work to add batches on the enqueue() side ;) Signed-off-by: Eric Dumazet Cc: John Fastabend Cc: Jesper Dangaard Brouer Cc: Hannes Frederic Sowa Cc: Florian Westphal Cc: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 68 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index ff86606954f2..e95b67cd5718 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -77,6 +77,34 @@ static void try_bulk_dequeue_skb(struct Qdisc *q, skb->next = NULL; } +/* This variant of try_bulk_dequeue_skb() makes sure + * all skbs in the chain are for the same txq + */ +static void try_bulk_dequeue_skb_slow(struct Qdisc *q, + struct sk_buff *skb, + int *packets) +{ + int mapping = skb_get_queue_mapping(skb); + struct sk_buff *nskb; + int cnt = 0; + + do { + nskb = q->dequeue(q); + if (!nskb) + break; + if (unlikely(skb_get_queue_mapping(nskb) != mapping)) { + q->skb_bad_txq = nskb; + qdisc_qstats_backlog_inc(q, nskb); + q->q.qlen++; + break; + } + skb->next = nskb; + skb = nskb; + } while (++cnt < 8); + (*packets) += cnt; + skb->next = NULL; +} + /* Note that dequeue_skb can possibly return a SKB list (via skb->next). * A requeued skb (via q->gso_skb) can also be a SKB list. */ @@ -87,8 +115,9 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, const struct netdev_queue *txq = q->dev_queue; *packets = 1; - *validate = true; if (unlikely(skb)) { + /* skb in gso_skb were already validated */ + *validate = false; /* check the reason of requeuing without tx lock first */ txq = skb_get_tx_queue(txq->dev, skb); if (!netif_xmit_frozen_or_stopped(txq)) { @@ -97,15 +126,30 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, q->q.qlen--; } else skb = NULL; - /* skb in gso_skb were already validated */ - *validate = false; - } else { - if (!(q->flags & TCQ_F_ONETXQUEUE) || - !netif_xmit_frozen_or_stopped(txq)) { - skb = q->dequeue(q); - if (skb && qdisc_may_bulk(q)) - try_bulk_dequeue_skb(q, skb, txq, packets); + return skb; + } + *validate = true; + skb = q->skb_bad_txq; + if (unlikely(skb)) { + /* check the reason of requeuing without tx lock first */ + txq = skb_get_tx_queue(txq->dev, skb); + if (!netif_xmit_frozen_or_stopped(txq)) { + q->skb_bad_txq = NULL; + qdisc_qstats_backlog_dec(q, skb); + q->q.qlen--; + goto bulk; } + return NULL; + } + if (!(q->flags & TCQ_F_ONETXQUEUE) || + !netif_xmit_frozen_or_stopped(txq)) + skb = q->dequeue(q); + if (skb) { +bulk: + if (qdisc_may_bulk(q)) + try_bulk_dequeue_skb(q, skb, txq, packets); + else + try_bulk_dequeue_skb_slow(q, skb, packets); } return skb; } @@ -624,11 +668,14 @@ void qdisc_reset(struct Qdisc *qdisc) if (ops->reset) ops->reset(qdisc); + kfree_skb(qdisc->skb_bad_txq); + qdisc->skb_bad_txq = NULL; + if (qdisc->gso_skb) { kfree_skb_list(qdisc->gso_skb); qdisc->gso_skb = NULL; - qdisc->q.qlen = 0; } + qdisc->q.qlen = 0; } EXPORT_SYMBOL(qdisc_reset); @@ -667,6 +714,7 @@ void qdisc_destroy(struct Qdisc *qdisc) dev_put(qdisc_dev(qdisc)); kfree_skb_list(qdisc->gso_skb); + kfree_skb(qdisc->skb_bad_txq); /* * gen_estimator est_timer() might access qdisc->q.lock, * wait a RCU grace period before freeing qdisc. -- cgit v1.2.3 From 810bf11033637a2069952afb9c37f3afd3bbfe41 Mon Sep 17 00:00:00 2001 From: Amitoj Kaur Chawla Date: Thu, 23 Jun 2016 10:19:37 +0530 Subject: tipc: Use kmemdup instead of kmalloc and memcpy Replace calls to kmalloc followed by a memcpy with a direct call to kmemdup. The Coccinelle semantic patch used to make this change is as follows: @@ expression from,to,size,flag; statement S; @@ - to = \(kmalloc\|kzalloc\)(size,flag); + to = kmemdup(from,size,flag); if (to==NULL || ...) S - memcpy(to, from, size); Signed-off-by: Amitoj Kaur Chawla Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/server.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/server.c b/net/tipc/server.c index 272d20a795d5..215849ce453d 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -418,13 +418,12 @@ static struct outqueue_entry *tipc_alloc_entry(void *data, int len) if (!entry) return NULL; - buf = kmalloc(len, GFP_ATOMIC); + buf = kmemdup(data, len, GFP_ATOMIC); if (!buf) { kfree(entry); return NULL; } - memcpy(buf, data, len); entry->iov.iov_base = buf; entry->iov.iov_len = len; -- cgit v1.2.3 From 637c841dd7a5f9bd97b75cbe90b526fa1a52e530 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 23 Jun 2016 18:42:51 -0700 Subject: net: diag: Add support to filter on device index Add support to inet_diag facility to filter sockets based on device index. If an interface index is in the filter only sockets bound to that index (sk_bound_dev_if) are returned. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'net') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 25af1243649b..38c2c47fe0e8 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -44,6 +44,7 @@ struct inet_diag_entry { u16 dport; u16 family; u16 userlocks; + u32 ifindex; }; static DEFINE_MUTEX(inet_diag_table_mutex); @@ -571,6 +572,14 @@ static int inet_diag_bc_run(const struct nlattr *_bc, yes = 0; break; } + case INET_DIAG_BC_DEV_COND: { + u32 ifindex; + + ifindex = *((const u32 *)(op + 1)); + if (ifindex != entry->ifindex) + yes = 0; + break; + } } if (yes) { @@ -613,6 +622,7 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) entry_fill_addrs(&entry, sk); entry.sport = inet->inet_num; entry.dport = ntohs(inet->inet_dport); + entry.ifindex = sk->sk_bound_dev_if; entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0; return inet_diag_bc_run(bc, &entry); @@ -636,6 +646,17 @@ static int valid_cc(const void *bc, int len, int cc) return 0; } +/* data is u32 ifindex */ +static bool valid_devcond(const struct inet_diag_bc_op *op, int len, + int *min_len) +{ + /* Check ifindex space. */ + *min_len += sizeof(u32); + if (len < *min_len) + return false; + + return true; +} /* Validate an inet_diag_hostcond. */ static bool valid_hostcond(const struct inet_diag_bc_op *op, int len, int *min_len) @@ -700,6 +721,10 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) if (!valid_hostcond(bc, len, &min_len)) return -EINVAL; break; + case INET_DIAG_BC_DEV_COND: + if (!valid_devcond(bc, len, &min_len)) + return -EINVAL; + break; case INET_DIAG_BC_S_GE: case INET_DIAG_BC_S_LE: case INET_DIAG_BC_D_GE: -- cgit v1.2.3 From 56e2f23b7225d2e7b42826aee065cbf96834114d Mon Sep 17 00:00:00 2001 From: Amitoj Kaur Chawla Date: Fri, 24 Jun 2016 11:53:54 +0530 Subject: caif: Remove unneeded header file Drop redundant include of moduleparam.h The Coccinelle semantic patch used to make this change is as follows: @ includesmodule @ @@ #include @ depends on includesmodule @ @@ - #include Signed-off-by: Amitoj Kaur Chawla Signed-off-by: David S. Miller --- net/caif/chnl_net.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index 67a4a36febd1..3408ed51b611 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 8a01ed70ebe4ddf37a759e8e9b4e8e71fb26b47c Mon Sep 17 00:00:00 2001 From: Wei Tang Date: Mon, 27 Jun 2016 18:12:46 +0800 Subject: net: the space is required before the open parenthesis '(' The space is missing before the open parenthesis '(', and this will introduce much more noise when checking patch around. Signed-off-by: Wei Tang Signed-off-by: David S. Miller --- net/core/utils.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/utils.c b/net/core/utils.c index 3d17ca8b4744..cf5622b9ccc4 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -133,7 +133,7 @@ int in4_pton(const char *src, int srclen, s = src; d = dbuf; i = 0; - while(1) { + while (1) { int c; c = xdigit2bin(srclen > 0 ? *s : '\0', delim); if (!(c & (IN6PTON_DIGIT | IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK))) { @@ -283,11 +283,11 @@ cont: i = 15; d--; if (dc) { - while(d >= dc) + while (d >= dc) dst[i--] = *d--; - while(i >= dc - dbuf) + while (i >= dc - dbuf) dst[i--] = 0; - while(i >= 0) + while (i >= 0) dst[i--] = *d--; } else memcpy(dst, dbuf, sizeof(dbuf)); -- cgit v1.2.3 From e99429232e3622a7e390c3b540c4971b1ccf75c8 Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Mon, 27 Jun 2016 13:34:06 +0200 Subject: tipc: honor msg2addr return value The UDP msg2addr function tipc_udp_msg2addr() can return -EINVAL which prior to this patch was unhanded in the caller. Signed-off-by: Richard Alpe Acked-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/discover.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/discover.c b/net/tipc/discover.c index ad9d477cc242..6b109a808d4c 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -135,9 +135,12 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, u16 caps = msg_node_capabilities(hdr); bool respond = false; bool dupl_addr = false; + int err; - bearer->media->msg2addr(bearer, &maddr, msg_media_addr(hdr)); + err = bearer->media->msg2addr(bearer, &maddr, msg_media_addr(hdr)); kfree_skb(skb); + if (err) + return; /* Ensure message from node is valid and communication is permitted */ if (net_id != tn->net_id) -- cgit v1.2.3 From bc3a334cc2c49779c90d7057c42c4537cd36256f Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Mon, 27 Jun 2016 13:34:07 +0200 Subject: tipc: rename udp_port in struct udp_media_addr Context implies that port in struct "udp_media_addr" is referring to a UDP port. Signed-off-by: Richard Alpe Acked-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/udp_media.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index c9cf2be3674a..b016c011970b 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -63,7 +63,7 @@ */ struct udp_media_addr { __be16 proto; - __be16 udp_port; + __be16 port; union { struct in_addr ipv4; struct in6_addr ipv6; @@ -108,9 +108,9 @@ static int tipc_udp_addr2str(struct tipc_media_addr *a, char *buf, int size) struct udp_media_addr *ua = (struct udp_media_addr *)&a->value; if (ntohs(ua->proto) == ETH_P_IP) - snprintf(buf, size, "%pI4:%u", &ua->ipv4, ntohs(ua->udp_port)); + snprintf(buf, size, "%pI4:%u", &ua->ipv4, ntohs(ua->port)); else if (ntohs(ua->proto) == ETH_P_IPV6) - snprintf(buf, size, "%pI6:%u", &ua->ipv6, ntohs(ua->udp_port)); + snprintf(buf, size, "%pI6:%u", &ua->ipv6, ntohs(ua->port)); else pr_err("Invalid UDP media address\n"); return 0; @@ -178,8 +178,8 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, skb->dev = rt->dst.dev; ttl = ip4_dst_hoplimit(&rt->dst); udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb, src->ipv4.s_addr, - dst->ipv4.s_addr, 0, ttl, 0, src->udp_port, - dst->udp_port, false, true); + dst->ipv4.s_addr, 0, ttl, 0, src->port, + dst->port, false, true); #if IS_ENABLED(CONFIG_IPV6) } else { struct dst_entry *ndst; @@ -196,8 +196,8 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, ttl = ip6_dst_hoplimit(ndst); err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, ndst->dev, &src->ipv6, - &dst->ipv6, 0, ttl, 0, src->udp_port, - dst->udp_port, false); + &dst->ipv6, 0, ttl, 0, src->port, + dst->port, false); #endif } return err; @@ -292,12 +292,12 @@ err: ip4 = (struct sockaddr_in *)&sa_local; local->proto = htons(ETH_P_IP); - local->udp_port = ip4->sin_port; + local->port = ip4->sin_port; local->ipv4.s_addr = ip4->sin_addr.s_addr; ip4 = (struct sockaddr_in *)&sa_remote; remote->proto = htons(ETH_P_IP); - remote->udp_port = ip4->sin_port; + remote->port = ip4->sin_port; remote->ipv4.s_addr = ip4->sin_addr.s_addr; return 0; @@ -312,13 +312,13 @@ err: return -EINVAL; local->proto = htons(ETH_P_IPV6); - local->udp_port = ip6->sin6_port; + local->port = ip6->sin6_port; memcpy(&local->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr)); ub->ifindex = ip6->sin6_scope_id; ip6 = (struct sockaddr_in6 *)&sa_remote; remote->proto = htons(ETH_P_IPV6); - remote->udp_port = ip6->sin6_port; + remote->port = ip6->sin6_port; memcpy(&remote->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr)); return 0; #endif @@ -386,7 +386,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, err = -EAFNOSUPPORT; goto err; } - udp_conf.local_udp_port = local.udp_port; + udp_conf.local_udp_port = local.port; err = udp_sock_create(net, &udp_conf, &ub->ubsock); if (err) goto err; -- cgit v1.2.3 From 8a6e9c670341db1ee913e3888cb44a08f18e7489 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Jun 2016 10:30:08 +0200 Subject: net_sched: netem: do not call qdisc_drop() with a NULL skb If skb_unshare() fails, we call qdisc_drop() with a NULL skb, which is no longer supported. Fixes: 520ac30f4551 ("net_sched: drop packets after root qdisc lock is released") Signed-off-by: Eric Dumazet Reported-by: Dan Carpenter Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index ccca8ca4c722..6eac3d880048 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -487,10 +487,14 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, skb = segs; segs = segs->next; - if (!(skb = skb_unshare(skb, GFP_ATOMIC)) || - (skb->ip_summed == CHECKSUM_PARTIAL && - skb_checksum_help(skb))) { - rc = qdisc_drop(skb, sch, to_free); + skb = skb_unshare(skb, GFP_ATOMIC); + if (unlikely(!skb)) { + qdisc_qstats_drop(sch); + goto finish_segs; + } + if (skb->ip_summed == CHECKSUM_PARTIAL && + skb_checksum_help(skb)) { + qdisc_drop(skb, sch, to_free); goto finish_segs; } -- cgit v1.2.3 From 7bed2ab8c62aed4e9ba55373d2a81faf481c3041 Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Thu, 9 Jun 2016 15:46:49 +0200 Subject: batman-adv: Start new development cycle Signed-off-by: Simon Wunderlich Signed-off-by: Sven Eckelmann --- net/batman-adv/main.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 76925266deed..c356d91ed780 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -24,7 +24,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2016.2" +#define BATADV_SOURCE_VERSION "2016.3" #endif /* B.A.T.M.A.N. parameters */ -- cgit v1.2.3 From 92d2b1a5b35e05a5c4506cd73a6a5dc5f8394b60 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Wed, 25 May 2016 23:27:33 +0800 Subject: batman-adv: statically print gateway table header To make it easier to search through the code it is better to print static strings directly instead of using format strings printing constants. This was addressed in a previous patch, but the Gateway table header was not updated accordingly. Signed-off-by: Antonio Quartulli Reviewed-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/gateway_client.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 5839c569f769..b9d72e9536b6 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -638,8 +638,7 @@ int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset) goto out; seq_printf(seq, - " %-12s (%s/%i) %17s [%10s]: advertised uplink bandwidth ... [B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s)]\n", - "Gateway", "#", BATADV_TQ_MAX_VALUE, "Nexthop", "outgoingIF", + " Gateway (#/255) Nexthop [outgoingIF]: advertised uplink bandwidth ... [B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s)]\n", BATADV_SOURCE_VERSION, primary_if->net_dev->name, primary_if->net_dev->dev_addr, net_dev->name); -- cgit v1.2.3 From 118dc950fc2156de9c2f1d31be54aa37f2d3d116 Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Fri, 20 May 2016 20:42:19 +0200 Subject: batman-adv: remove unused vid local variable in tt seq print Signed-off-by: Simon Wunderlich Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner --- net/batman-adv/translation-table.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index feaf492b01ca..87bb2030186d 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -995,7 +995,6 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) struct batadv_tt_local_entry *tt_local; struct batadv_hard_iface *primary_if; struct hlist_head *head; - unsigned short vid; u32 i; int last_seen_secs; int last_seen_msecs; @@ -1022,7 +1021,6 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) tt_local = container_of(tt_common_entry, struct batadv_tt_local_entry, common); - vid = tt_common_entry->vid; last_seen_jiffies = jiffies - tt_local->last_seen; last_seen_msecs = jiffies_to_msecs(last_seen_jiffies); last_seen_secs = last_seen_msecs / 1000; -- cgit v1.2.3 From 6f0a6b5ee84fee29abd634e69ea67d6cc87817a5 Mon Sep 17 00:00:00 2001 From: Marek Lindner Date: Tue, 3 May 2016 01:52:08 +0800 Subject: batman-adv: refactor batadv_neigh_node_* functions to follow common style Signed-off-by: Marek Lindner Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_iv_ogm.c | 3 ++- net/batman-adv/bat_v_elp.c | 3 ++- net/batman-adv/bat_v_ogm.c | 4 ++-- net/batman-adv/originator.c | 35 +++++++++++++++++++++++++++++------ net/batman-adv/originator.h | 6 +++--- 5 files changed, 38 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index ce2f203048d3..aa11296e6bf8 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -336,7 +336,8 @@ batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, { struct batadv_neigh_node *neigh_node; - neigh_node = batadv_neigh_node_new(orig_node, hard_iface, neigh_addr); + neigh_node = batadv_neigh_node_get_or_create(orig_node, + hard_iface, neigh_addr); if (!neigh_node) goto out; diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index df42eb1365a0..8909d1eb9622 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -443,7 +443,8 @@ static void batadv_v_elp_neigh_update(struct batadv_priv *bat_priv, if (!orig_neigh) return; - neigh = batadv_neigh_node_new(orig_neigh, if_incoming, neigh_addr); + neigh = batadv_neigh_node_get_or_create(orig_neigh, + if_incoming, neigh_addr); if (!neigh) goto orig_free; diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 473ebb9a0e73..23ea9bfb9f67 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -683,8 +683,8 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset, if (!orig_node) return; - neigh_node = batadv_neigh_node_new(orig_node, if_incoming, - ethhdr->h_source); + neigh_node = batadv_neigh_node_get_or_create(orig_node, if_incoming, + ethhdr->h_source); if (!neigh_node) goto out; diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 7f51bc2c06eb..b0bad5726557 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -602,19 +602,19 @@ batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, } /** - * batadv_neigh_node_new - create and init a new neigh_node object + * batadv_neigh_node_create - create a neigh node object * @orig_node: originator object representing the neighbour * @hard_iface: the interface where the neighbour is connected to * @neigh_addr: the mac address of the neighbour interface * * Allocates a new neigh_node object and initialises all the generic fields. * - * Return: neighbor when found. Othwerwise NULL + * Return: the neighbour node if found or created or NULL otherwise. */ -struct batadv_neigh_node * -batadv_neigh_node_new(struct batadv_orig_node *orig_node, - struct batadv_hard_iface *hard_iface, - const u8 *neigh_addr) +static struct batadv_neigh_node * +batadv_neigh_node_create(struct batadv_orig_node *orig_node, + struct batadv_hard_iface *hard_iface, + const u8 *neigh_addr) { struct batadv_neigh_node *neigh_node; struct batadv_hardif_neigh_node *hardif_neigh = NULL; @@ -666,6 +666,29 @@ out: return neigh_node; } +/** + * batadv_neigh_node_get_or_create - retrieve or create a neigh node object + * @orig_node: originator object representing the neighbour + * @hard_iface: the interface where the neighbour is connected to + * @neigh_addr: the mac address of the neighbour interface + * + * Return: the neighbour node if found or created or NULL otherwise. + */ +struct batadv_neigh_node * +batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node, + struct batadv_hard_iface *hard_iface, + const u8 *neigh_addr) +{ + struct batadv_neigh_node *neigh_node = NULL; + + /* first check without locking to avoid the overhead */ + neigh_node = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr); + if (neigh_node) + return neigh_node; + + return batadv_neigh_node_create(orig_node, hard_iface, neigh_addr); +} + /** * batadv_hardif_neigh_seq_print_text - print the single hop neighbour list * @seq: neighbour table seq_file struct diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 64a8951e5844..566306bf05dc 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -46,9 +46,9 @@ batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, void batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh); struct batadv_neigh_node * -batadv_neigh_node_new(struct batadv_orig_node *orig_node, - struct batadv_hard_iface *hard_iface, - const u8 *neigh_addr); +batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node, + struct batadv_hard_iface *hard_iface, + const u8 *neigh_addr); void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node); struct batadv_neigh_node * batadv_orig_router_get(struct batadv_orig_node *orig_node, -- cgit v1.2.3 From d9f179877e50ae2681fe7b0b83e0d9f63b6165ad Mon Sep 17 00:00:00 2001 From: Marek Lindner Date: Mon, 2 May 2016 21:58:50 +0800 Subject: batman-adv: remove unused callback from batadv_algo_ops struct Signed-off-by: Marek Lindner Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/originator.c | 5 ----- net/batman-adv/types.h | 3 --- 2 files changed, 8 deletions(-) (limited to 'net') diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index b0bad5726557..076d258c92e1 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -251,10 +251,8 @@ static void batadv_neigh_node_release(struct kref *ref) struct hlist_node *node_tmp; struct batadv_neigh_node *neigh_node; struct batadv_neigh_ifinfo *neigh_ifinfo; - struct batadv_algo_ops *bao; neigh_node = container_of(ref, struct batadv_neigh_node, refcount); - bao = neigh_node->orig_node->bat_priv->bat_algo_ops; hlist_for_each_entry_safe(neigh_ifinfo, node_tmp, &neigh_node->ifinfo_list, list) { @@ -263,9 +261,6 @@ static void batadv_neigh_node_release(struct kref *ref) batadv_hardif_neigh_put(neigh_node->hardif_neigh); - if (bao->bat_neigh_free) - bao->bat_neigh_free(neigh_node); - batadv_hardif_put(neigh_node->if_incoming); kfree_rcu(neigh_node, rcu); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 6a577f4f8ba7..114d1509946b 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1278,8 +1278,6 @@ struct batadv_forw_packet { * better than neigh2 for their respective outgoing interface from the metric * prospective * @bat_neigh_print: print the single hop neighbor list (optional) - * @bat_neigh_free: free the resources allocated by the routing algorithm for a - * neigh_node object * @bat_orig_print: print the originator table (optional) * @bat_orig_free: free the resources allocated by the routing algorithm for an * orig_node object @@ -1310,7 +1308,6 @@ struct batadv_algo_ops { struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2); void (*bat_neigh_print)(struct batadv_priv *priv, struct seq_file *seq); - void (*bat_neigh_free)(struct batadv_neigh_node *neigh); /* orig_node handling API */ void (*bat_orig_print)(struct batadv_priv *priv, struct seq_file *seq, struct batadv_hard_iface *hard_iface); -- cgit v1.2.3 From f0d97253fb5fe87a7a91e7dc1ba4becf9d89d896 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 3 May 2016 01:45:34 +0800 Subject: batman-adv: remove ogm_emit and ogm_schedule API calls The ogm_emit and ogm_schedule API calls were rather tight to the B.A.T.M.A.N. IV logic and therefore rather difficult to use with other algorithm implementations. Remove such calls and move the surrounding logic into the B.A.T.M.A.N. IV specific code. Signed-off-by: Antonio Quartulli Signed-off-by: Marek Lindner Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_iv_ogm.c | 69 ++++++++++++++++++++++++++++++++++++++--- net/batman-adv/bat_v.c | 10 ------ net/batman-adv/hard-interface.c | 3 -- net/batman-adv/main.c | 2 -- net/batman-adv/send.c | 61 +----------------------------------- net/batman-adv/send.h | 4 +-- net/batman-adv/types.h | 4 --- 7 files changed, 67 insertions(+), 86 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index aa11296e6bf8..4815db978c27 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,8 @@ #include "send.h" #include "translation-table.h" +static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work); + /** * enum batadv_dup_status - duplicate status * @BATADV_NO_DUP: the packet is no duplicate @@ -731,7 +734,7 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, /* start timer for this packet */ INIT_DELAYED_WORK(&forw_packet_aggr->delayed_work, - batadv_send_outstanding_bat_ogm_packet); + batadv_iv_send_outstanding_bat_ogm_packet); queue_delayed_work(batadv_event_workqueue, &forw_packet_aggr->delayed_work, send_time - jiffies); @@ -938,6 +941,19 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) u16 tvlv_len = 0; unsigned long send_time; + if ((hard_iface->if_status == BATADV_IF_NOT_IN_USE) || + (hard_iface->if_status == BATADV_IF_TO_BE_REMOVED)) + return; + + /* the interface gets activated here to avoid race conditions between + * the moment of activating the interface in + * hardif_activate_interface() where the originator mac is set and + * outdated packets (especially uninitialized mac addresses) in the + * packet queue + */ + if (hard_iface->if_status == BATADV_IF_TO_BE_ACTIVATED) + hard_iface->if_status = BATADV_IF_ACTIVE; + primary_if = batadv_primary_if_get_selected(bat_priv); if (hard_iface == primary_if) { @@ -1779,6 +1795,45 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, batadv_orig_node_put(orig_node); } +static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work) +{ + struct delayed_work *delayed_work; + struct batadv_forw_packet *forw_packet; + struct batadv_priv *bat_priv; + + delayed_work = to_delayed_work(work); + forw_packet = container_of(delayed_work, struct batadv_forw_packet, + delayed_work); + bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface); + spin_lock_bh(&bat_priv->forw_bat_list_lock); + hlist_del(&forw_packet->list); + spin_unlock_bh(&bat_priv->forw_bat_list_lock); + + if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) + goto out; + + batadv_iv_ogm_emit(forw_packet); + + /* we have to have at least one packet in the queue to determine the + * queues wake up time unless we are shutting down. + * + * only re-schedule if this is the "original" copy, e.g. the OGM of the + * primary interface should only be rescheduled once per period, but + * this function will be called for the forw_packet instances of the + * other secondary interfaces as well. + */ + if (forw_packet->own && + forw_packet->if_incoming == forw_packet->if_outgoing) + batadv_iv_ogm_schedule(forw_packet->if_incoming); + +out: + /* don't count own packet */ + if (!forw_packet->own) + atomic_inc(&bat_priv->batman_queue_left); + + batadv_forw_packet_free(forw_packet); +} + static int batadv_iv_ogm_receive(struct sk_buff *skb, struct batadv_hard_iface *if_incoming) { @@ -1795,7 +1850,8 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb, /* did we receive a B.A.T.M.A.N. IV OGM packet on an interface * that does not have B.A.T.M.A.N. IV enabled ? */ - if (bat_priv->bat_algo_ops->bat_ogm_emit != batadv_iv_ogm_emit) + if (bat_priv->bat_algo_ops->bat_iface_enable != + batadv_iv_ogm_iface_enable) return NET_RX_DROP; batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX); @@ -2053,14 +2109,19 @@ out: return ret; } +static void batadv_iv_iface_activate(struct batadv_hard_iface *hard_iface) +{ + /* begin scheduling originator messages on that interface */ + batadv_iv_ogm_schedule(hard_iface); +} + static struct batadv_algo_ops batadv_batman_iv __read_mostly = { .name = "BATMAN_IV", + .bat_iface_activate = batadv_iv_iface_activate, .bat_iface_enable = batadv_iv_ogm_iface_enable, .bat_iface_disable = batadv_iv_ogm_iface_disable, .bat_iface_update_mac = batadv_iv_ogm_iface_update_mac, .bat_primary_iface_set = batadv_iv_ogm_primary_iface_set, - .bat_ogm_schedule = batadv_iv_ogm_schedule, - .bat_ogm_emit = batadv_iv_ogm_emit, .bat_neigh_cmp = batadv_iv_ogm_neigh_cmp, .bat_neigh_is_similar_or_better = batadv_iv_ogm_neigh_is_sob, .bat_neigh_print = batadv_iv_neigh_print, diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 0a12e5cdd65d..c16cd44a3b4c 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -119,14 +119,6 @@ batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh) batadv_v_elp_throughput_metric_update); } -static void batadv_v_ogm_schedule(struct batadv_hard_iface *hard_iface) -{ -} - -static void batadv_v_ogm_emit(struct batadv_forw_packet *forw_packet) -{ -} - /** * batadv_v_orig_print_neigh - print neighbors for the originator table * @orig_node: the orig_node for which the neighbors are printed @@ -340,8 +332,6 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = { .bat_iface_update_mac = batadv_v_iface_update_mac, .bat_primary_iface_set = batadv_v_primary_iface_set, .bat_hardif_neigh_init = batadv_v_hardif_neigh_init, - .bat_ogm_emit = batadv_v_ogm_emit, - .bat_ogm_schedule = batadv_v_ogm_schedule, .bat_orig_print = batadv_v_orig_print, .bat_neigh_cmp = batadv_v_neigh_cmp, .bat_neigh_is_similar_or_better = batadv_v_neigh_is_sob, diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 8c2f39962fa5..db2009d84a25 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -553,9 +553,6 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, batadv_hardif_recalc_extra_skbroom(soft_iface); - /* begin scheduling originator messages on that interface */ - batadv_schedule_bat_ogm(hard_iface); - out: return 0; diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 5f2974bd1227..627d14ececaf 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -569,8 +569,6 @@ int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops) !bat_algo_ops->bat_iface_disable || !bat_algo_ops->bat_iface_update_mac || !bat_algo_ops->bat_primary_iface_set || - !bat_algo_ops->bat_ogm_schedule || - !bat_algo_ops->bat_ogm_emit || !bat_algo_ops->bat_neigh_cmp || !bat_algo_ops->bat_neigh_is_similar_or_better) { pr_info("Routing algo '%s' does not implement required ops\n", diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index b1a4e8a811c8..59e695b5cfbd 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -428,27 +428,7 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, orig_node, vid); } -void batadv_schedule_bat_ogm(struct batadv_hard_iface *hard_iface) -{ - struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); - - if ((hard_iface->if_status == BATADV_IF_NOT_IN_USE) || - (hard_iface->if_status == BATADV_IF_TO_BE_REMOVED)) - return; - - /* the interface gets activated here to avoid race conditions between - * the moment of activating the interface in - * hardif_activate_interface() where the originator mac is set and - * outdated packets (especially uninitialized mac addresses) in the - * packet queue - */ - if (hard_iface->if_status == BATADV_IF_TO_BE_ACTIVATED) - hard_iface->if_status = BATADV_IF_ACTIVE; - - bat_priv->bat_algo_ops->bat_ogm_schedule(hard_iface); -} - -static void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet) +void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet) { kfree_skb(forw_packet->skb); if (forw_packet->if_incoming) @@ -604,45 +584,6 @@ out: atomic_inc(&bat_priv->bcast_queue_left); } -void batadv_send_outstanding_bat_ogm_packet(struct work_struct *work) -{ - struct delayed_work *delayed_work; - struct batadv_forw_packet *forw_packet; - struct batadv_priv *bat_priv; - - delayed_work = to_delayed_work(work); - forw_packet = container_of(delayed_work, struct batadv_forw_packet, - delayed_work); - bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface); - spin_lock_bh(&bat_priv->forw_bat_list_lock); - hlist_del(&forw_packet->list); - spin_unlock_bh(&bat_priv->forw_bat_list_lock); - - if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) - goto out; - - bat_priv->bat_algo_ops->bat_ogm_emit(forw_packet); - - /* we have to have at least one packet in the queue to determine the - * queues wake up time unless we are shutting down. - * - * only re-schedule if this is the "original" copy, e.g. the OGM of the - * primary interface should only be rescheduled once per period, but - * this function will be called for the forw_packet instances of the - * other secondary interfaces as well. - */ - if (forw_packet->own && - forw_packet->if_incoming == forw_packet->if_outgoing) - batadv_schedule_bat_ogm(forw_packet->if_incoming); - -out: - /* don't count own packet */ - if (!forw_packet->own) - atomic_inc(&bat_priv->batman_queue_left); - - batadv_forw_packet_free(forw_packet); -} - void batadv_purge_outstanding_packets(struct batadv_priv *bat_priv, const struct batadv_hard_iface *hard_iface) diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 6fd7270d8ce6..7cecb7563b45 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -26,8 +26,8 @@ #include "packet.h" struct sk_buff; -struct work_struct; +void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet); int batadv_send_skb_to_orig(struct sk_buff *skb, struct batadv_orig_node *orig_node, struct batadv_hard_iface *recv_if); @@ -38,11 +38,9 @@ int batadv_send_broadcast_skb(struct sk_buff *skb, struct batadv_hard_iface *hard_iface); int batadv_send_unicast_skb(struct sk_buff *skb, struct batadv_neigh_node *neigh_node); -void batadv_schedule_bat_ogm(struct batadv_hard_iface *hard_iface); int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, const struct sk_buff *skb, unsigned long delay); -void batadv_send_outstanding_bat_ogm_packet(struct work_struct *work); void batadv_purge_outstanding_packets(struct batadv_priv *bat_priv, const struct batadv_hard_iface *hard_iface); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 114d1509946b..b70b6ae5edae 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1269,8 +1269,6 @@ struct batadv_forw_packet { * @bat_iface_update_mac: (re-)init mac addresses of the protocol information * belonging to this hard-interface * @bat_primary_iface_set: called when primary interface is selected / changed - * @bat_ogm_schedule: prepare a new outgoing OGM for the send queue - * @bat_ogm_emit: send scheduled OGM * @bat_hardif_neigh_init: called on creation of single hop entry * @bat_neigh_cmp: compare the metrics of two neighbors for their respective * outgoing interfaces @@ -1294,8 +1292,6 @@ struct batadv_algo_ops { void (*bat_iface_disable)(struct batadv_hard_iface *hard_iface); void (*bat_iface_update_mac)(struct batadv_hard_iface *hard_iface); void (*bat_primary_iface_set)(struct batadv_hard_iface *hard_iface); - void (*bat_ogm_schedule)(struct batadv_hard_iface *hard_iface); - void (*bat_ogm_emit)(struct batadv_forw_packet *forw_packet); /* neigh_node handling API */ void (*bat_hardif_neigh_init)(struct batadv_hardif_neigh_node *neigh); int (*bat_neigh_cmp)(struct batadv_neigh_node *neigh1, -- cgit v1.2.3 From c149ca72e58ac511ad8bf71df833efa9764115a4 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Fri, 6 May 2016 02:46:37 +0800 Subject: batman-adv: remove useless inline attribute for sysfs helper function the compiler can optimize functions within the same C file and therefore there is no need to make it explicit. Remove the useless inline attribute for __batadv_store_uint_attr() Signed-off-by: Antonio Quartulli Signed-off-by: Marek Lindner Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/sysfs.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index 414b2074165f..ef5832f4aaba 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -389,12 +389,12 @@ static int batadv_store_uint_attr(const char *buff, size_t count, return count; } -static inline ssize_t -__batadv_store_uint_attr(const char *buff, size_t count, - int min, int max, - void (*post_func)(struct net_device *), - const struct attribute *attr, - atomic_t *attr_store, struct net_device *net_dev) +static ssize_t __batadv_store_uint_attr(const char *buff, size_t count, + int min, int max, + void (*post_func)(struct net_device *), + const struct attribute *attr, + atomic_t *attr_store, + struct net_device *net_dev) { int ret; -- cgit v1.2.3 From 3a24a63e74af1bffc7aeb5d83adcd63b37e38425 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Fri, 6 May 2016 02:46:38 +0800 Subject: batman-adv: move GW mode and selection class to private data structure To reduce the field pollution in our main batadv_priv data structure we've already created some substructures so that we could group fields in a convenient manner. However gw_mode and gw_sel_class are still part of the main object. More both fields to the GW private substructure. Signed-off-by: Antonio Quartulli Signed-off-by: Marek Lindner Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/gateway_client.c | 12 ++++++------ net/batman-adv/gateway_common.c | 6 +++--- net/batman-adv/soft-interface.c | 6 +++--- net/batman-adv/sysfs.c | 10 +++++----- net/batman-adv/types.h | 8 ++++---- 5 files changed, 21 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index b9d72e9536b6..18c3715e5e27 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -192,7 +192,7 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) tq_avg = router_ifinfo->bat_iv.tq_avg; - switch (atomic_read(&bat_priv->gw_sel_class)) { + switch (atomic_read(&bat_priv->gw.sel_class)) { case 1: /* fast connection */ tmp_gw_factor = tq_avg * tq_avg; tmp_gw_factor *= gw_node->bandwidth_down; @@ -255,7 +255,7 @@ void batadv_gw_check_client_stop(struct batadv_priv *bat_priv) { struct batadv_gw_node *curr_gw; - if (atomic_read(&bat_priv->gw_mode) != BATADV_GW_MODE_CLIENT) + if (atomic_read(&bat_priv->gw.mode) != BATADV_GW_MODE_CLIENT) return; curr_gw = batadv_gw_get_selected_gw_node(bat_priv); @@ -283,7 +283,7 @@ void batadv_gw_election(struct batadv_priv *bat_priv) struct batadv_neigh_ifinfo *router_ifinfo = NULL; char gw_addr[18] = { '\0' }; - if (atomic_read(&bat_priv->gw_mode) != BATADV_GW_MODE_CLIENT) + if (atomic_read(&bat_priv->gw.mode) != BATADV_GW_MODE_CLIENT) goto out; curr_gw = batadv_gw_get_selected_gw_node(bat_priv); @@ -402,8 +402,8 @@ void batadv_gw_check_election(struct batadv_priv *bat_priv, /* if the routing class is greater than 3 the value tells us how much * greater the TQ value of the new gateway must be */ - if ((atomic_read(&bat_priv->gw_sel_class) > 3) && - (orig_tq_avg - gw_tq_avg < atomic_read(&bat_priv->gw_sel_class))) + if ((atomic_read(&bat_priv->gw.sel_class) > 3) && + (orig_tq_avg - gw_tq_avg < atomic_read(&bat_priv->gw.sel_class))) goto out; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, @@ -820,7 +820,7 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, if (!gw_node) goto out; - switch (atomic_read(&bat_priv->gw_mode)) { + switch (atomic_read(&bat_priv->gw.mode)) { case BATADV_GW_MODE_SERVER: /* If we are a GW then we are our best GW. We can artificially * set the tq towards ourself as the maximum value diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 4423047889e1..3c269457776e 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -144,7 +144,7 @@ void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv) u32 down, up; char gw_mode; - gw_mode = atomic_read(&bat_priv->gw_mode); + gw_mode = atomic_read(&bat_priv->gw.mode); switch (gw_mode) { case BATADV_GW_MODE_OFF: @@ -241,8 +241,8 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, /* restart gateway selection if fast or late switching was enabled */ if ((gateway.bandwidth_down != 0) && - (atomic_read(&bat_priv->gw_mode) == BATADV_GW_MODE_CLIENT) && - (atomic_read(&bat_priv->gw_sel_class) > 2)) + (atomic_read(&bat_priv->gw.mode) == BATADV_GW_MODE_CLIENT) && + (atomic_read(&bat_priv->gw.sel_class) > 2)) batadv_gw_check_election(bat_priv, orig); } diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 343d2c904399..81665b159a41 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -255,7 +255,7 @@ static int batadv_interface_tx(struct sk_buff *skb, if (batadv_compare_eth(ethhdr->h_dest, ectp_addr)) goto dropped; - gw_mode = atomic_read(&bat_priv->gw_mode); + gw_mode = atomic_read(&bat_priv->gw.mode); if (is_multicast_ether_addr(ethhdr->h_dest)) { /* if gw mode is off, broadcast every packet */ if (gw_mode == BATADV_GW_MODE_OFF) { @@ -815,8 +815,8 @@ static int batadv_softif_init_late(struct net_device *dev) atomic_set(&bat_priv->mcast.num_want_all_ipv4, 0); atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0); #endif - atomic_set(&bat_priv->gw_mode, BATADV_GW_MODE_OFF); - atomic_set(&bat_priv->gw_sel_class, 20); + atomic_set(&bat_priv->gw.mode, BATADV_GW_MODE_OFF); + atomic_set(&bat_priv->gw.sel_class, 20); atomic_set(&bat_priv->gw.bandwidth_down, 100); atomic_set(&bat_priv->gw.bandwidth_up, 20); atomic_set(&bat_priv->orig_interval, 1000); diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index ef5832f4aaba..233abcf33c03 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -427,7 +427,7 @@ static ssize_t batadv_show_gw_mode(struct kobject *kobj, struct attribute *attr, struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); int bytes_written; - switch (atomic_read(&bat_priv->gw_mode)) { + switch (atomic_read(&bat_priv->gw.mode)) { case BATADV_GW_MODE_CLIENT: bytes_written = sprintf(buff, "%s\n", BATADV_GW_MODE_CLIENT_NAME); @@ -476,10 +476,10 @@ static ssize_t batadv_store_gw_mode(struct kobject *kobj, return -EINVAL; } - if (atomic_read(&bat_priv->gw_mode) == gw_mode_tmp) + if (atomic_read(&bat_priv->gw.mode) == gw_mode_tmp) return count; - switch (atomic_read(&bat_priv->gw_mode)) { + switch (atomic_read(&bat_priv->gw.mode)) { case BATADV_GW_MODE_CLIENT: curr_gw_mode_str = BATADV_GW_MODE_CLIENT_NAME; break; @@ -508,7 +508,7 @@ static ssize_t batadv_store_gw_mode(struct kobject *kobj, * state */ batadv_gw_check_client_stop(bat_priv); - atomic_set(&bat_priv->gw_mode, (unsigned int)gw_mode_tmp); + atomic_set(&bat_priv->gw.mode, (unsigned int)gw_mode_tmp); batadv_gw_tvlv_container_update(bat_priv); return count; } @@ -624,7 +624,7 @@ BATADV_ATTR_SIF_UINT(orig_interval, orig_interval, S_IRUGO | S_IWUSR, 2 * BATADV_JITTER, INT_MAX, NULL); BATADV_ATTR_SIF_UINT(hop_penalty, hop_penalty, S_IRUGO | S_IWUSR, 0, BATADV_TQ_MAX_VALUE, NULL); -BATADV_ATTR_SIF_UINT(gw_sel_class, gw_sel_class, S_IRUGO | S_IWUSR, 1, +BATADV_ATTR_SIF_UINT(gw_sel_class, gw.sel_class, S_IRUGO | S_IWUSR, 1, BATADV_TQ_MAX_VALUE, batadv_post_gw_reselect); static BATADV_ATTR(gw_bandwidth, S_IRUGO | S_IWUSR, batadv_show_gw_bwidth, batadv_store_gw_bwidth); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index b70b6ae5edae..32c6d0e42fde 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -707,6 +707,8 @@ struct batadv_priv_debug_log { * @list: list of available gateway nodes * @list_lock: lock protecting gw_list & curr_gw * @curr_gw: pointer to currently selected gateway node + * @mode: gateway operation: off, client or server (see batadv_gw_modes) + * @sel_class: gateway selection class (applies if gw_mode client) * @bandwidth_down: advertised uplink download bandwidth (if gw_mode server) * @bandwidth_up: advertised uplink upload bandwidth (if gw_mode server) * @reselect: bool indicating a gateway re-selection is in progress @@ -715,6 +717,8 @@ struct batadv_priv_gw { struct hlist_head list; spinlock_t list_lock; /* protects gw_list & curr_gw */ struct batadv_gw_node __rcu *curr_gw; /* rcu protected pointer */ + atomic_t mode; + atomic_t sel_class; atomic_t bandwidth_down; atomic_t bandwidth_up; atomic_t reselect; @@ -865,8 +869,6 @@ struct batadv_priv_bat_v { * enabled * @multicast_mode: Enable or disable multicast optimizations on this node's * sender/originating side - * @gw_mode: gateway operation: off, client or server (see batadv_gw_modes) - * @gw_sel_class: gateway selection class (applies if gw_mode client) * @orig_interval: OGM broadcast interval in milliseconds * @hop_penalty: penalty which will be applied to an OGM's tq-field on every hop * @log_level: configured log level (see batadv_dbg_level) @@ -922,8 +924,6 @@ struct batadv_priv { #ifdef CONFIG_BATMAN_ADV_MCAST atomic_t multicast_mode; #endif - atomic_t gw_mode; - atomic_t gw_sel_class; atomic_t orig_interval; atomic_t hop_penalty; #ifdef CONFIG_BATMAN_ADV_DEBUG -- cgit v1.2.3 From 7db682d1c39b2198a9c9d0bee5812d9c4329123d Mon Sep 17 00:00:00 2001 From: Marek Lindner Date: Tue, 10 May 2016 22:31:59 +0800 Subject: batman-adv: init ELP tweaking options only once The ELP interval and throughput override interface settings are initialized with default settings on every time an interface is added to a mesh. This patch prevents this behavior by moving the configuration init to the interface detection routine which runs only once per interface. Signed-off-by: Marek Lindner [a@unstable.cc: move initialization to batadv_v_hardif_init] Signed-off-by: Antonio Quartulli Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_algo.h | 7 ++++++- net/batman-adv/bat_v.c | 19 ++++++++++++++----- net/batman-adv/bat_v_elp.c | 1 - net/batman-adv/hard-interface.c | 3 +++ 4 files changed, 23 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 03dafd33d23b..36542962de7d 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -18,13 +18,14 @@ #ifndef _NET_BATMAN_ADV_BAT_ALGO_H_ #define _NET_BATMAN_ADV_BAT_ALGO_H_ -struct batadv_priv; +#include "main.h" int batadv_iv_init(void); #ifdef CONFIG_BATMAN_ADV_BATMAN_V int batadv_v_init(void); +void batadv_v_hardif_init(struct batadv_hard_iface *hardif); int batadv_v_mesh_init(struct batadv_priv *bat_priv); void batadv_v_mesh_free(struct batadv_priv *bat_priv); @@ -35,6 +36,10 @@ static inline int batadv_v_init(void) return 0; } +static inline void batadv_v_hardif_init(struct batadv_hard_iface *hardif) +{ +} + static inline int batadv_v_mesh_init(struct batadv_priv *bat_priv) { return 0; diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index c16cd44a3b4c..c2fea812fb48 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -70,11 +70,6 @@ static int batadv_v_iface_enable(struct batadv_hard_iface *hard_iface) if (ret < 0) batadv_v_elp_iface_disable(hard_iface); - /* enable link throughput auto-detection by setting the throughput - * override to zero - */ - atomic_set(&hard_iface->bat_v.throughput_override, 0); - return ret; } @@ -338,6 +333,20 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = { .bat_neigh_print = batadv_v_neigh_print, }; +/** + * batadv_v_hardif_init - initialize the algorithm specific fields in the + * hard-interface object + * @hard_iface: the hard-interface to initialize + */ +void batadv_v_hardif_init(struct batadv_hard_iface *hard_iface) +{ + /* enable link throughput auto-detection by setting the throughput + * override to zero + */ + atomic_set(&hard_iface->bat_v.throughput_override, 0); + atomic_set(&hard_iface->bat_v.elp_interval, 500); +} + /** * batadv_v_mesh_init - initialize the B.A.T.M.A.N. V private resources for a * mesh diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 8909d1eb9622..cf0262becd08 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -344,7 +344,6 @@ int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface) /* randomize initial seqno to avoid collision */ get_random_bytes(&random_seqno, sizeof(random_seqno)); atomic_set(&hard_iface->bat_v.elp_seqno, random_seqno); - atomic_set(&hard_iface->bat_v.elp_interval, 500); /* assume full-duplex by default */ hard_iface->bat_v.flags |= BATADV_FULL_DUPLEX; diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index db2009d84a25..3696929e5692 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -37,6 +37,7 @@ #include #include +#include "bat_algo.h" #include "bridge_loop_avoidance.h" #include "debugfs.h" #include "distributed-arp-table.h" @@ -683,6 +684,8 @@ batadv_hardif_add_interface(struct net_device *net_dev) if (batadv_is_wifi_netdev(net_dev)) hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS; + batadv_v_hardif_init(hard_iface); + /* extra reference for return */ kref_init(&hard_iface->refcount); kref_get(&hard_iface->refcount); -- cgit v1.2.3 From 1914848e0d641f8d6173bb53b6a4a392468ac725 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 9 May 2016 20:03:35 +0200 Subject: batman-adv: Set skb priority in fragments BATMAN will set the skb->priority based on the IP precedence or 802.1q tag. However, if it needs to fragment the frame, it currently leaves the fragment skb with the default priority and actually overwrites the priority in the unfragmented frame. Fix this. Signed-off-by: Andrew Lunn Signed-off-by: Marek Lindner Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/fragmentation.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 65536db1bff7..a119b6a6365f 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -414,7 +413,7 @@ static struct sk_buff *batadv_frag_create(struct sk_buff *skb, if (!skb_fragment) goto err; - skb->priority = TC_PRIO_CONTROL; + skb_fragment->priority = skb->priority; /* Eat the last mtu-bytes of the skb */ skb_reserve(skb_fragment, header_size + ETH_HLEN); -- cgit v1.2.3 From 67a5613ed09e1b9d5d188507bcbaeb37a6d0fe12 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 15 May 2016 11:07:41 +0200 Subject: batman-adv: Include main.h in bat_v_ogm.h main.h includes statements which (re)define preprocessor variables which influence the compiled code. This makes it necessary to include it in all files. For example, it redefines pr_fmt used to the module as prefix for each pr_* message. Reported-by: Antonio Quartulli Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v_ogm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h index d849c75ada0e..4c4d45caa422 100644 --- a/net/batman-adv/bat_v_ogm.h +++ b/net/batman-adv/bat_v_ogm.h @@ -18,10 +18,10 @@ #ifndef _BATMAN_ADV_BATADV_V_OGM_H_ #define _BATMAN_ADV_BATADV_V_OGM_H_ +#include "main.h" + #include -struct batadv_hard_iface; -struct batadv_priv; struct sk_buff; int batadv_v_ogm_init(struct batadv_priv *bat_priv); -- cgit v1.2.3 From c0f25c802b3300b28d1e67c58c702d29555838de Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 9 May 2016 20:03:36 +0200 Subject: batman-adv: Include frame priority in fragment header Unfragmented frames which traverse a node have their skb->priority set by looking at the IP ToS byte, or the 802.1p header. However for fragments this is not possible, only one of the fragments will contain the headers. Instead, place the priority into the fragment header and on receiving a fragment, use this information to set the skb->priority for when the fragment is forwarded. Signed-off-by: Andrew Lunn Signed-off-by: Marek Lindner Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/fragmentation.c | 9 +++++++++ net/batman-adv/packet.h | 7 +++++-- net/batman-adv/routing.c | 2 ++ 3 files changed, 16 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index a119b6a6365f..9f41a0a0d6ab 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -472,6 +472,15 @@ bool batadv_frag_send_packet(struct sk_buff *skb, frag_header.reserved = 0; frag_header.no = 0; frag_header.total_size = htons(skb->len); + + /* skb->priority values from 256->263 are magic values to + * directly indicate a specific 802.1d priority. This is used + * to allow 802.1d priority to be passed directly in from VLAN + * tags, etc. + */ + if (skb->priority >= 256 && skb->priority <= 263) + frag_header.priority = skb->priority - 256; + ether_addr_copy(frag_header.orig, primary_if->net_dev->dev_addr); ether_addr_copy(frag_header.dest, orig_node->orig); diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h index 372128ddb474..71567794df17 100644 --- a/net/batman-adv/packet.h +++ b/net/batman-adv/packet.h @@ -420,6 +420,7 @@ struct batadv_unicast_4addr_packet { * @dest: final destination used when routing fragments * @orig: originator of the fragment used when merging the packet * @no: fragment number within this sequence + * @priority: priority of frame, from ToS IP precedence or 802.1p * @reserved: reserved byte for alignment * @seqno: sequence identification * @total_size: size of the merged packet @@ -430,9 +431,11 @@ struct batadv_frag_packet { u8 ttl; #if defined(__BIG_ENDIAN_BITFIELD) u8 no:4; - u8 reserved:4; + u8 priority:3; + u8 reserved:1; #elif defined(__LITTLE_ENDIAN_BITFIELD) - u8 reserved:4; + u8 reserved:1; + u8 priority:3; u8 no:4; #else #error "unknown bitfield endianness" diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index f75091c983ee..24fc75335b31 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1006,6 +1006,8 @@ int batadv_recv_frag_packet(struct sk_buff *skb, if (!orig_node_src) goto out; + skb->priority = frag_packet->priority + 256; + /* Route the fragment if it is not for us and too big to be merged. */ if (!batadv_is_my_mac(bat_priv, frag_packet->dest) && batadv_frag_skb_fwd(skb, recv_if, orig_node_src)) { -- cgit v1.2.3 From fcafa5e74b42a182a5bcc5c7f94ca026d4e5f06e Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 15 May 2016 11:07:42 +0200 Subject: batman-adv: Keep includes ordered by filename It is easier to detect if a include is already there for a used functionality when the includes are ordered. Using an alphabetic order together with the grouping in commit 1e2c2a4fe4a5 ("batman-adv: Add required includes to all files") makes includes better manageable. Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_iv_ogm.c | 2 +- net/batman-adv/bat_v_elp.h | 4 ++-- net/batman-adv/gateway_common.c | 2 +- net/batman-adv/hard-interface.c | 2 +- net/batman-adv/main.h | 6 +++--- net/batman-adv/send.c | 2 +- net/batman-adv/sysfs.c | 4 ++-- 7 files changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 4815db978c27..6dc89b0eb01a 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -31,8 +31,8 @@ #include #include #include -#include #include +#include #include #include #include diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h index cc130b2d05e5..be17c0b1369e 100644 --- a/net/batman-adv/bat_v_elp.h +++ b/net/batman-adv/bat_v_elp.h @@ -15,11 +15,11 @@ * along with this program; if not, see . */ -#include "main.h" - #ifndef _NET_BATMAN_ADV_BAT_V_ELP_H_ #define _NET_BATMAN_ADV_BAT_V_ELP_H_ +#include "main.h" + struct sk_buff; struct work_struct; diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 3c269457776e..7754435611eb 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -19,8 +19,8 @@ #include "main.h" #include -#include #include +#include #include #include #include diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 3696929e5692..a3483f60c3a1 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -23,9 +23,9 @@ #include #include #include +#include #include #include -#include #include #include #include diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index cd83e2824f70..38f9e5523190 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -181,12 +181,12 @@ enum batadv_uev_type { #include #include #include /* for packet.h */ +#include +#include #include +#include #include #include -#include -#include -#include #include "types.h" diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 59e695b5cfbd..4e49454dfed4 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -22,8 +22,8 @@ #include #include #include -#include #include +#include #include #include #include diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index 233abcf33c03..6244a9a336d0 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -25,8 +25,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -38,10 +38,10 @@ #include #include +#include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" #include "gateway_client.h" #include "gateway_common.h" -#include "bridge_loop_avoidance.h" #include "hard-interface.h" #include "network-coding.h" #include "packet.h" -- cgit v1.2.3 From bd2a979e53fd9dd64b7e27553a23001d53201005 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Tue, 10 May 2016 18:41:24 +0200 Subject: batman-adv: Always flood IGMP/MLD reports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With this patch IGMP or MLD reports are always flooded. This is necessary for the upcoming bridge integration to function without multicast packet loss. With the report handling so far bridges might miss interested multicast listeners, leading to wrongly excluding ports from multicast packet forwarding. Currently we are treating IGMP/MLD reports, the messages bridges use to learn about interested multicast listeners, just as any other multicast packet: We try to send them to nodes matching its multicast destination. Unfortunately, the destination address of reports of the older IGMPv2/MLDv1 protocol families do not strictly adhere to their own protocol: More precisely, the interested receiver, an IGMPv2 or MLDv1 querier, itself usually does not listen to the multicast destination address of any reports. Therefore with this patch we are simply excluding IGMP/MLD reports from the multicast forwarding code path and keep flooding them. By that any bridge receives them and can properly learn about listeners. To avoid compatibility issues with older nodes not yet implementing this report handling, we need to force them to flood reports: We do this by bumping the multicast TVLV version to 2, effectively disabling their multicast optimization. Tested-by: Simon Wunderlich Signed-off-by: Linus Lüssing Signed-off-by: Marek Lindner Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/Kconfig | 2 +- net/batman-adv/multicast.c | 87 +++++++++++++++++++++++++++++++++++++++------- 2 files changed, 75 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index f66930ee3c0b..b7ba97dbf4a9 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -66,7 +66,7 @@ config BATMAN_ADV_NC config BATMAN_ADV_MCAST bool "Multicast optimisation" - depends on BATMAN_ADV + depends on BATMAN_ADV && INET default n help This option enables the multicast optimisation which aims to diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index c32f24fafe67..4673328def29 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -25,9 +25,11 @@ #include #include #include +#include #include -#include +#include #include +#include #include #include #include @@ -236,7 +238,7 @@ static bool batadv_mcast_mla_tvlv_update(struct batadv_priv *bat_priv) if (batadv_mcast_has_bridge(bat_priv)) { if (bat_priv->mcast.enabled) { batadv_tvlv_container_unregister(bat_priv, - BATADV_TVLV_MCAST, 1); + BATADV_TVLV_MCAST, 2); bat_priv->mcast.enabled = false; } @@ -245,7 +247,7 @@ static bool batadv_mcast_mla_tvlv_update(struct batadv_priv *bat_priv) if (!bat_priv->mcast.enabled || mcast_data.flags != bat_priv->mcast.flags) { - batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 1, + batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 2, &mcast_data, sizeof(mcast_data)); bat_priv->mcast.flags = mcast_data.flags; bat_priv->mcast.enabled = true; @@ -282,6 +284,31 @@ out: batadv_mcast_mla_list_free(bat_priv, &mcast_list); } +/** + * batadv_mcast_is_report_ipv4 - check for IGMP reports + * @skb: the ethernet frame destined for the mesh + * + * This call might reallocate skb data. + * + * Checks whether the given frame is a valid IGMP report. + * + * Return: If so then true, otherwise false. + */ +static bool batadv_mcast_is_report_ipv4(struct sk_buff *skb) +{ + if (ip_mc_check_igmp(skb, NULL) < 0) + return false; + + switch (igmp_hdr(skb)->type) { + case IGMP_HOST_MEMBERSHIP_REPORT: + case IGMPV2_HOST_MEMBERSHIP_REPORT: + case IGMPV3_HOST_MEMBERSHIP_REPORT: + return true; + } + + return false; +} + /** * batadv_mcast_forw_mode_check_ipv4 - check for optimized forwarding potential * @bat_priv: the bat priv with all the soft interface information @@ -304,6 +331,9 @@ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*iphdr))) return -ENOMEM; + if (batadv_mcast_is_report_ipv4(skb)) + return -EINVAL; + iphdr = ip_hdr(skb); /* TODO: Implement Multicast Router Discovery (RFC4286), @@ -320,6 +350,31 @@ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, return 0; } +#if IS_ENABLED(CONFIG_IPV6) +/** + * batadv_mcast_is_report_ipv6 - check for MLD reports + * @skb: the ethernet frame destined for the mesh + * + * This call might reallocate skb data. + * + * Checks whether the given frame is a valid MLD report. + * + * Return: If so then true, otherwise false. + */ +static bool batadv_mcast_is_report_ipv6(struct sk_buff *skb) +{ + if (ipv6_mc_check_mld(skb, NULL) < 0) + return false; + + switch (icmp6_hdr(skb)->icmp6_type) { + case ICMPV6_MGM_REPORT: + case ICMPV6_MLD2_REPORT: + return true; + } + + return false; +} + /** * batadv_mcast_forw_mode_check_ipv6 - check for optimized forwarding potential * @bat_priv: the bat priv with all the soft interface information @@ -341,6 +396,9 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*ip6hdr))) return -ENOMEM; + if (batadv_mcast_is_report_ipv6(skb)) + return -EINVAL; + ip6hdr = ipv6_hdr(skb); /* TODO: Implement Multicast Router Discovery (RFC4286), @@ -357,6 +415,7 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, return 0; } +#endif /** * batadv_mcast_forw_mode_check - check for optimized forwarding potential @@ -385,9 +444,11 @@ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv, case ETH_P_IP: return batadv_mcast_forw_mode_check_ipv4(bat_priv, skb, is_unsnoopable); +#if IS_ENABLED(CONFIG_IPV6) case ETH_P_IPV6: return batadv_mcast_forw_mode_check_ipv6(bat_priv, skb, is_unsnoopable); +#endif default: return -EINVAL; } @@ -728,18 +789,18 @@ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, } /** - * batadv_mcast_tvlv_ogm_handler_v1 - process incoming multicast tvlv container + * batadv_mcast_tvlv_ogm_handler - process incoming multicast tvlv container * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) * @tvlv_value: tvlv buffer containing the multicast data * @tvlv_value_len: tvlv buffer length */ -static void batadv_mcast_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig, - u8 flags, - void *tvlv_value, - u16 tvlv_value_len) +static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig, + u8 flags, + void *tvlv_value, + u16 tvlv_value_len) { bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND); u8 mcast_flags = BATADV_NO_FLAGS; @@ -789,8 +850,8 @@ static void batadv_mcast_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, */ void batadv_mcast_init(struct batadv_priv *bat_priv) { - batadv_tvlv_handler_register(bat_priv, batadv_mcast_tvlv_ogm_handler_v1, - NULL, BATADV_TVLV_MCAST, 1, + batadv_tvlv_handler_register(bat_priv, batadv_mcast_tvlv_ogm_handler, + NULL, BATADV_TVLV_MCAST, 2, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); } @@ -800,8 +861,8 @@ void batadv_mcast_init(struct batadv_priv *bat_priv) */ void batadv_mcast_free(struct batadv_priv *bat_priv) { - batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 1); - batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 1); + batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 2); + batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 2); spin_lock_bh(&bat_priv->tt.commit_lock); batadv_mcast_mla_tt_retract(bat_priv, NULL); -- cgit v1.2.3 From 1f8dce4992d03fc15cfbaf67cd09f0d1648c4606 Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Sun, 15 May 2016 11:07:43 +0200 Subject: batman-adv: split tvlv into a separate file The tvlv functionality in main.c is mostly unrelated to the rest of the content. It still takes up a large portion of this source file (~45%, 588 lines). Moving it to a separate file makes it better visible as a main component of the batman-adv implementation and hides it less in the other helper functions in main.c Signed-off-by: Markus Pargmann [sven@narfation.org: fix conflicts with current version, fix includes, rewrote commit message] Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/Makefile | 1 + net/batman-adv/bat_iv_ogm.c | 1 + net/batman-adv/bat_v_ogm.c | 1 + net/batman-adv/distributed-arp-table.c | 1 + net/batman-adv/gateway_common.c | 1 + net/batman-adv/main.c | 591 ------------------------------- net/batman-adv/main.h | 34 -- net/batman-adv/multicast.c | 1 + net/batman-adv/network-coding.c | 1 + net/batman-adv/routing.c | 1 + net/batman-adv/translation-table.c | 1 + net/batman-adv/tvlv.c | 630 +++++++++++++++++++++++++++++++++ net/batman-adv/tvlv.h | 61 ++++ 13 files changed, 700 insertions(+), 625 deletions(-) create mode 100644 net/batman-adv/tvlv.c create mode 100644 net/batman-adv/tvlv.h (limited to 'net') diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index 797cf2fc88c1..5c6ece0cfc17 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -40,3 +40,4 @@ batman-adv-y += send.o batman-adv-y += soft-interface.o batman-adv-y += sysfs.o batman-adv-y += translation-table.o +batman-adv-y += tvlv.o diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 6dc89b0eb01a..948a5b45474d 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -58,6 +58,7 @@ #include "routing.h" #include "send.h" #include "translation-table.h" +#include "tvlv.h" static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work); diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 23ea9bfb9f67..ca5a679d112f 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -46,6 +46,7 @@ #include "routing.h" #include "send.h" #include "translation-table.h" +#include "tvlv.h" /** * batadv_v_ogm_orig_get - retrieve and possibly create an originator node diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 278800a99c69..584b82744699 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -48,6 +48,7 @@ #include "originator.h" #include "send.h" #include "translation-table.h" +#include "tvlv.h" static void batadv_dat_purge(struct work_struct *work); diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 7754435611eb..6a6f2d4987e5 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -29,6 +29,7 @@ #include "gateway_client.h" #include "packet.h" +#include "tvlv.h" /** * batadv_parse_throughput - parse supplied string buffer to extract throughput diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 627d14ececaf..225d63e0c711 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -31,16 +31,13 @@ #include #include #include -#include #include #include #include -#include #include #include #include #include -#include #include #include #include @@ -641,594 +638,6 @@ __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr) return htonl(crc); } -/** - * batadv_tvlv_handler_release - release tvlv handler from lists and queue for - * free after rcu grace period - * @ref: kref pointer of the tvlv - */ -static void batadv_tvlv_handler_release(struct kref *ref) -{ - struct batadv_tvlv_handler *tvlv_handler; - - tvlv_handler = container_of(ref, struct batadv_tvlv_handler, refcount); - kfree_rcu(tvlv_handler, rcu); -} - -/** - * batadv_tvlv_handler_put - decrement the tvlv container refcounter and - * possibly release it - * @tvlv_handler: the tvlv handler to free - */ -static void batadv_tvlv_handler_put(struct batadv_tvlv_handler *tvlv_handler) -{ - kref_put(&tvlv_handler->refcount, batadv_tvlv_handler_release); -} - -/** - * batadv_tvlv_handler_get - retrieve tvlv handler from the tvlv handler list - * based on the provided type and version (both need to match) - * @bat_priv: the bat priv with all the soft interface information - * @type: tvlv handler type to look for - * @version: tvlv handler version to look for - * - * Return: tvlv handler if found or NULL otherwise. - */ -static struct batadv_tvlv_handler * -batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version) -{ - struct batadv_tvlv_handler *tvlv_handler_tmp, *tvlv_handler = NULL; - - rcu_read_lock(); - hlist_for_each_entry_rcu(tvlv_handler_tmp, - &bat_priv->tvlv.handler_list, list) { - if (tvlv_handler_tmp->type != type) - continue; - - if (tvlv_handler_tmp->version != version) - continue; - - if (!kref_get_unless_zero(&tvlv_handler_tmp->refcount)) - continue; - - tvlv_handler = tvlv_handler_tmp; - break; - } - rcu_read_unlock(); - - return tvlv_handler; -} - -/** - * batadv_tvlv_container_release - release tvlv from lists and free - * @ref: kref pointer of the tvlv - */ -static void batadv_tvlv_container_release(struct kref *ref) -{ - struct batadv_tvlv_container *tvlv; - - tvlv = container_of(ref, struct batadv_tvlv_container, refcount); - kfree(tvlv); -} - -/** - * batadv_tvlv_container_put - decrement the tvlv container refcounter and - * possibly release it - * @tvlv: the tvlv container to free - */ -static void batadv_tvlv_container_put(struct batadv_tvlv_container *tvlv) -{ - kref_put(&tvlv->refcount, batadv_tvlv_container_release); -} - -/** - * batadv_tvlv_container_get - retrieve tvlv container from the tvlv container - * list based on the provided type and version (both need to match) - * @bat_priv: the bat priv with all the soft interface information - * @type: tvlv container type to look for - * @version: tvlv container version to look for - * - * Has to be called with the appropriate locks being acquired - * (tvlv.container_list_lock). - * - * Return: tvlv container if found or NULL otherwise. - */ -static struct batadv_tvlv_container * -batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version) -{ - struct batadv_tvlv_container *tvlv_tmp, *tvlv = NULL; - - lockdep_assert_held(&bat_priv->tvlv.container_list_lock); - - hlist_for_each_entry(tvlv_tmp, &bat_priv->tvlv.container_list, list) { - if (tvlv_tmp->tvlv_hdr.type != type) - continue; - - if (tvlv_tmp->tvlv_hdr.version != version) - continue; - - kref_get(&tvlv_tmp->refcount); - tvlv = tvlv_tmp; - break; - } - - return tvlv; -} - -/** - * batadv_tvlv_container_list_size - calculate the size of the tvlv container - * list entries - * @bat_priv: the bat priv with all the soft interface information - * - * Has to be called with the appropriate locks being acquired - * (tvlv.container_list_lock). - * - * Return: size of all currently registered tvlv containers in bytes. - */ -static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv) -{ - struct batadv_tvlv_container *tvlv; - u16 tvlv_len = 0; - - lockdep_assert_held(&bat_priv->tvlv.container_list_lock); - - hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) { - tvlv_len += sizeof(struct batadv_tvlv_hdr); - tvlv_len += ntohs(tvlv->tvlv_hdr.len); - } - - return tvlv_len; -} - -/** - * batadv_tvlv_container_remove - remove tvlv container from the tvlv container - * list - * @bat_priv: the bat priv with all the soft interface information - * @tvlv: the to be removed tvlv container - * - * Has to be called with the appropriate locks being acquired - * (tvlv.container_list_lock). - */ -static void batadv_tvlv_container_remove(struct batadv_priv *bat_priv, - struct batadv_tvlv_container *tvlv) -{ - lockdep_assert_held(&bat_priv->tvlv.container_list_lock); - - if (!tvlv) - return; - - hlist_del(&tvlv->list); - - /* first call to decrement the counter, second call to free */ - batadv_tvlv_container_put(tvlv); - batadv_tvlv_container_put(tvlv); -} - -/** - * batadv_tvlv_container_unregister - unregister tvlv container based on the - * provided type and version (both need to match) - * @bat_priv: the bat priv with all the soft interface information - * @type: tvlv container type to unregister - * @version: tvlv container type to unregister - */ -void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv, - u8 type, u8 version) -{ - struct batadv_tvlv_container *tvlv; - - spin_lock_bh(&bat_priv->tvlv.container_list_lock); - tvlv = batadv_tvlv_container_get(bat_priv, type, version); - batadv_tvlv_container_remove(bat_priv, tvlv); - spin_unlock_bh(&bat_priv->tvlv.container_list_lock); -} - -/** - * batadv_tvlv_container_register - register tvlv type, version and content - * to be propagated with each (primary interface) OGM - * @bat_priv: the bat priv with all the soft interface information - * @type: tvlv container type - * @version: tvlv container version - * @tvlv_value: tvlv container content - * @tvlv_value_len: tvlv container content length - * - * If a container of the same type and version was already registered the new - * content is going to replace the old one. - */ -void batadv_tvlv_container_register(struct batadv_priv *bat_priv, - u8 type, u8 version, - void *tvlv_value, u16 tvlv_value_len) -{ - struct batadv_tvlv_container *tvlv_old, *tvlv_new; - - if (!tvlv_value) - tvlv_value_len = 0; - - tvlv_new = kzalloc(sizeof(*tvlv_new) + tvlv_value_len, GFP_ATOMIC); - if (!tvlv_new) - return; - - tvlv_new->tvlv_hdr.version = version; - tvlv_new->tvlv_hdr.type = type; - tvlv_new->tvlv_hdr.len = htons(tvlv_value_len); - - memcpy(tvlv_new + 1, tvlv_value, ntohs(tvlv_new->tvlv_hdr.len)); - INIT_HLIST_NODE(&tvlv_new->list); - kref_init(&tvlv_new->refcount); - - spin_lock_bh(&bat_priv->tvlv.container_list_lock); - tvlv_old = batadv_tvlv_container_get(bat_priv, type, version); - batadv_tvlv_container_remove(bat_priv, tvlv_old); - hlist_add_head(&tvlv_new->list, &bat_priv->tvlv.container_list); - spin_unlock_bh(&bat_priv->tvlv.container_list_lock); -} - -/** - * batadv_tvlv_realloc_packet_buff - reallocate packet buffer to accommodate - * requested packet size - * @packet_buff: packet buffer - * @packet_buff_len: packet buffer size - * @min_packet_len: requested packet minimum size - * @additional_packet_len: requested additional packet size on top of minimum - * size - * - * Return: true of the packet buffer could be changed to the requested size, - * false otherwise. - */ -static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff, - int *packet_buff_len, - int min_packet_len, - int additional_packet_len) -{ - unsigned char *new_buff; - - new_buff = kmalloc(min_packet_len + additional_packet_len, GFP_ATOMIC); - - /* keep old buffer if kmalloc should fail */ - if (!new_buff) - return false; - - memcpy(new_buff, *packet_buff, min_packet_len); - kfree(*packet_buff); - *packet_buff = new_buff; - *packet_buff_len = min_packet_len + additional_packet_len; - - return true; -} - -/** - * batadv_tvlv_container_ogm_append - append tvlv container content to given - * OGM packet buffer - * @bat_priv: the bat priv with all the soft interface information - * @packet_buff: ogm packet buffer - * @packet_buff_len: ogm packet buffer size including ogm header and tvlv - * content - * @packet_min_len: ogm header size to be preserved for the OGM itself - * - * The ogm packet might be enlarged or shrunk depending on the current size - * and the size of the to-be-appended tvlv containers. - * - * Return: size of all appended tvlv containers in bytes. - */ -u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, - unsigned char **packet_buff, - int *packet_buff_len, int packet_min_len) -{ - struct batadv_tvlv_container *tvlv; - struct batadv_tvlv_hdr *tvlv_hdr; - u16 tvlv_value_len; - void *tvlv_value; - bool ret; - - spin_lock_bh(&bat_priv->tvlv.container_list_lock); - tvlv_value_len = batadv_tvlv_container_list_size(bat_priv); - - ret = batadv_tvlv_realloc_packet_buff(packet_buff, packet_buff_len, - packet_min_len, tvlv_value_len); - - if (!ret) - goto end; - - if (!tvlv_value_len) - goto end; - - tvlv_value = (*packet_buff) + packet_min_len; - - hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) { - tvlv_hdr = tvlv_value; - tvlv_hdr->type = tvlv->tvlv_hdr.type; - tvlv_hdr->version = tvlv->tvlv_hdr.version; - tvlv_hdr->len = tvlv->tvlv_hdr.len; - tvlv_value = tvlv_hdr + 1; - memcpy(tvlv_value, tvlv + 1, ntohs(tvlv->tvlv_hdr.len)); - tvlv_value = (u8 *)tvlv_value + ntohs(tvlv->tvlv_hdr.len); - } - -end: - spin_unlock_bh(&bat_priv->tvlv.container_list_lock); - return tvlv_value_len; -} - -/** - * batadv_tvlv_call_handler - parse the given tvlv buffer to call the - * appropriate handlers - * @bat_priv: the bat priv with all the soft interface information - * @tvlv_handler: tvlv callback function handling the tvlv content - * @ogm_source: flag indicating whether the tvlv is an ogm or a unicast packet - * @orig_node: orig node emitting the ogm packet - * @src: source mac address of the unicast packet - * @dst: destination mac address of the unicast packet - * @tvlv_value: tvlv content - * @tvlv_value_len: tvlv content length - * - * Return: success if handler was not found or the return value of the handler - * callback. - */ -static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, - struct batadv_tvlv_handler *tvlv_handler, - bool ogm_source, - struct batadv_orig_node *orig_node, - u8 *src, u8 *dst, - void *tvlv_value, u16 tvlv_value_len) -{ - if (!tvlv_handler) - return NET_RX_SUCCESS; - - if (ogm_source) { - if (!tvlv_handler->ogm_handler) - return NET_RX_SUCCESS; - - if (!orig_node) - return NET_RX_SUCCESS; - - tvlv_handler->ogm_handler(bat_priv, orig_node, - BATADV_NO_FLAGS, - tvlv_value, tvlv_value_len); - tvlv_handler->flags |= BATADV_TVLV_HANDLER_OGM_CALLED; - } else { - if (!src) - return NET_RX_SUCCESS; - - if (!dst) - return NET_RX_SUCCESS; - - if (!tvlv_handler->unicast_handler) - return NET_RX_SUCCESS; - - return tvlv_handler->unicast_handler(bat_priv, src, - dst, tvlv_value, - tvlv_value_len); - } - - return NET_RX_SUCCESS; -} - -/** - * batadv_tvlv_containers_process - parse the given tvlv buffer to call the - * appropriate handlers - * @bat_priv: the bat priv with all the soft interface information - * @ogm_source: flag indicating whether the tvlv is an ogm or a unicast packet - * @orig_node: orig node emitting the ogm packet - * @src: source mac address of the unicast packet - * @dst: destination mac address of the unicast packet - * @tvlv_value: tvlv content - * @tvlv_value_len: tvlv content length - * - * Return: success when processing an OGM or the return value of all called - * handler callbacks. - */ -int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, - bool ogm_source, - struct batadv_orig_node *orig_node, - u8 *src, u8 *dst, - void *tvlv_value, u16 tvlv_value_len) -{ - struct batadv_tvlv_handler *tvlv_handler; - struct batadv_tvlv_hdr *tvlv_hdr; - u16 tvlv_value_cont_len; - u8 cifnotfound = BATADV_TVLV_HANDLER_OGM_CIFNOTFND; - int ret = NET_RX_SUCCESS; - - while (tvlv_value_len >= sizeof(*tvlv_hdr)) { - tvlv_hdr = tvlv_value; - tvlv_value_cont_len = ntohs(tvlv_hdr->len); - tvlv_value = tvlv_hdr + 1; - tvlv_value_len -= sizeof(*tvlv_hdr); - - if (tvlv_value_cont_len > tvlv_value_len) - break; - - tvlv_handler = batadv_tvlv_handler_get(bat_priv, - tvlv_hdr->type, - tvlv_hdr->version); - - ret |= batadv_tvlv_call_handler(bat_priv, tvlv_handler, - ogm_source, orig_node, - src, dst, tvlv_value, - tvlv_value_cont_len); - if (tvlv_handler) - batadv_tvlv_handler_put(tvlv_handler); - tvlv_value = (u8 *)tvlv_value + tvlv_value_cont_len; - tvlv_value_len -= tvlv_value_cont_len; - } - - if (!ogm_source) - return ret; - - rcu_read_lock(); - hlist_for_each_entry_rcu(tvlv_handler, - &bat_priv->tvlv.handler_list, list) { - if ((tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) && - !(tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CALLED)) - tvlv_handler->ogm_handler(bat_priv, orig_node, - cifnotfound, NULL, 0); - - tvlv_handler->flags &= ~BATADV_TVLV_HANDLER_OGM_CALLED; - } - rcu_read_unlock(); - - return NET_RX_SUCCESS; -} - -/** - * batadv_tvlv_ogm_receive - process an incoming ogm and call the appropriate - * handlers - * @bat_priv: the bat priv with all the soft interface information - * @batadv_ogm_packet: ogm packet containing the tvlv containers - * @orig_node: orig node emitting the ogm packet - */ -void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv, - struct batadv_ogm_packet *batadv_ogm_packet, - struct batadv_orig_node *orig_node) -{ - void *tvlv_value; - u16 tvlv_value_len; - - if (!batadv_ogm_packet) - return; - - tvlv_value_len = ntohs(batadv_ogm_packet->tvlv_len); - if (!tvlv_value_len) - return; - - tvlv_value = batadv_ogm_packet + 1; - - batadv_tvlv_containers_process(bat_priv, true, orig_node, NULL, NULL, - tvlv_value, tvlv_value_len); -} - -/** - * batadv_tvlv_handler_register - register tvlv handler based on the provided - * type and version (both need to match) for ogm tvlv payload and/or unicast - * payload - * @bat_priv: the bat priv with all the soft interface information - * @optr: ogm tvlv handler callback function. This function receives the orig - * node, flags and the tvlv content as argument to process. - * @uptr: unicast tvlv handler callback function. This function receives the - * source & destination of the unicast packet as well as the tvlv content - * to process. - * @type: tvlv handler type to be registered - * @version: tvlv handler version to be registered - * @flags: flags to enable or disable TVLV API behavior - */ -void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, - void (*optr)(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig, - u8 flags, - void *tvlv_value, - u16 tvlv_value_len), - int (*uptr)(struct batadv_priv *bat_priv, - u8 *src, u8 *dst, - void *tvlv_value, - u16 tvlv_value_len), - u8 type, u8 version, u8 flags) -{ - struct batadv_tvlv_handler *tvlv_handler; - - tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version); - if (tvlv_handler) { - batadv_tvlv_handler_put(tvlv_handler); - return; - } - - tvlv_handler = kzalloc(sizeof(*tvlv_handler), GFP_ATOMIC); - if (!tvlv_handler) - return; - - tvlv_handler->ogm_handler = optr; - tvlv_handler->unicast_handler = uptr; - tvlv_handler->type = type; - tvlv_handler->version = version; - tvlv_handler->flags = flags; - kref_init(&tvlv_handler->refcount); - INIT_HLIST_NODE(&tvlv_handler->list); - - spin_lock_bh(&bat_priv->tvlv.handler_list_lock); - hlist_add_head_rcu(&tvlv_handler->list, &bat_priv->tvlv.handler_list); - spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); -} - -/** - * batadv_tvlv_handler_unregister - unregister tvlv handler based on the - * provided type and version (both need to match) - * @bat_priv: the bat priv with all the soft interface information - * @type: tvlv handler type to be unregistered - * @version: tvlv handler version to be unregistered - */ -void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, - u8 type, u8 version) -{ - struct batadv_tvlv_handler *tvlv_handler; - - tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version); - if (!tvlv_handler) - return; - - batadv_tvlv_handler_put(tvlv_handler); - spin_lock_bh(&bat_priv->tvlv.handler_list_lock); - hlist_del_rcu(&tvlv_handler->list); - spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); - batadv_tvlv_handler_put(tvlv_handler); -} - -/** - * batadv_tvlv_unicast_send - send a unicast packet with tvlv payload to the - * specified host - * @bat_priv: the bat priv with all the soft interface information - * @src: source mac address of the unicast packet - * @dst: destination mac address of the unicast packet - * @type: tvlv type - * @version: tvlv version - * @tvlv_value: tvlv content - * @tvlv_value_len: tvlv content length - */ -void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src, - u8 *dst, u8 type, u8 version, - void *tvlv_value, u16 tvlv_value_len) -{ - struct batadv_unicast_tvlv_packet *unicast_tvlv_packet; - struct batadv_tvlv_hdr *tvlv_hdr; - struct batadv_orig_node *orig_node; - struct sk_buff *skb; - unsigned char *tvlv_buff; - unsigned int tvlv_len; - ssize_t hdr_len = sizeof(*unicast_tvlv_packet); - - orig_node = batadv_orig_hash_find(bat_priv, dst); - if (!orig_node) - return; - - tvlv_len = sizeof(*tvlv_hdr) + tvlv_value_len; - - skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + hdr_len + tvlv_len); - if (!skb) - goto out; - - skb->priority = TC_PRIO_CONTROL; - skb_reserve(skb, ETH_HLEN); - tvlv_buff = skb_put(skb, sizeof(*unicast_tvlv_packet) + tvlv_len); - unicast_tvlv_packet = (struct batadv_unicast_tvlv_packet *)tvlv_buff; - unicast_tvlv_packet->packet_type = BATADV_UNICAST_TVLV; - unicast_tvlv_packet->version = BATADV_COMPAT_VERSION; - unicast_tvlv_packet->ttl = BATADV_TTL; - unicast_tvlv_packet->reserved = 0; - unicast_tvlv_packet->tvlv_len = htons(tvlv_len); - unicast_tvlv_packet->align = 0; - ether_addr_copy(unicast_tvlv_packet->src, src); - ether_addr_copy(unicast_tvlv_packet->dst, dst); - - tvlv_buff = (unsigned char *)(unicast_tvlv_packet + 1); - tvlv_hdr = (struct batadv_tvlv_hdr *)tvlv_buff; - tvlv_hdr->version = version; - tvlv_hdr->type = type; - tvlv_hdr->len = htons(tvlv_value_len); - tvlv_buff += sizeof(*tvlv_hdr); - memcpy(tvlv_buff, tvlv_value, tvlv_value_len); - - if (batadv_send_skb_to_orig(skb, orig_node, NULL) == NET_XMIT_DROP) - kfree_skb(skb); -out: - batadv_orig_node_put(orig_node); -} - /** * batadv_get_vid - extract the VLAN identifier from skb if any * @skb: the buffer containing the packet diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 38f9e5523190..3af6582aab8b 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -190,7 +190,6 @@ enum batadv_uev_type { #include "types.h" -struct batadv_ogm_packet; struct seq_file; struct sk_buff; @@ -372,39 +371,6 @@ static inline u64 batadv_sum_counter(struct batadv_priv *bat_priv, size_t idx) */ #define BATADV_SKB_CB(__skb) ((struct batadv_skb_cb *)&((__skb)->cb[0])) -void batadv_tvlv_container_register(struct batadv_priv *bat_priv, - u8 type, u8 version, - void *tvlv_value, u16 tvlv_value_len); -u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, - unsigned char **packet_buff, - int *packet_buff_len, int packet_min_len); -void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv, - struct batadv_ogm_packet *batadv_ogm_packet, - struct batadv_orig_node *orig_node); -void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv, - u8 type, u8 version); - -void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, - void (*optr)(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig, - u8 flags, - void *tvlv_value, - u16 tvlv_value_len), - int (*uptr)(struct batadv_priv *bat_priv, - u8 *src, u8 *dst, - void *tvlv_value, - u16 tvlv_value_len), - u8 type, u8 version, u8 flags); -void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, - u8 type, u8 version); -int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, - bool ogm_source, - struct batadv_orig_node *orig_node, - u8 *src, u8 *dst, - void *tvlv_buff, u16 tvlv_buff_len); -void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src, - u8 *dst, u8 type, u8 version, - void *tvlv_value, u16 tvlv_value_len); unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len); bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid); diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index d3222db60fd0..0e7d78f4f1b8 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -57,6 +57,7 @@ #include "hash.h" #include "packet.h" #include "translation-table.h" +#include "tvlv.h" /** * batadv_mcast_get_bridge - get the bridge on top of the softif if it exists diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 678f06865312..d0383dea6440 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -55,6 +55,7 @@ #include "packet.h" #include "routing.h" #include "send.h" +#include "tvlv.h" static struct lock_class_key batadv_nc_coding_hash_lock_class_key; static struct lock_class_key batadv_nc_decoding_hash_lock_class_key; diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 24fc75335b31..8cb459a57219 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -46,6 +46,7 @@ #include "send.h" #include "soft-interface.h" #include "translation-table.h" +#include "tvlv.h" static int batadv_route_unicast_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if); diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 87bb2030186d..6c8d624e4581 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -51,6 +51,7 @@ #include "originator.h" #include "packet.h" #include "soft-interface.h" +#include "tvlv.h" /* hash class keys */ static struct lock_class_key batadv_tt_local_hash_lock_class_key; diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c new file mode 100644 index 000000000000..2fd542e0d6a8 --- /dev/null +++ b/net/batman-adv/tvlv.c @@ -0,0 +1,630 @@ +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "originator.h" +#include "packet.h" +#include "send.h" +#include "tvlv.h" + +/** + * batadv_tvlv_handler_release - release tvlv handler from lists and queue for + * free after rcu grace period + * @ref: kref pointer of the tvlv + */ +static void batadv_tvlv_handler_release(struct kref *ref) +{ + struct batadv_tvlv_handler *tvlv_handler; + + tvlv_handler = container_of(ref, struct batadv_tvlv_handler, refcount); + kfree_rcu(tvlv_handler, rcu); +} + +/** + * batadv_tvlv_handler_put - decrement the tvlv container refcounter and + * possibly release it + * @tvlv_handler: the tvlv handler to free + */ +static void batadv_tvlv_handler_put(struct batadv_tvlv_handler *tvlv_handler) +{ + kref_put(&tvlv_handler->refcount, batadv_tvlv_handler_release); +} + +/** + * batadv_tvlv_handler_get - retrieve tvlv handler from the tvlv handler list + * based on the provided type and version (both need to match) + * @bat_priv: the bat priv with all the soft interface information + * @type: tvlv handler type to look for + * @version: tvlv handler version to look for + * + * Return: tvlv handler if found or NULL otherwise. + */ +static struct batadv_tvlv_handler * +batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version) +{ + struct batadv_tvlv_handler *tvlv_handler_tmp, *tvlv_handler = NULL; + + rcu_read_lock(); + hlist_for_each_entry_rcu(tvlv_handler_tmp, + &bat_priv->tvlv.handler_list, list) { + if (tvlv_handler_tmp->type != type) + continue; + + if (tvlv_handler_tmp->version != version) + continue; + + if (!kref_get_unless_zero(&tvlv_handler_tmp->refcount)) + continue; + + tvlv_handler = tvlv_handler_tmp; + break; + } + rcu_read_unlock(); + + return tvlv_handler; +} + +/** + * batadv_tvlv_container_release - release tvlv from lists and free + * @ref: kref pointer of the tvlv + */ +static void batadv_tvlv_container_release(struct kref *ref) +{ + struct batadv_tvlv_container *tvlv; + + tvlv = container_of(ref, struct batadv_tvlv_container, refcount); + kfree(tvlv); +} + +/** + * batadv_tvlv_container_put - decrement the tvlv container refcounter and + * possibly release it + * @tvlv: the tvlv container to free + */ +static void batadv_tvlv_container_put(struct batadv_tvlv_container *tvlv) +{ + kref_put(&tvlv->refcount, batadv_tvlv_container_release); +} + +/** + * batadv_tvlv_container_get - retrieve tvlv container from the tvlv container + * list based on the provided type and version (both need to match) + * @bat_priv: the bat priv with all the soft interface information + * @type: tvlv container type to look for + * @version: tvlv container version to look for + * + * Has to be called with the appropriate locks being acquired + * (tvlv.container_list_lock). + * + * Return: tvlv container if found or NULL otherwise. + */ +static struct batadv_tvlv_container * +batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version) +{ + struct batadv_tvlv_container *tvlv_tmp, *tvlv = NULL; + + lockdep_assert_held(&bat_priv->tvlv.container_list_lock); + + hlist_for_each_entry(tvlv_tmp, &bat_priv->tvlv.container_list, list) { + if (tvlv_tmp->tvlv_hdr.type != type) + continue; + + if (tvlv_tmp->tvlv_hdr.version != version) + continue; + + kref_get(&tvlv_tmp->refcount); + tvlv = tvlv_tmp; + break; + } + + return tvlv; +} + +/** + * batadv_tvlv_container_list_size - calculate the size of the tvlv container + * list entries + * @bat_priv: the bat priv with all the soft interface information + * + * Has to be called with the appropriate locks being acquired + * (tvlv.container_list_lock). + * + * Return: size of all currently registered tvlv containers in bytes. + */ +static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv) +{ + struct batadv_tvlv_container *tvlv; + u16 tvlv_len = 0; + + lockdep_assert_held(&bat_priv->tvlv.container_list_lock); + + hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) { + tvlv_len += sizeof(struct batadv_tvlv_hdr); + tvlv_len += ntohs(tvlv->tvlv_hdr.len); + } + + return tvlv_len; +} + +/** + * batadv_tvlv_container_remove - remove tvlv container from the tvlv container + * list + * @bat_priv: the bat priv with all the soft interface information + * @tvlv: the to be removed tvlv container + * + * Has to be called with the appropriate locks being acquired + * (tvlv.container_list_lock). + */ +static void batadv_tvlv_container_remove(struct batadv_priv *bat_priv, + struct batadv_tvlv_container *tvlv) +{ + lockdep_assert_held(&bat_priv->tvlv.container_list_lock); + + if (!tvlv) + return; + + hlist_del(&tvlv->list); + + /* first call to decrement the counter, second call to free */ + batadv_tvlv_container_put(tvlv); + batadv_tvlv_container_put(tvlv); +} + +/** + * batadv_tvlv_container_unregister - unregister tvlv container based on the + * provided type and version (both need to match) + * @bat_priv: the bat priv with all the soft interface information + * @type: tvlv container type to unregister + * @version: tvlv container type to unregister + */ +void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv, + u8 type, u8 version) +{ + struct batadv_tvlv_container *tvlv; + + spin_lock_bh(&bat_priv->tvlv.container_list_lock); + tvlv = batadv_tvlv_container_get(bat_priv, type, version); + batadv_tvlv_container_remove(bat_priv, tvlv); + spin_unlock_bh(&bat_priv->tvlv.container_list_lock); +} + +/** + * batadv_tvlv_container_register - register tvlv type, version and content + * to be propagated with each (primary interface) OGM + * @bat_priv: the bat priv with all the soft interface information + * @type: tvlv container type + * @version: tvlv container version + * @tvlv_value: tvlv container content + * @tvlv_value_len: tvlv container content length + * + * If a container of the same type and version was already registered the new + * content is going to replace the old one. + */ +void batadv_tvlv_container_register(struct batadv_priv *bat_priv, + u8 type, u8 version, + void *tvlv_value, u16 tvlv_value_len) +{ + struct batadv_tvlv_container *tvlv_old, *tvlv_new; + + if (!tvlv_value) + tvlv_value_len = 0; + + tvlv_new = kzalloc(sizeof(*tvlv_new) + tvlv_value_len, GFP_ATOMIC); + if (!tvlv_new) + return; + + tvlv_new->tvlv_hdr.version = version; + tvlv_new->tvlv_hdr.type = type; + tvlv_new->tvlv_hdr.len = htons(tvlv_value_len); + + memcpy(tvlv_new + 1, tvlv_value, ntohs(tvlv_new->tvlv_hdr.len)); + INIT_HLIST_NODE(&tvlv_new->list); + kref_init(&tvlv_new->refcount); + + spin_lock_bh(&bat_priv->tvlv.container_list_lock); + tvlv_old = batadv_tvlv_container_get(bat_priv, type, version); + batadv_tvlv_container_remove(bat_priv, tvlv_old); + hlist_add_head(&tvlv_new->list, &bat_priv->tvlv.container_list); + spin_unlock_bh(&bat_priv->tvlv.container_list_lock); +} + +/** + * batadv_tvlv_realloc_packet_buff - reallocate packet buffer to accommodate + * requested packet size + * @packet_buff: packet buffer + * @packet_buff_len: packet buffer size + * @min_packet_len: requested packet minimum size + * @additional_packet_len: requested additional packet size on top of minimum + * size + * + * Return: true of the packet buffer could be changed to the requested size, + * false otherwise. + */ +static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff, + int *packet_buff_len, + int min_packet_len, + int additional_packet_len) +{ + unsigned char *new_buff; + + new_buff = kmalloc(min_packet_len + additional_packet_len, GFP_ATOMIC); + + /* keep old buffer if kmalloc should fail */ + if (!new_buff) + return false; + + memcpy(new_buff, *packet_buff, min_packet_len); + kfree(*packet_buff); + *packet_buff = new_buff; + *packet_buff_len = min_packet_len + additional_packet_len; + + return true; +} + +/** + * batadv_tvlv_container_ogm_append - append tvlv container content to given + * OGM packet buffer + * @bat_priv: the bat priv with all the soft interface information + * @packet_buff: ogm packet buffer + * @packet_buff_len: ogm packet buffer size including ogm header and tvlv + * content + * @packet_min_len: ogm header size to be preserved for the OGM itself + * + * The ogm packet might be enlarged or shrunk depending on the current size + * and the size of the to-be-appended tvlv containers. + * + * Return: size of all appended tvlv containers in bytes. + */ +u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, + unsigned char **packet_buff, + int *packet_buff_len, int packet_min_len) +{ + struct batadv_tvlv_container *tvlv; + struct batadv_tvlv_hdr *tvlv_hdr; + u16 tvlv_value_len; + void *tvlv_value; + bool ret; + + spin_lock_bh(&bat_priv->tvlv.container_list_lock); + tvlv_value_len = batadv_tvlv_container_list_size(bat_priv); + + ret = batadv_tvlv_realloc_packet_buff(packet_buff, packet_buff_len, + packet_min_len, tvlv_value_len); + + if (!ret) + goto end; + + if (!tvlv_value_len) + goto end; + + tvlv_value = (*packet_buff) + packet_min_len; + + hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) { + tvlv_hdr = tvlv_value; + tvlv_hdr->type = tvlv->tvlv_hdr.type; + tvlv_hdr->version = tvlv->tvlv_hdr.version; + tvlv_hdr->len = tvlv->tvlv_hdr.len; + tvlv_value = tvlv_hdr + 1; + memcpy(tvlv_value, tvlv + 1, ntohs(tvlv->tvlv_hdr.len)); + tvlv_value = (u8 *)tvlv_value + ntohs(tvlv->tvlv_hdr.len); + } + +end: + spin_unlock_bh(&bat_priv->tvlv.container_list_lock); + return tvlv_value_len; +} + +/** + * batadv_tvlv_call_handler - parse the given tvlv buffer to call the + * appropriate handlers + * @bat_priv: the bat priv with all the soft interface information + * @tvlv_handler: tvlv callback function handling the tvlv content + * @ogm_source: flag indicating whether the tvlv is an ogm or a unicast packet + * @orig_node: orig node emitting the ogm packet + * @src: source mac address of the unicast packet + * @dst: destination mac address of the unicast packet + * @tvlv_value: tvlv content + * @tvlv_value_len: tvlv content length + * + * Return: success if handler was not found or the return value of the handler + * callback. + */ +static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, + struct batadv_tvlv_handler *tvlv_handler, + bool ogm_source, + struct batadv_orig_node *orig_node, + u8 *src, u8 *dst, + void *tvlv_value, u16 tvlv_value_len) +{ + if (!tvlv_handler) + return NET_RX_SUCCESS; + + if (ogm_source) { + if (!tvlv_handler->ogm_handler) + return NET_RX_SUCCESS; + + if (!orig_node) + return NET_RX_SUCCESS; + + tvlv_handler->ogm_handler(bat_priv, orig_node, + BATADV_NO_FLAGS, + tvlv_value, tvlv_value_len); + tvlv_handler->flags |= BATADV_TVLV_HANDLER_OGM_CALLED; + } else { + if (!src) + return NET_RX_SUCCESS; + + if (!dst) + return NET_RX_SUCCESS; + + if (!tvlv_handler->unicast_handler) + return NET_RX_SUCCESS; + + return tvlv_handler->unicast_handler(bat_priv, src, + dst, tvlv_value, + tvlv_value_len); + } + + return NET_RX_SUCCESS; +} + +/** + * batadv_tvlv_containers_process - parse the given tvlv buffer to call the + * appropriate handlers + * @bat_priv: the bat priv with all the soft interface information + * @ogm_source: flag indicating whether the tvlv is an ogm or a unicast packet + * @orig_node: orig node emitting the ogm packet + * @src: source mac address of the unicast packet + * @dst: destination mac address of the unicast packet + * @tvlv_value: tvlv content + * @tvlv_value_len: tvlv content length + * + * Return: success when processing an OGM or the return value of all called + * handler callbacks. + */ +int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, + bool ogm_source, + struct batadv_orig_node *orig_node, + u8 *src, u8 *dst, + void *tvlv_value, u16 tvlv_value_len) +{ + struct batadv_tvlv_handler *tvlv_handler; + struct batadv_tvlv_hdr *tvlv_hdr; + u16 tvlv_value_cont_len; + u8 cifnotfound = BATADV_TVLV_HANDLER_OGM_CIFNOTFND; + int ret = NET_RX_SUCCESS; + + while (tvlv_value_len >= sizeof(*tvlv_hdr)) { + tvlv_hdr = tvlv_value; + tvlv_value_cont_len = ntohs(tvlv_hdr->len); + tvlv_value = tvlv_hdr + 1; + tvlv_value_len -= sizeof(*tvlv_hdr); + + if (tvlv_value_cont_len > tvlv_value_len) + break; + + tvlv_handler = batadv_tvlv_handler_get(bat_priv, + tvlv_hdr->type, + tvlv_hdr->version); + + ret |= batadv_tvlv_call_handler(bat_priv, tvlv_handler, + ogm_source, orig_node, + src, dst, tvlv_value, + tvlv_value_cont_len); + if (tvlv_handler) + batadv_tvlv_handler_put(tvlv_handler); + tvlv_value = (u8 *)tvlv_value + tvlv_value_cont_len; + tvlv_value_len -= tvlv_value_cont_len; + } + + if (!ogm_source) + return ret; + + rcu_read_lock(); + hlist_for_each_entry_rcu(tvlv_handler, + &bat_priv->tvlv.handler_list, list) { + if ((tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) && + !(tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CALLED)) + tvlv_handler->ogm_handler(bat_priv, orig_node, + cifnotfound, NULL, 0); + + tvlv_handler->flags &= ~BATADV_TVLV_HANDLER_OGM_CALLED; + } + rcu_read_unlock(); + + return NET_RX_SUCCESS; +} + +/** + * batadv_tvlv_ogm_receive - process an incoming ogm and call the appropriate + * handlers + * @bat_priv: the bat priv with all the soft interface information + * @batadv_ogm_packet: ogm packet containing the tvlv containers + * @orig_node: orig node emitting the ogm packet + */ +void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv, + struct batadv_ogm_packet *batadv_ogm_packet, + struct batadv_orig_node *orig_node) +{ + void *tvlv_value; + u16 tvlv_value_len; + + if (!batadv_ogm_packet) + return; + + tvlv_value_len = ntohs(batadv_ogm_packet->tvlv_len); + if (!tvlv_value_len) + return; + + tvlv_value = batadv_ogm_packet + 1; + + batadv_tvlv_containers_process(bat_priv, true, orig_node, NULL, NULL, + tvlv_value, tvlv_value_len); +} + +/** + * batadv_tvlv_handler_register - register tvlv handler based on the provided + * type and version (both need to match) for ogm tvlv payload and/or unicast + * payload + * @bat_priv: the bat priv with all the soft interface information + * @optr: ogm tvlv handler callback function. This function receives the orig + * node, flags and the tvlv content as argument to process. + * @uptr: unicast tvlv handler callback function. This function receives the + * source & destination of the unicast packet as well as the tvlv content + * to process. + * @type: tvlv handler type to be registered + * @version: tvlv handler version to be registered + * @flags: flags to enable or disable TVLV API behavior + */ +void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, + void (*optr)(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig, + u8 flags, + void *tvlv_value, + u16 tvlv_value_len), + int (*uptr)(struct batadv_priv *bat_priv, + u8 *src, u8 *dst, + void *tvlv_value, + u16 tvlv_value_len), + u8 type, u8 version, u8 flags) +{ + struct batadv_tvlv_handler *tvlv_handler; + + tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version); + if (tvlv_handler) { + batadv_tvlv_handler_put(tvlv_handler); + return; + } + + tvlv_handler = kzalloc(sizeof(*tvlv_handler), GFP_ATOMIC); + if (!tvlv_handler) + return; + + tvlv_handler->ogm_handler = optr; + tvlv_handler->unicast_handler = uptr; + tvlv_handler->type = type; + tvlv_handler->version = version; + tvlv_handler->flags = flags; + kref_init(&tvlv_handler->refcount); + INIT_HLIST_NODE(&tvlv_handler->list); + + spin_lock_bh(&bat_priv->tvlv.handler_list_lock); + hlist_add_head_rcu(&tvlv_handler->list, &bat_priv->tvlv.handler_list); + spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); +} + +/** + * batadv_tvlv_handler_unregister - unregister tvlv handler based on the + * provided type and version (both need to match) + * @bat_priv: the bat priv with all the soft interface information + * @type: tvlv handler type to be unregistered + * @version: tvlv handler version to be unregistered + */ +void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, + u8 type, u8 version) +{ + struct batadv_tvlv_handler *tvlv_handler; + + tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version); + if (!tvlv_handler) + return; + + batadv_tvlv_handler_put(tvlv_handler); + spin_lock_bh(&bat_priv->tvlv.handler_list_lock); + hlist_del_rcu(&tvlv_handler->list); + spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); + batadv_tvlv_handler_put(tvlv_handler); +} + +/** + * batadv_tvlv_unicast_send - send a unicast packet with tvlv payload to the + * specified host + * @bat_priv: the bat priv with all the soft interface information + * @src: source mac address of the unicast packet + * @dst: destination mac address of the unicast packet + * @type: tvlv type + * @version: tvlv version + * @tvlv_value: tvlv content + * @tvlv_value_len: tvlv content length + */ +void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src, + u8 *dst, u8 type, u8 version, + void *tvlv_value, u16 tvlv_value_len) +{ + struct batadv_unicast_tvlv_packet *unicast_tvlv_packet; + struct batadv_tvlv_hdr *tvlv_hdr; + struct batadv_orig_node *orig_node; + struct sk_buff *skb; + unsigned char *tvlv_buff; + unsigned int tvlv_len; + ssize_t hdr_len = sizeof(*unicast_tvlv_packet); + + orig_node = batadv_orig_hash_find(bat_priv, dst); + if (!orig_node) + return; + + tvlv_len = sizeof(*tvlv_hdr) + tvlv_value_len; + + skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + hdr_len + tvlv_len); + if (!skb) + goto out; + + skb->priority = TC_PRIO_CONTROL; + skb_reserve(skb, ETH_HLEN); + tvlv_buff = skb_put(skb, sizeof(*unicast_tvlv_packet) + tvlv_len); + unicast_tvlv_packet = (struct batadv_unicast_tvlv_packet *)tvlv_buff; + unicast_tvlv_packet->packet_type = BATADV_UNICAST_TVLV; + unicast_tvlv_packet->version = BATADV_COMPAT_VERSION; + unicast_tvlv_packet->ttl = BATADV_TTL; + unicast_tvlv_packet->reserved = 0; + unicast_tvlv_packet->tvlv_len = htons(tvlv_len); + unicast_tvlv_packet->align = 0; + ether_addr_copy(unicast_tvlv_packet->src, src); + ether_addr_copy(unicast_tvlv_packet->dst, dst); + + tvlv_buff = (unsigned char *)(unicast_tvlv_packet + 1); + tvlv_hdr = (struct batadv_tvlv_hdr *)tvlv_buff; + tvlv_hdr->version = version; + tvlv_hdr->type = type; + tvlv_hdr->len = htons(tvlv_value_len); + tvlv_buff += sizeof(*tvlv_hdr); + memcpy(tvlv_buff, tvlv_value, tvlv_value_len); + + if (batadv_send_skb_to_orig(skb, orig_node, NULL) == NET_XMIT_DROP) + kfree_skb(skb); +out: + batadv_orig_node_put(orig_node); +} diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h new file mode 100644 index 000000000000..e4369b547b43 --- /dev/null +++ b/net/batman-adv/tvlv.h @@ -0,0 +1,61 @@ +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _NET_BATMAN_ADV_TVLV_H_ +#define _NET_BATMAN_ADV_TVLV_H_ + +#include "main.h" + +#include + +struct batadv_ogm_packet; + +void batadv_tvlv_container_register(struct batadv_priv *bat_priv, + u8 type, u8 version, + void *tvlv_value, u16 tvlv_value_len); +u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, + unsigned char **packet_buff, + int *packet_buff_len, int packet_min_len); +void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv, + struct batadv_ogm_packet *batadv_ogm_packet, + struct batadv_orig_node *orig_node); +void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv, + u8 type, u8 version); + +void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, + void (*optr)(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig, + u8 flags, + void *tvlv_value, + u16 tvlv_value_len), + int (*uptr)(struct batadv_priv *bat_priv, + u8 *src, u8 *dst, + void *tvlv_value, + u16 tvlv_value_len), + u8 type, u8 version, u8 flags); +void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, + u8 type, u8 version); +int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, + bool ogm_source, + struct batadv_orig_node *orig_node, + u8 *src, u8 *dst, + void *tvlv_buff, u16 tvlv_buff_len); +void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src, + u8 *dst, u8 type, u8 version, + void *tvlv_value, u16 tvlv_value_len); + +#endif /* _NET_BATMAN_ADV_TVLV_H_ */ -- cgit v1.2.3 From 687937ab34896d9c39b80b68d304c68ca3c2b207 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Tue, 10 May 2016 18:41:25 +0200 Subject: batman-adv: Add multicast optimization support for bridged setups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With this patch we are finally able to support multicast optimizations in bridged setups, too. So far, if a bridge was added on top of a soft-interface (e.g. bat0) the batman-adv multicast optimizations needed to be disabled to avoid packetloss. Current Linux bridge implementations and API can now provide us with the so far missing information about interested but "remote" multicast receivers behind bridge ports. The Linux bridge performs the detection of remote participants interested in multicast packets with its own and mature so called IGMP and MLD snooping code and stores that in its database. With the new API provided by the bridge batman-adv can now simply hook into this database. We then reliably announce the gathered multicast listeners to other nodes through the batman-adv translation table. Additionally, the Linux bridge provides us with the information about whether an IGMP/MLD querier exists. If there is none then we need to disable multicast optimizations as we cannot learn about multicast listeners on external, bridged-in host then. Tested-by: Simon Wunderlich Signed-off-by: Linus Lüssing Signed-off-by: Marek Lindner Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/Kconfig | 2 +- net/batman-adv/multicast.c | 181 ++++++++++++++++++++++++++++++++++++++++----- net/batman-adv/types.h | 13 ++++ 3 files changed, 177 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index b7ba97dbf4a9..833bb145ba3c 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -66,7 +66,7 @@ config BATMAN_ADV_NC config BATMAN_ADV_MCAST bool "Multicast optimisation" - depends on BATMAN_ADV && INET + depends on BATMAN_ADV && INET && !(BRIDGE=m && BATMAN_ADV=y) default n help This option enables the multicast optimisation which aims to diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 4673328def29..eb30316e22fa 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -36,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -45,18 +47,53 @@ #include #include #include +#include +#include #include #include "packet.h" #include "translation-table.h" +/** + * batadv_mcast_get_bridge - get the bridge on top of the softif if it exists + * @soft_iface: netdev struct of the mesh interface + * + * If the given soft interface has a bridge on top then the refcount + * of the according net device is increased. + * + * Return: NULL if no such bridge exists. Otherwise the net device of the + * bridge. + */ +static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface) +{ + struct net_device *upper = soft_iface; + + rcu_read_lock(); + do { + upper = netdev_master_upper_dev_get_rcu(upper); + } while (upper && !(upper->priv_flags & IFF_EBRIDGE)); + + if (upper) + dev_hold(upper); + rcu_read_unlock(); + + return upper; +} + /** * batadv_mcast_mla_softif_get - get softif multicast listeners * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into * - * Collect multicast addresses of the local multicast listeners - * on the given soft interface, dev, in the given mcast_list. + * Collects multicast addresses of multicast listeners residing + * on this kernel on the given soft interface, dev, in + * the given mcast_list. In general, multicast listeners provided by + * your multicast receiving applications run directly on this node. + * + * If there is a bridge interface on top of dev, collects from that one + * instead. Just like with IP addresses and routes, multicast listeners + * will(/should) register to the bridge interface instead of an + * enslaved bat0. * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. @@ -64,12 +101,13 @@ static int batadv_mcast_mla_softif_get(struct net_device *dev, struct hlist_head *mcast_list) { + struct net_device *bridge = batadv_mcast_get_bridge(dev); struct netdev_hw_addr *mc_list_entry; struct batadv_hw_addr *new; int ret = 0; - netif_addr_lock_bh(dev); - netdev_for_each_mc_addr(mc_list_entry, dev) { + netif_addr_lock_bh(bridge ? bridge : dev); + netdev_for_each_mc_addr(mc_list_entry, bridge ? bridge : dev) { new = kmalloc(sizeof(*new), GFP_ATOMIC); if (!new) { ret = -ENOMEM; @@ -80,7 +118,10 @@ static int batadv_mcast_mla_softif_get(struct net_device *dev, hlist_add_head(&new->list, mcast_list); ret++; } - netif_addr_unlock_bh(dev); + netif_addr_unlock_bh(bridge ? bridge : dev); + + if (bridge) + dev_put(bridge); return ret; } @@ -105,6 +146,83 @@ static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr, return false; } +/** + * batadv_mcast_mla_br_addr_cpy - copy a bridge multicast address + * @dst: destination to write to - a multicast MAC address + * @src: source to read from - a multicast IP address + * + * Converts a given multicast IPv4/IPv6 address from a bridge + * to its matching multicast MAC address and copies it into the given + * destination buffer. + * + * Caller needs to make sure the destination buffer can hold + * at least ETH_ALEN bytes. + */ +static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src) +{ + if (src->proto == htons(ETH_P_IP)) + ip_eth_mc_map(src->u.ip4, dst); +#if IS_ENABLED(CONFIG_IPV6) + else if (src->proto == htons(ETH_P_IPV6)) + ipv6_eth_mc_map(&src->u.ip6, dst); +#endif + else + eth_zero_addr(dst); +} + +/** + * batadv_mcast_mla_bridge_get - get bridged-in multicast listeners + * @dev: a bridge slave whose bridge to collect multicast addresses from + * @mcast_list: a list to put found addresses into + * + * Collects multicast addresses of multicast listeners residing + * on foreign, non-mesh devices which we gave access to our mesh via + * a bridge on top of the given soft interface, dev, in the given + * mcast_list. + * + * Return: -ENOMEM on memory allocation error or the number of + * items added to the mcast_list otherwise. + */ +static int batadv_mcast_mla_bridge_get(struct net_device *dev, + struct hlist_head *mcast_list) +{ + struct list_head bridge_mcast_list = LIST_HEAD_INIT(bridge_mcast_list); + struct br_ip_list *br_ip_entry, *tmp; + struct batadv_hw_addr *new; + u8 mcast_addr[ETH_ALEN]; + int ret; + + /* we don't need to detect these devices/listeners, the IGMP/MLD + * snooping code of the Linux bridge already does that for us + */ + ret = br_multicast_list_adjacent(dev, &bridge_mcast_list); + if (ret < 0) + goto out; + + list_for_each_entry(br_ip_entry, &bridge_mcast_list, list) { + batadv_mcast_mla_br_addr_cpy(mcast_addr, &br_ip_entry->addr); + if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) + continue; + + new = kmalloc(sizeof(*new), GFP_ATOMIC); + if (!new) { + ret = -ENOMEM; + break; + } + + ether_addr_copy(new->addr, mcast_addr); + hlist_add_head(&new->list, mcast_list); + } + +out: + list_for_each_entry_safe(br_ip_entry, tmp, &bridge_mcast_list, list) { + list_del(&br_ip_entry->list); + kfree(br_ip_entry); + } + + return ret; +} + /** * batadv_mcast_mla_list_free - free a list of multicast addresses * @bat_priv: the bat priv with all the soft interface information @@ -222,29 +340,51 @@ static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv) * Updates the own multicast tvlv with our current multicast related settings, * capabilities and inabilities. * - * Return: true if the tvlv container is registered afterwards. Otherwise - * returns false. + * Return: false if we want all IPv4 && IPv6 multicast traffic and true + * otherwise. */ static bool batadv_mcast_mla_tvlv_update(struct batadv_priv *bat_priv) { struct batadv_tvlv_mcast_data mcast_data; + struct batadv_mcast_querier_state querier4 = {false, false}; + struct batadv_mcast_querier_state querier6 = {false, false}; + struct net_device *dev = bat_priv->soft_iface; mcast_data.flags = BATADV_NO_FLAGS; memset(mcast_data.reserved, 0, sizeof(mcast_data.reserved)); - /* Avoid attaching MLAs, if there is a bridge on top of our soft - * interface, we don't support that yet (TODO) + bat_priv->mcast.bridged = batadv_mcast_has_bridge(bat_priv); + if (!bat_priv->mcast.bridged) + goto update; + +#if !IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING) + pr_warn_once("No bridge IGMP snooping compiled - multicast optimizations disabled\n"); +#endif + + querier4.exists = br_multicast_has_querier_anywhere(dev, ETH_P_IP); + querier4.shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IP); + + querier6.exists = br_multicast_has_querier_anywhere(dev, ETH_P_IPV6); + querier6.shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IPV6); + + mcast_data.flags |= BATADV_MCAST_WANT_ALL_UNSNOOPABLES; + + /* 1) If no querier exists at all, then multicast listeners on + * our local TT clients behind the bridge will keep silent. + * 2) If the selected querier is on one of our local TT clients, + * behind the bridge, then this querier might shadow multicast + * listeners on our local TT clients, behind this bridge. + * + * In both cases, we will signalize other batman nodes that + * we need all multicast traffic of the according protocol. */ - if (batadv_mcast_has_bridge(bat_priv)) { - if (bat_priv->mcast.enabled) { - batadv_tvlv_container_unregister(bat_priv, - BATADV_TVLV_MCAST, 2); - bat_priv->mcast.enabled = false; - } + if (!querier4.exists || querier4.shadowing) + mcast_data.flags |= BATADV_MCAST_WANT_ALL_IPV4; - return false; - } + if (!querier6.exists || querier6.shadowing) + mcast_data.flags |= BATADV_MCAST_WANT_ALL_IPV6; +update: if (!bat_priv->mcast.enabled || mcast_data.flags != bat_priv->mcast.flags) { batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 2, @@ -253,7 +393,8 @@ static bool batadv_mcast_mla_tvlv_update(struct batadv_priv *bat_priv) bat_priv->mcast.enabled = true; } - return true; + return !(mcast_data.flags & + (BATADV_MCAST_WANT_ALL_IPV4 + BATADV_MCAST_WANT_ALL_IPV6)); } /** @@ -276,6 +417,10 @@ void batadv_mcast_mla_update(struct batadv_priv *bat_priv) if (ret < 0) goto out; + ret = batadv_mcast_mla_bridge_get(soft_iface, &mcast_list); + if (ret < 0) + goto out; + update: batadv_mcast_mla_tt_retract(bat_priv, &mcast_list); batadv_mcast_mla_tt_add(bat_priv, &mcast_list); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 32c6d0e42fde..83303c2f4631 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -754,6 +754,17 @@ struct batadv_priv_dat { #endif #ifdef CONFIG_BATMAN_ADV_MCAST +/** + * struct batadv_mcast_querier_state - IGMP/MLD querier state when bridged + * @exists: whether a querier exists in the mesh + * @shadowing: if a querier exists, whether it is potentially shadowing + * multicast listeners (i.e. querier is behind our own bridge segment) + */ +struct batadv_mcast_querier_state { + bool exists; + bool shadowing; +}; + /** * struct batadv_priv_mcast - per mesh interface mcast data * @mla_list: list of multicast addresses we are currently announcing via TT @@ -763,6 +774,7 @@ struct batadv_priv_dat { * @want_all_ipv6_list: a list of orig_nodes wanting all IPv6 multicast traffic * @flags: the flags we have last sent in our mcast tvlv * @enabled: whether the multicast tvlv is currently enabled + * @bridged: whether the soft interface has a bridge on top * @num_disabled: number of nodes that have no mcast tvlv * @num_want_all_unsnoopables: number of nodes wanting unsnoopable IP traffic * @num_want_all_ipv4: counter for items in want_all_ipv4_list @@ -777,6 +789,7 @@ struct batadv_priv_mcast { struct hlist_head want_all_ipv6_list; u8 flags; bool enabled; + bool bridged; atomic_t num_disabled; atomic_t num_want_all_unsnoopables; atomic_t num_want_all_ipv4; -- cgit v1.2.3 From 01d350d14712d1e8dbf2b00c82d2fc7c48d34e04 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 15 May 2016 11:07:44 +0200 Subject: batman-adv: move bat_algo functions into a separate file The bat_algo functionality in main.c is mostly unrelated to the rest of the content. It still takes up a large portion of this source file (~15%, 103 lines). Moving it to a separate file makes it better visible as a main component of the batman-adv implementation and hides it less in the other helper functions in main.c. Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/Makefile | 1 + net/batman-adv/bat_algo.c | 140 +++++++++++++++++++++++++++++++++++++ net/batman-adv/bat_algo.h | 12 ++++ net/batman-adv/bat_v_ogm.c | 1 + net/batman-adv/debugfs.c | 1 + net/batman-adv/main.c | 105 +--------------------------- net/batman-adv/main.h | 4 -- net/batman-adv/originator.c | 1 + net/batman-adv/routing.c | 1 + net/batman-adv/soft-interface.c | 1 + net/batman-adv/translation-table.c | 1 + 11 files changed, 160 insertions(+), 108 deletions(-) create mode 100644 net/batman-adv/bat_algo.c (limited to 'net') diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index 5c6ece0cfc17..5260c17e2069 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -17,6 +17,7 @@ # obj-$(CONFIG_BATMAN_ADV) += batman-adv.o +batman-adv-y += bat_algo.o batman-adv-y += bat_iv_ogm.o batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v.o batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_elp.o diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c new file mode 100644 index 000000000000..610d4de0f6b0 --- /dev/null +++ b/net/batman-adv/bat_algo.c @@ -0,0 +1,140 @@ +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include "main.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "bat_algo.h" + +char batadv_routing_algo[20] = "BATMAN_IV"; +static struct hlist_head batadv_algo_list; + +/** + * batadv_algo_init - Initialize batman-adv algorithm management data structures + */ +void batadv_algo_init(void) +{ + INIT_HLIST_HEAD(&batadv_algo_list); +} + +static struct batadv_algo_ops *batadv_algo_get(char *name) +{ + struct batadv_algo_ops *bat_algo_ops = NULL, *bat_algo_ops_tmp; + + hlist_for_each_entry(bat_algo_ops_tmp, &batadv_algo_list, list) { + if (strcmp(bat_algo_ops_tmp->name, name) != 0) + continue; + + bat_algo_ops = bat_algo_ops_tmp; + break; + } + + return bat_algo_ops; +} + +int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops) +{ + struct batadv_algo_ops *bat_algo_ops_tmp; + + bat_algo_ops_tmp = batadv_algo_get(bat_algo_ops->name); + if (bat_algo_ops_tmp) { + pr_info("Trying to register already registered routing algorithm: %s\n", + bat_algo_ops->name); + return -EEXIST; + } + + /* all algorithms must implement all ops (for now) */ + if (!bat_algo_ops->bat_iface_enable || + !bat_algo_ops->bat_iface_disable || + !bat_algo_ops->bat_iface_update_mac || + !bat_algo_ops->bat_primary_iface_set || + !bat_algo_ops->bat_neigh_cmp || + !bat_algo_ops->bat_neigh_is_similar_or_better) { + pr_info("Routing algo '%s' does not implement required ops\n", + bat_algo_ops->name); + return -EINVAL; + } + + INIT_HLIST_NODE(&bat_algo_ops->list); + hlist_add_head(&bat_algo_ops->list, &batadv_algo_list); + + return 0; +} + +int batadv_algo_select(struct batadv_priv *bat_priv, char *name) +{ + struct batadv_algo_ops *bat_algo_ops; + + bat_algo_ops = batadv_algo_get(name); + if (!bat_algo_ops) + return -EINVAL; + + bat_priv->bat_algo_ops = bat_algo_ops; + + return 0; +} + +int batadv_algo_seq_print_text(struct seq_file *seq, void *offset) +{ + struct batadv_algo_ops *bat_algo_ops; + + seq_puts(seq, "Available routing algorithms:\n"); + + hlist_for_each_entry(bat_algo_ops, &batadv_algo_list, list) { + seq_printf(seq, " * %s\n", bat_algo_ops->name); + } + + return 0; +} + +static int batadv_param_set_ra(const char *val, const struct kernel_param *kp) +{ + struct batadv_algo_ops *bat_algo_ops; + char *algo_name = (char *)val; + size_t name_len = strlen(algo_name); + + if (name_len > 0 && algo_name[name_len - 1] == '\n') + algo_name[name_len - 1] = '\0'; + + bat_algo_ops = batadv_algo_get(algo_name); + if (!bat_algo_ops) { + pr_err("Routing algorithm '%s' is not supported\n", algo_name); + return -EINVAL; + } + + return param_set_copystring(algo_name, kp); +} + +static const struct kernel_param_ops batadv_param_ops_ra = { + .set = batadv_param_set_ra, + .get = param_get_string, +}; + +static struct kparam_string batadv_param_string_ra = { + .maxlen = sizeof(batadv_routing_algo), + .string = batadv_routing_algo, +}; + +module_param_cb(routing_algo, &batadv_param_ops_ra, &batadv_param_string_ra, + 0644); diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 36542962de7d..8c7e761ff23b 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -20,8 +20,20 @@ #include "main.h" +#include + +struct seq_file; + int batadv_iv_init(void); +extern char batadv_routing_algo[]; +extern struct list_head batadv_hardif_list; + +void batadv_algo_init(void); +int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops); +int batadv_algo_select(struct batadv_priv *bat_priv, char *name); +int batadv_algo_seq_print_text(struct seq_file *seq, void *offset); + #ifdef CONFIG_BATMAN_ADV_BATMAN_V int batadv_v_init(void); diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index ca5a679d112f..93e3d760bfe0 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -39,6 +39,7 @@ #include #include +#include "bat_algo.h" #include "hard-interface.h" #include "hash.h" #include "originator.h" diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index f187a8ff2184..227c84b9db03 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -44,6 +44,7 @@ #include #include +#include "bat_algo.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" #include "gateway_client.h" diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 225d63e0c711..c5a7cab0f567 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include @@ -68,8 +67,6 @@ struct list_head batadv_hardif_list; static int (*batadv_rx_handler[256])(struct sk_buff *, struct batadv_hard_iface *); -char batadv_routing_algo[20] = "BATMAN_IV"; -static struct hlist_head batadv_algo_list; unsigned char batadv_broadcast_addr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; @@ -80,7 +77,7 @@ static void batadv_recv_handler_init(void); static int __init batadv_init(void) { INIT_LIST_HEAD(&batadv_hardif_list); - INIT_HLIST_HEAD(&batadv_algo_list); + batadv_algo_init(); batadv_recv_handler_init(); @@ -535,76 +532,6 @@ void batadv_recv_handler_unregister(u8 packet_type) batadv_rx_handler[packet_type] = batadv_recv_unhandled_packet; } -static struct batadv_algo_ops *batadv_algo_get(char *name) -{ - struct batadv_algo_ops *bat_algo_ops = NULL, *bat_algo_ops_tmp; - - hlist_for_each_entry(bat_algo_ops_tmp, &batadv_algo_list, list) { - if (strcmp(bat_algo_ops_tmp->name, name) != 0) - continue; - - bat_algo_ops = bat_algo_ops_tmp; - break; - } - - return bat_algo_ops; -} - -int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops) -{ - struct batadv_algo_ops *bat_algo_ops_tmp; - - bat_algo_ops_tmp = batadv_algo_get(bat_algo_ops->name); - if (bat_algo_ops_tmp) { - pr_info("Trying to register already registered routing algorithm: %s\n", - bat_algo_ops->name); - return -EEXIST; - } - - /* all algorithms must implement all ops (for now) */ - if (!bat_algo_ops->bat_iface_enable || - !bat_algo_ops->bat_iface_disable || - !bat_algo_ops->bat_iface_update_mac || - !bat_algo_ops->bat_primary_iface_set || - !bat_algo_ops->bat_neigh_cmp || - !bat_algo_ops->bat_neigh_is_similar_or_better) { - pr_info("Routing algo '%s' does not implement required ops\n", - bat_algo_ops->name); - return -EINVAL; - } - - INIT_HLIST_NODE(&bat_algo_ops->list); - hlist_add_head(&bat_algo_ops->list, &batadv_algo_list); - - return 0; -} - -int batadv_algo_select(struct batadv_priv *bat_priv, char *name) -{ - struct batadv_algo_ops *bat_algo_ops; - - bat_algo_ops = batadv_algo_get(name); - if (!bat_algo_ops) - return -EINVAL; - - bat_priv->bat_algo_ops = bat_algo_ops; - - return 0; -} - -int batadv_algo_seq_print_text(struct seq_file *seq, void *offset) -{ - struct batadv_algo_ops *bat_algo_ops; - - seq_puts(seq, "Available routing algorithms:\n"); - - hlist_for_each_entry(bat_algo_ops, &batadv_algo_list, list) { - seq_printf(seq, " * %s\n", bat_algo_ops->name); - } - - return 0; -} - /** * batadv_skb_crc32 - calculate CRC32 of the whole packet and skip bytes in * the header @@ -691,36 +618,6 @@ bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid) return ap_isolation_enabled; } -static int batadv_param_set_ra(const char *val, const struct kernel_param *kp) -{ - struct batadv_algo_ops *bat_algo_ops; - char *algo_name = (char *)val; - size_t name_len = strlen(algo_name); - - if (name_len > 0 && algo_name[name_len - 1] == '\n') - algo_name[name_len - 1] = '\0'; - - bat_algo_ops = batadv_algo_get(algo_name); - if (!bat_algo_ops) { - pr_err("Routing algorithm '%s' is not supported\n", algo_name); - return -EINVAL; - } - - return param_set_copystring(algo_name, kp); -} - -static const struct kernel_param_ops batadv_param_ops_ra = { - .set = batadv_param_set_ra, - .get = param_get_string, -}; - -static struct kparam_string batadv_param_string_ra = { - .maxlen = sizeof(batadv_routing_algo), - .string = batadv_routing_algo, -}; - -module_param_cb(routing_algo, &batadv_param_ops_ra, &batadv_param_string_ra, - 0644); module_init(batadv_init); module_exit(batadv_exit); diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 3af6582aab8b..3ec62853e519 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -196,7 +196,6 @@ struct sk_buff; #define BATADV_PRINT_VID(vid) ((vid & BATADV_VLAN_HAS_TAG) ? \ (int)(vid & VLAN_VID_MASK) : -1) -extern char batadv_routing_algo[]; extern struct list_head batadv_hardif_list; extern unsigned char batadv_broadcast_addr[]; @@ -217,9 +216,6 @@ batadv_recv_handler_register(u8 packet_type, int (*recv_handler)(struct sk_buff *, struct batadv_hard_iface *)); void batadv_recv_handler_unregister(u8 packet_type); -int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops); -int batadv_algo_select(struct batadv_priv *bat_priv, char *name); -int batadv_algo_seq_print_text(struct seq_file *seq, void *offset); __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr); /** diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 076d258c92e1..592cbda283e3 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -34,6 +34,7 @@ #include #include +#include "bat_algo.h" #include "distributed-arp-table.h" #include "fragmentation.h" #include "gateway_client.h" diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 8cb459a57219..b9c7325ea0aa 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -34,6 +34,7 @@ #include #include +#include "bat_algo.h" #include "bitarray.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index b60999da134d..f75631e21e48 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -48,6 +48,7 @@ #include #include +#include "bat_algo.h" #include "bridge_loop_avoidance.h" #include "debugfs.h" #include "distributed-arp-table.h" diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 6c8d624e4581..5c3cf7ffc77e 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -44,6 +44,7 @@ #include #include +#include "bat_algo.h" #include "bridge_loop_avoidance.h" #include "hard-interface.h" #include "hash.h" -- cgit v1.2.3 From 72f7b2deafde895012f93fa4827d4b1307a138e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Tue, 10 May 2016 18:41:26 +0200 Subject: batman-adv: Adding logging of mcast flag changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With this patch changes relevant to a node's own multicast flags are printed to the 'mcast' log level. Tested-by: Simon Wunderlich Signed-off-by: Linus Lüssing Signed-off-by: Marek Lindner Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/main.h | 4 +- net/batman-adv/multicast.c | 133 +++++++++++++++++++++++++++++++++++++++- net/batman-adv/soft-interface.c | 4 ++ net/batman-adv/types.h | 4 ++ 4 files changed, 142 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index c356d91ed780..cd83e2824f70 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -231,6 +231,7 @@ __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr); * @BATADV_DBG_BLA: bridge loop avoidance messages * @BATADV_DBG_DAT: ARP snooping and DAT related messages * @BATADV_DBG_NC: network coding related messages + * @BATADV_DBG_MCAST: multicast related messages * @BATADV_DBG_ALL: the union of all the above log levels */ enum batadv_dbg_level { @@ -240,7 +241,8 @@ enum batadv_dbg_level { BATADV_DBG_BLA = BIT(3), BATADV_DBG_DAT = BIT(4), BATADV_DBG_NC = BIT(5), - BATADV_DBG_ALL = 63, + BATADV_DBG_MCAST = BIT(6), + BATADV_DBG_ALL = 127, }; #ifdef CONFIG_BATMAN_ADV_DEBUG diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index eb30316e22fa..2d1a896fb66b 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -333,6 +334,122 @@ static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv) return upper; } +/** + * batadv_mcast_querier_log - debug output regarding the querier status on link + * @bat_priv: the bat priv with all the soft interface information + * @str_proto: a string for the querier protocol (e.g. "IGMP" or "MLD") + * @old_state: the previous querier state on our link + * @new_state: the new querier state on our link + * + * Outputs debug messages to the logging facility with log level 'mcast' + * regarding changes to the querier status on the link which are relevant + * to our multicast optimizations. + * + * Usually this is about whether a querier appeared or vanished in + * our mesh or whether the querier is in the suboptimal position of being + * behind our local bridge segment: Snooping switches will directly + * forward listener reports to the querier, therefore batman-adv and + * the bridge will potentially not see these listeners - the querier is + * potentially shadowing listeners from us then. + * + * This is only interesting for nodes with a bridge on top of their + * soft interface. + */ +static void +batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto, + struct batadv_mcast_querier_state *old_state, + struct batadv_mcast_querier_state *new_state) +{ + if (!old_state->exists && new_state->exists) + batadv_info(bat_priv->soft_iface, "%s Querier appeared\n", + str_proto); + else if (old_state->exists && !new_state->exists) + batadv_info(bat_priv->soft_iface, + "%s Querier disappeared - multicast optimizations disabled\n", + str_proto); + else if (!bat_priv->mcast.bridged && !new_state->exists) + batadv_info(bat_priv->soft_iface, + "No %s Querier present - multicast optimizations disabled\n", + str_proto); + + if (new_state->exists) { + if ((!old_state->shadowing && new_state->shadowing) || + (!old_state->exists && new_state->shadowing)) + batadv_dbg(BATADV_DBG_MCAST, bat_priv, + "%s Querier is behind our bridged segment: Might shadow listeners\n", + str_proto); + else if (old_state->shadowing && !new_state->shadowing) + batadv_dbg(BATADV_DBG_MCAST, bat_priv, + "%s Querier is not behind our bridged segment\n", + str_proto); + } +} + +/** + * batadv_mcast_bridge_log - debug output for topology changes in bridged setups + * @bat_priv: the bat priv with all the soft interface information + * @bridged: a flag about whether the soft interface is currently bridged or not + * @querier_ipv4: (maybe) new status of a potential, selected IGMP querier + * @querier_ipv6: (maybe) new status of a potential, selected MLD querier + * + * If no bridges are ever used on this node, then this function does nothing. + * + * Otherwise this function outputs debug information to the 'mcast' log level + * which might be relevant to our multicast optimizations. + * + * More precisely, it outputs information when a bridge interface is added or + * removed from a soft interface. And when a bridge is present, it further + * outputs information about the querier state which is relevant for the + * multicast flags this node is going to set. + */ +static void +batadv_mcast_bridge_log(struct batadv_priv *bat_priv, bool bridged, + struct batadv_mcast_querier_state *querier_ipv4, + struct batadv_mcast_querier_state *querier_ipv6) +{ + if (!bat_priv->mcast.bridged && bridged) + batadv_dbg(BATADV_DBG_MCAST, bat_priv, + "Bridge added: Setting Unsnoopables(U)-flag\n"); + else if (bat_priv->mcast.bridged && !bridged) + batadv_dbg(BATADV_DBG_MCAST, bat_priv, + "Bridge removed: Unsetting Unsnoopables(U)-flag\n"); + + if (bridged) { + batadv_mcast_querier_log(bat_priv, "IGMP", + &bat_priv->mcast.querier_ipv4, + querier_ipv4); + batadv_mcast_querier_log(bat_priv, "MLD", + &bat_priv->mcast.querier_ipv6, + querier_ipv6); + } +} + +/** + * batadv_mcast_flags_logs - output debug information about mcast flag changes + * @bat_priv: the bat priv with all the soft interface information + * @flags: flags indicating the new multicast state + * + * Whenever the multicast flags this nodes announces changes (@mcast_flags vs. + * bat_priv->mcast.flags), this notifies userspace via the 'mcast' log level. + */ +static void batadv_mcast_flags_log(struct batadv_priv *bat_priv, u8 flags) +{ + u8 old_flags = bat_priv->mcast.flags; + char str_old_flags[] = "[...]"; + + sprintf(str_old_flags, "[%c%c%c]", + (old_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', + (old_flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', + (old_flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.'); + + batadv_dbg(BATADV_DBG_MCAST, bat_priv, + "Changing multicast flags from '%s' to '[%c%c%c]'\n", + bat_priv->mcast.enabled ? str_old_flags : "", + (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', + (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', + (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.'); +} + /** * batadv_mcast_mla_tvlv_update - update multicast tvlv * @bat_priv: the bat priv with all the soft interface information @@ -349,12 +466,13 @@ static bool batadv_mcast_mla_tvlv_update(struct batadv_priv *bat_priv) struct batadv_mcast_querier_state querier4 = {false, false}; struct batadv_mcast_querier_state querier6 = {false, false}; struct net_device *dev = bat_priv->soft_iface; + bool bridged; mcast_data.flags = BATADV_NO_FLAGS; memset(mcast_data.reserved, 0, sizeof(mcast_data.reserved)); - bat_priv->mcast.bridged = batadv_mcast_has_bridge(bat_priv); - if (!bat_priv->mcast.bridged) + bridged = batadv_mcast_has_bridge(bat_priv); + if (!bridged) goto update; #if !IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING) @@ -385,8 +503,19 @@ static bool batadv_mcast_mla_tvlv_update(struct batadv_priv *bat_priv) mcast_data.flags |= BATADV_MCAST_WANT_ALL_IPV6; update: + batadv_mcast_bridge_log(bat_priv, bridged, &querier4, &querier6); + + bat_priv->mcast.querier_ipv4.exists = querier4.exists; + bat_priv->mcast.querier_ipv4.shadowing = querier4.shadowing; + + bat_priv->mcast.querier_ipv6.exists = querier6.exists; + bat_priv->mcast.querier_ipv6.shadowing = querier6.shadowing; + + bat_priv->mcast.bridged = bridged; + if (!bat_priv->mcast.enabled || mcast_data.flags != bat_priv->mcast.flags) { + batadv_mcast_flags_log(bat_priv, mcast_data.flags); batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 2, &mcast_data, sizeof(mcast_data)); bat_priv->mcast.flags = mcast_data.flags; diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 81665b159a41..b60999da134d 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -808,6 +808,10 @@ static int batadv_softif_init_late(struct net_device *dev) atomic_set(&bat_priv->distributed_arp_table, 1); #endif #ifdef CONFIG_BATMAN_ADV_MCAST + bat_priv->mcast.querier_ipv4.exists = false; + bat_priv->mcast.querier_ipv4.shadowing = false; + bat_priv->mcast.querier_ipv6.exists = false; + bat_priv->mcast.querier_ipv6.shadowing = false; bat_priv->mcast.flags = BATADV_NO_FLAGS; atomic_set(&bat_priv->multicast_mode, 1); atomic_set(&bat_priv->mcast.num_disabled, 0); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 83303c2f4631..ab863a5ab2b8 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -772,6 +772,8 @@ struct batadv_mcast_querier_state { * multicast traffic * @want_all_ipv4_list: a list of orig_nodes wanting all IPv4 multicast traffic * @want_all_ipv6_list: a list of orig_nodes wanting all IPv6 multicast traffic + * @querier_ipv4: the current state of an IGMP querier in the mesh + * @querier_ipv6: the current state of an MLD querier in the mesh * @flags: the flags we have last sent in our mcast tvlv * @enabled: whether the multicast tvlv is currently enabled * @bridged: whether the soft interface has a bridge on top @@ -787,6 +789,8 @@ struct batadv_priv_mcast { struct hlist_head want_all_unsnoopables_list; struct hlist_head want_all_ipv4_list; struct hlist_head want_all_ipv6_list; + struct batadv_mcast_querier_state querier_ipv4; + struct batadv_mcast_querier_state querier_ipv6; u8 flags; bool enabled; bool bridged; -- cgit v1.2.3 From ba412080fb6461b5a40dbc5e44186ed029d67b8d Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 15 May 2016 23:48:31 +0200 Subject: batman-adv: Consolidate logging related functions There are several places in batman-adv which provide logging related functions. These should be grouped together in the log.* files to make them easier to find. Reported-by: Markus Pargmann Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/Makefile | 1 + net/batman-adv/bat_iv_ogm.c | 1 + net/batman-adv/bat_v_elp.c | 1 + net/batman-adv/bat_v_ogm.c | 1 + net/batman-adv/bitarray.c | 2 + net/batman-adv/bridge_loop_avoidance.c | 1 + net/batman-adv/debugfs.c | 216 +----------------------------- net/batman-adv/distributed-arp-table.c | 1 + net/batman-adv/gateway_client.c | 1 + net/batman-adv/gateway_common.c | 1 + net/batman-adv/hard-interface.c | 1 + net/batman-adv/icmp_socket.c | 1 + net/batman-adv/log.c | 231 +++++++++++++++++++++++++++++++++ net/batman-adv/log.h | 109 ++++++++++++++++ net/batman-adv/main.c | 2 + net/batman-adv/main.h | 69 +--------- net/batman-adv/multicast.c | 1 + net/batman-adv/network-coding.c | 1 + net/batman-adv/originator.c | 1 + net/batman-adv/routing.c | 1 + net/batman-adv/send.c | 1 + net/batman-adv/sysfs.c | 1 + net/batman-adv/translation-table.c | 1 + 23 files changed, 364 insertions(+), 282 deletions(-) create mode 100644 net/batman-adv/log.c create mode 100644 net/batman-adv/log.h (limited to 'net') diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index 5260c17e2069..a55f4ec97068 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -32,6 +32,7 @@ batman-adv-y += gateway_common.o batman-adv-y += hard-interface.o batman-adv-y += hash.o batman-adv-y += icmp_socket.o +batman-adv-$(CONFIG_BATMAN_ADV_DEBUG) += log.o batman-adv-y += main.o batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o batman-adv-$(CONFIG_BATMAN_ADV_NC) += network-coding.o diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 948a5b45474d..805532a95860 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -52,6 +52,7 @@ #include "bitarray.h" #include "hard-interface.h" #include "hash.h" +#include "log.h" #include "network-coding.h" #include "originator.h" #include "packet.h" diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index cf0262becd08..15cf2726d6a5 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -43,6 +43,7 @@ #include "bat_algo.h" #include "bat_v_ogm.h" #include "hard-interface.h" +#include "log.h" #include "originator.h" #include "packet.h" #include "routing.h" diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 93e3d760bfe0..7ac9e0b30618 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -42,6 +42,7 @@ #include "bat_algo.h" #include "hard-interface.h" #include "hash.h" +#include "log.h" #include "originator.h" #include "packet.h" #include "routing.h" diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index a0c7913837a5..032271421a20 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -20,6 +20,8 @@ #include +#include "log.h" + /* shift the packet array by n places. */ static void batadv_bitmap_shift_left(unsigned long *seq_bits, s32 n) { diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 748a9ead7ce5..e4f7494fb974 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -48,6 +48,7 @@ #include "hard-interface.h" #include "hash.h" +#include "log.h" #include "originator.h" #include "packet.h" #include "sysfs.h" diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 227c84b9db03..1d68b6e63b96 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -18,37 +18,26 @@ #include "debugfs.h" #include "main.h" -#include #include #include #include #include -#include #include -#include -#include -#include #include -#include #include #include /* for linux/wait.h */ #include -#include -#include #include #include #include #include -#include -#include -#include -#include #include "bat_algo.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" #include "gateway_client.h" #include "icmp_socket.h" +#include "log.h" #include "multicast.h" #include "network-coding.h" #include "originator.h" @@ -56,209 +45,6 @@ static struct dentry *batadv_debugfs; -#ifdef CONFIG_BATMAN_ADV_DEBUG -#define BATADV_LOG_BUFF_MASK (batadv_log_buff_len - 1) - -static const int batadv_log_buff_len = BATADV_LOG_BUF_LEN; - -static char *batadv_log_char_addr(struct batadv_priv_debug_log *debug_log, - size_t idx) -{ - return &debug_log->log_buff[idx & BATADV_LOG_BUFF_MASK]; -} - -static void batadv_emit_log_char(struct batadv_priv_debug_log *debug_log, - char c) -{ - char *char_addr; - - char_addr = batadv_log_char_addr(debug_log, debug_log->log_end); - *char_addr = c; - debug_log->log_end++; - - if (debug_log->log_end - debug_log->log_start > batadv_log_buff_len) - debug_log->log_start = debug_log->log_end - batadv_log_buff_len; -} - -__printf(2, 3) -static int batadv_fdebug_log(struct batadv_priv_debug_log *debug_log, - const char *fmt, ...) -{ - va_list args; - static char debug_log_buf[256]; - char *p; - - if (!debug_log) - return 0; - - spin_lock_bh(&debug_log->lock); - va_start(args, fmt); - vscnprintf(debug_log_buf, sizeof(debug_log_buf), fmt, args); - va_end(args); - - for (p = debug_log_buf; *p != 0; p++) - batadv_emit_log_char(debug_log, *p); - - spin_unlock_bh(&debug_log->lock); - - wake_up(&debug_log->queue_wait); - - return 0; -} - -int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) -{ - va_list args; - char tmp_log_buf[256]; - - va_start(args, fmt); - vscnprintf(tmp_log_buf, sizeof(tmp_log_buf), fmt, args); - batadv_fdebug_log(bat_priv->debug_log, "[%10u] %s", - jiffies_to_msecs(jiffies), tmp_log_buf); - va_end(args); - - return 0; -} - -static int batadv_log_open(struct inode *inode, struct file *file) -{ - if (!try_module_get(THIS_MODULE)) - return -EBUSY; - - nonseekable_open(inode, file); - file->private_data = inode->i_private; - return 0; -} - -static int batadv_log_release(struct inode *inode, struct file *file) -{ - module_put(THIS_MODULE); - return 0; -} - -static bool batadv_log_empty(struct batadv_priv_debug_log *debug_log) -{ - return !(debug_log->log_start - debug_log->log_end); -} - -static ssize_t batadv_log_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - struct batadv_priv *bat_priv = file->private_data; - struct batadv_priv_debug_log *debug_log = bat_priv->debug_log; - int error, i = 0; - char *char_addr; - char c; - - if ((file->f_flags & O_NONBLOCK) && batadv_log_empty(debug_log)) - return -EAGAIN; - - if (!buf) - return -EINVAL; - - if (count == 0) - return 0; - - if (!access_ok(VERIFY_WRITE, buf, count)) - return -EFAULT; - - error = wait_event_interruptible(debug_log->queue_wait, - (!batadv_log_empty(debug_log))); - - if (error) - return error; - - spin_lock_bh(&debug_log->lock); - - while ((!error) && (i < count) && - (debug_log->log_start != debug_log->log_end)) { - char_addr = batadv_log_char_addr(debug_log, - debug_log->log_start); - c = *char_addr; - - debug_log->log_start++; - - spin_unlock_bh(&debug_log->lock); - - error = __put_user(c, buf); - - spin_lock_bh(&debug_log->lock); - - buf++; - i++; - } - - spin_unlock_bh(&debug_log->lock); - - if (!error) - return i; - - return error; -} - -static unsigned int batadv_log_poll(struct file *file, poll_table *wait) -{ - struct batadv_priv *bat_priv = file->private_data; - struct batadv_priv_debug_log *debug_log = bat_priv->debug_log; - - poll_wait(file, &debug_log->queue_wait, wait); - - if (!batadv_log_empty(debug_log)) - return POLLIN | POLLRDNORM; - - return 0; -} - -static const struct file_operations batadv_log_fops = { - .open = batadv_log_open, - .release = batadv_log_release, - .read = batadv_log_read, - .poll = batadv_log_poll, - .llseek = no_llseek, -}; - -static int batadv_debug_log_setup(struct batadv_priv *bat_priv) -{ - struct dentry *d; - - if (!bat_priv->debug_dir) - goto err; - - bat_priv->debug_log = kzalloc(sizeof(*bat_priv->debug_log), GFP_ATOMIC); - if (!bat_priv->debug_log) - goto err; - - spin_lock_init(&bat_priv->debug_log->lock); - init_waitqueue_head(&bat_priv->debug_log->queue_wait); - - d = debugfs_create_file("log", S_IFREG | S_IRUSR, - bat_priv->debug_dir, bat_priv, - &batadv_log_fops); - if (!d) - goto err; - - return 0; - -err: - return -ENOMEM; -} - -static void batadv_debug_log_cleanup(struct batadv_priv *bat_priv) -{ - kfree(bat_priv->debug_log); - bat_priv->debug_log = NULL; -} -#else /* CONFIG_BATMAN_ADV_DEBUG */ -static int batadv_debug_log_setup(struct batadv_priv *bat_priv) -{ - return 0; -} - -static void batadv_debug_log_cleanup(struct batadv_priv *bat_priv) -{ -} -#endif - static int batadv_algorithms_open(struct inode *inode, struct file *file) { return single_open(file, batadv_algo_seq_print_text, NULL); diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 584b82744699..fa7646532a13 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -45,6 +45,7 @@ #include "hard-interface.h" #include "hash.h" +#include "log.h" #include "originator.h" #include "send.h" #include "translation-table.h" diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 18c3715e5e27..63a805d3f96e 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -42,6 +42,7 @@ #include "gateway_common.h" #include "hard-interface.h" +#include "log.h" #include "originator.h" #include "packet.h" #include "routing.h" diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 6a6f2d4987e5..d7bc6a87bcc9 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -28,6 +28,7 @@ #include #include "gateway_client.h" +#include "log.h" #include "packet.h" #include "tvlv.h" diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index a3483f60c3a1..ad2c37c5583b 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -42,6 +42,7 @@ #include "debugfs.h" #include "distributed-arp-table.h" #include "gateway_client.h" +#include "log.h" #include "originator.h" #include "packet.h" #include "send.h" diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index 777aea10cd8f..378cc1119d66 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -45,6 +45,7 @@ #include #include "hard-interface.h" +#include "log.h" #include "originator.h" #include "packet.h" #include "send.h" diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c new file mode 100644 index 000000000000..56dc532f7a2c --- /dev/null +++ b/net/batman-adv/log.c @@ -0,0 +1,231 @@ +/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: + * + * Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include "log.h" +#include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for linux/wait.h */ +#include +#include +#include +#include +#include +#include +#include +#include + +#define BATADV_LOG_BUFF_MASK (batadv_log_buff_len - 1) + +static const int batadv_log_buff_len = BATADV_LOG_BUF_LEN; + +static char *batadv_log_char_addr(struct batadv_priv_debug_log *debug_log, + size_t idx) +{ + return &debug_log->log_buff[idx & BATADV_LOG_BUFF_MASK]; +} + +static void batadv_emit_log_char(struct batadv_priv_debug_log *debug_log, + char c) +{ + char *char_addr; + + char_addr = batadv_log_char_addr(debug_log, debug_log->log_end); + *char_addr = c; + debug_log->log_end++; + + if (debug_log->log_end - debug_log->log_start > batadv_log_buff_len) + debug_log->log_start = debug_log->log_end - batadv_log_buff_len; +} + +__printf(2, 3) +static int batadv_fdebug_log(struct batadv_priv_debug_log *debug_log, + const char *fmt, ...) +{ + va_list args; + static char debug_log_buf[256]; + char *p; + + if (!debug_log) + return 0; + + spin_lock_bh(&debug_log->lock); + va_start(args, fmt); + vscnprintf(debug_log_buf, sizeof(debug_log_buf), fmt, args); + va_end(args); + + for (p = debug_log_buf; *p != 0; p++) + batadv_emit_log_char(debug_log, *p); + + spin_unlock_bh(&debug_log->lock); + + wake_up(&debug_log->queue_wait); + + return 0; +} + +int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) +{ + va_list args; + char tmp_log_buf[256]; + + va_start(args, fmt); + vscnprintf(tmp_log_buf, sizeof(tmp_log_buf), fmt, args); + batadv_fdebug_log(bat_priv->debug_log, "[%10u] %s", + jiffies_to_msecs(jiffies), tmp_log_buf); + va_end(args); + + return 0; +} + +static int batadv_log_open(struct inode *inode, struct file *file) +{ + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + + nonseekable_open(inode, file); + file->private_data = inode->i_private; + return 0; +} + +static int batadv_log_release(struct inode *inode, struct file *file) +{ + module_put(THIS_MODULE); + return 0; +} + +static bool batadv_log_empty(struct batadv_priv_debug_log *debug_log) +{ + return !(debug_log->log_start - debug_log->log_end); +} + +static ssize_t batadv_log_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct batadv_priv *bat_priv = file->private_data; + struct batadv_priv_debug_log *debug_log = bat_priv->debug_log; + int error, i = 0; + char *char_addr; + char c; + + if ((file->f_flags & O_NONBLOCK) && batadv_log_empty(debug_log)) + return -EAGAIN; + + if (!buf) + return -EINVAL; + + if (count == 0) + return 0; + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + + error = wait_event_interruptible(debug_log->queue_wait, + (!batadv_log_empty(debug_log))); + + if (error) + return error; + + spin_lock_bh(&debug_log->lock); + + while ((!error) && (i < count) && + (debug_log->log_start != debug_log->log_end)) { + char_addr = batadv_log_char_addr(debug_log, + debug_log->log_start); + c = *char_addr; + + debug_log->log_start++; + + spin_unlock_bh(&debug_log->lock); + + error = __put_user(c, buf); + + spin_lock_bh(&debug_log->lock); + + buf++; + i++; + } + + spin_unlock_bh(&debug_log->lock); + + if (!error) + return i; + + return error; +} + +static unsigned int batadv_log_poll(struct file *file, poll_table *wait) +{ + struct batadv_priv *bat_priv = file->private_data; + struct batadv_priv_debug_log *debug_log = bat_priv->debug_log; + + poll_wait(file, &debug_log->queue_wait, wait); + + if (!batadv_log_empty(debug_log)) + return POLLIN | POLLRDNORM; + + return 0; +} + +static const struct file_operations batadv_log_fops = { + .open = batadv_log_open, + .release = batadv_log_release, + .read = batadv_log_read, + .poll = batadv_log_poll, + .llseek = no_llseek, +}; + +int batadv_debug_log_setup(struct batadv_priv *bat_priv) +{ + struct dentry *d; + + if (!bat_priv->debug_dir) + goto err; + + bat_priv->debug_log = kzalloc(sizeof(*bat_priv->debug_log), GFP_ATOMIC); + if (!bat_priv->debug_log) + goto err; + + spin_lock_init(&bat_priv->debug_log->lock); + init_waitqueue_head(&bat_priv->debug_log->queue_wait); + + d = debugfs_create_file("log", S_IFREG | S_IRUSR, + bat_priv->debug_dir, bat_priv, + &batadv_log_fops); + if (!d) + goto err; + + return 0; + +err: + return -ENOMEM; +} + +void batadv_debug_log_cleanup(struct batadv_priv *bat_priv) +{ + kfree(bat_priv->debug_log); + bat_priv->debug_log = NULL; +} diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h new file mode 100644 index 000000000000..9948e56eabaa --- /dev/null +++ b/net/batman-adv/log.h @@ -0,0 +1,109 @@ +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _NET_BATMAN_ADV_LOG_H_ +#define _NET_BATMAN_ADV_LOG_H_ + +#include "main.h" + +#include +#include +#include + +#ifdef CONFIG_BATMAN_ADV_DEBUG + +int batadv_debug_log_setup(struct batadv_priv *bat_priv); +void batadv_debug_log_cleanup(struct batadv_priv *bat_priv); + +#else + +static inline int batadv_debug_log_setup(struct batadv_priv *bat_priv) +{ + return 0; +} + +static inline void batadv_debug_log_cleanup(struct batadv_priv *bat_priv) +{ +} + +#endif + +/** + * enum batadv_dbg_level - available log levels + * @BATADV_DBG_BATMAN: OGM and TQ computations related messages + * @BATADV_DBG_ROUTES: route added / changed / deleted + * @BATADV_DBG_TT: translation table messages + * @BATADV_DBG_BLA: bridge loop avoidance messages + * @BATADV_DBG_DAT: ARP snooping and DAT related messages + * @BATADV_DBG_NC: network coding related messages + * @BATADV_DBG_MCAST: multicast related messages + * @BATADV_DBG_ALL: the union of all the above log levels + */ +enum batadv_dbg_level { + BATADV_DBG_BATMAN = BIT(0), + BATADV_DBG_ROUTES = BIT(1), + BATADV_DBG_TT = BIT(2), + BATADV_DBG_BLA = BIT(3), + BATADV_DBG_DAT = BIT(4), + BATADV_DBG_NC = BIT(5), + BATADV_DBG_MCAST = BIT(6), + BATADV_DBG_ALL = 127, +}; + +#ifdef CONFIG_BATMAN_ADV_DEBUG +int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) +__printf(2, 3); + +/* possibly ratelimited debug output */ +#define _batadv_dbg(type, bat_priv, ratelimited, fmt, arg...) \ + do { \ + if (atomic_read(&bat_priv->log_level) & type && \ + (!ratelimited || net_ratelimit())) \ + batadv_debug_log(bat_priv, fmt, ## arg);\ + } \ + while (0) +#else /* !CONFIG_BATMAN_ADV_DEBUG */ +__printf(4, 5) +static inline void _batadv_dbg(int type __always_unused, + struct batadv_priv *bat_priv __always_unused, + int ratelimited __always_unused, + const char *fmt __always_unused, ...) +{ +} +#endif + +#define batadv_dbg(type, bat_priv, arg...) \ + _batadv_dbg(type, bat_priv, 0, ## arg) +#define batadv_dbg_ratelimited(type, bat_priv, arg...) \ + _batadv_dbg(type, bat_priv, 1, ## arg) + +#define batadv_info(net_dev, fmt, arg...) \ + do { \ + struct net_device *_netdev = (net_dev); \ + struct batadv_priv *_batpriv = netdev_priv(_netdev); \ + batadv_dbg(BATADV_DBG_ALL, _batpriv, fmt, ## arg); \ + pr_info("%s: " fmt, _netdev->name, ## arg); \ + } while (0) +#define batadv_err(net_dev, fmt, arg...) \ + do { \ + struct net_device *_netdev = (net_dev); \ + struct batadv_priv *_batpriv = netdev_priv(_netdev); \ + batadv_dbg(BATADV_DBG_ALL, _batpriv, fmt, ## arg); \ + pr_err("%s: " fmt, _netdev->name, ## arg); \ + } while (0) + +#endif /* _NET_BATMAN_ADV_LOG_H_ */ diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index c5a7cab0f567..05e559c86e82 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -52,6 +53,7 @@ #include "gateway_common.h" #include "hard-interface.h" #include "icmp_socket.h" +#include "log.h" #include "multicast.h" #include "network-coding.h" #include "originator.h" diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 3ec62853e519..857fb5a4e37a 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -175,7 +175,6 @@ enum batadv_uev_type { /* Kernel headers */ -#include #include /* for packet.h */ #include #include @@ -183,13 +182,13 @@ enum batadv_uev_type { #include /* for packet.h */ #include #include -#include #include -#include #include #include "types.h" +struct net_device; +struct packet_type; struct seq_file; struct sk_buff; @@ -218,70 +217,6 @@ batadv_recv_handler_register(u8 packet_type, void batadv_recv_handler_unregister(u8 packet_type); __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr); -/** - * enum batadv_dbg_level - available log levels - * @BATADV_DBG_BATMAN: OGM and TQ computations related messages - * @BATADV_DBG_ROUTES: route added / changed / deleted - * @BATADV_DBG_TT: translation table messages - * @BATADV_DBG_BLA: bridge loop avoidance messages - * @BATADV_DBG_DAT: ARP snooping and DAT related messages - * @BATADV_DBG_NC: network coding related messages - * @BATADV_DBG_MCAST: multicast related messages - * @BATADV_DBG_ALL: the union of all the above log levels - */ -enum batadv_dbg_level { - BATADV_DBG_BATMAN = BIT(0), - BATADV_DBG_ROUTES = BIT(1), - BATADV_DBG_TT = BIT(2), - BATADV_DBG_BLA = BIT(3), - BATADV_DBG_DAT = BIT(4), - BATADV_DBG_NC = BIT(5), - BATADV_DBG_MCAST = BIT(6), - BATADV_DBG_ALL = 127, -}; - -#ifdef CONFIG_BATMAN_ADV_DEBUG -int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) -__printf(2, 3); - -/* possibly ratelimited debug output */ -#define _batadv_dbg(type, bat_priv, ratelimited, fmt, arg...) \ - do { \ - if (atomic_read(&bat_priv->log_level) & type && \ - (!ratelimited || net_ratelimit())) \ - batadv_debug_log(bat_priv, fmt, ## arg);\ - } \ - while (0) -#else /* !CONFIG_BATMAN_ADV_DEBUG */ -__printf(4, 5) -static inline void _batadv_dbg(int type __always_unused, - struct batadv_priv *bat_priv __always_unused, - int ratelimited __always_unused, - const char *fmt __always_unused, ...) -{ -} -#endif - -#define batadv_dbg(type, bat_priv, arg...) \ - _batadv_dbg(type, bat_priv, 0, ## arg) -#define batadv_dbg_ratelimited(type, bat_priv, arg...) \ - _batadv_dbg(type, bat_priv, 1, ## arg) - -#define batadv_info(net_dev, fmt, arg...) \ - do { \ - struct net_device *_netdev = (net_dev); \ - struct batadv_priv *_batpriv = netdev_priv(_netdev); \ - batadv_dbg(BATADV_DBG_ALL, _batpriv, fmt, ## arg); \ - pr_info("%s: " fmt, _netdev->name, ## arg); \ - } while (0) -#define batadv_err(net_dev, fmt, arg...) \ - do { \ - struct net_device *_netdev = (net_dev); \ - struct batadv_priv *_batpriv = netdev_priv(_netdev); \ - batadv_dbg(BATADV_DBG_ALL, _batpriv, fmt, ## arg); \ - pr_err("%s: " fmt, _netdev->name, ## arg); \ - } while (0) - /** * batadv_compare_eth - Compare two not u16 aligned Ethernet addresses * @data1: Pointer to a six-byte array containing the Ethernet address diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 0e7d78f4f1b8..cc915073a753 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -55,6 +55,7 @@ #include "hard-interface.h" #include "hash.h" +#include "log.h" #include "packet.h" #include "translation-table.h" #include "tvlv.h" diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index d0383dea6440..293ef4ffd4e1 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -51,6 +51,7 @@ #include "hard-interface.h" #include "hash.h" +#include "log.h" #include "originator.h" #include "packet.h" #include "routing.h" diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 592cbda283e3..8ad17ad477e4 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -40,6 +40,7 @@ #include "gateway_client.h" #include "hard-interface.h" #include "hash.h" +#include "log.h" #include "multicast.h" #include "network-coding.h" #include "routing.h" diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index b9c7325ea0aa..a5b53a3fc2ba 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -41,6 +41,7 @@ #include "fragmentation.h" #include "hard-interface.h" #include "icmp_socket.h" +#include "log.h" #include "network-coding.h" #include "originator.h" #include "packet.h" diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 4e49454dfed4..3a59df26ee32 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -42,6 +42,7 @@ #include "fragmentation.h" #include "gateway_client.h" #include "hard-interface.h" +#include "log.h" #include "network-coding.h" #include "originator.h" #include "routing.h" diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index 6244a9a336d0..1a7942ddf730 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -43,6 +43,7 @@ #include "gateway_client.h" #include "gateway_common.h" #include "hard-interface.h" +#include "log.h" #include "network-coding.h" #include "packet.h" #include "soft-interface.h" diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 5c3cf7ffc77e..53458d6fb87d 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -48,6 +48,7 @@ #include "bridge_loop_avoidance.h" #include "hard-interface.h" #include "hash.h" +#include "log.h" #include "multicast.h" #include "originator.h" #include "packet.h" -- cgit v1.2.3 From 4e3e823b5a503235630921287f130e1d8d22d200 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Tue, 10 May 2016 18:41:27 +0200 Subject: batman-adv: Add debugfs table for mcast flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds a debugfs table with originators and their according multicast flags to help users figure out why multicast optimizations might be enabled or disabled for them. Tested-by: Simon Wunderlich Signed-off-by: Linus Lüssing Signed-off-by: Marek Lindner Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/debugfs.c | 23 ++++++++++ net/batman-adv/multicast.c | 104 +++++++++++++++++++++++++++++++++++++++++++++ net/batman-adv/multicast.h | 3 ++ 3 files changed, 130 insertions(+) (limited to 'net') diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 952900466d88..f187a8ff2184 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -48,6 +48,7 @@ #include "distributed-arp-table.h" #include "gateway_client.h" #include "icmp_socket.h" +#include "multicast.h" #include "network-coding.h" #include "originator.h" #include "translation-table.h" @@ -363,6 +364,22 @@ static int batadv_nc_nodes_open(struct inode *inode, struct file *file) } #endif +#ifdef CONFIG_BATMAN_ADV_MCAST +/** + * batadv_mcast_flags_open - prepare file handler for reads from mcast_flags + * @inode: inode which was opened + * @file: file handle to be initialized + * + * Return: 0 on success or negative error number in case of failure + */ +static int batadv_mcast_flags_open(struct inode *inode, struct file *file) +{ + struct net_device *net_dev = (struct net_device *)inode->i_private; + + return single_open(file, batadv_mcast_flags_seq_print_text, net_dev); +} +#endif + #define BATADV_DEBUGINFO(_name, _mode, _open) \ struct batadv_debuginfo batadv_debuginfo_##_name = { \ .attr = { \ @@ -407,6 +424,9 @@ static BATADV_DEBUGINFO(transtable_local, S_IRUGO, #ifdef CONFIG_BATMAN_ADV_NC static BATADV_DEBUGINFO(nc_nodes, S_IRUGO, batadv_nc_nodes_open); #endif +#ifdef CONFIG_BATMAN_ADV_MCAST +static BATADV_DEBUGINFO(mcast_flags, S_IRUGO, batadv_mcast_flags_open); +#endif static struct batadv_debuginfo *batadv_mesh_debuginfos[] = { &batadv_debuginfo_neighbors, @@ -423,6 +443,9 @@ static struct batadv_debuginfo *batadv_mesh_debuginfos[] = { &batadv_debuginfo_transtable_local, #ifdef CONFIG_BATMAN_ADV_NC &batadv_debuginfo_nc_nodes, +#endif +#ifdef CONFIG_BATMAN_ADV_MCAST + &batadv_debuginfo_mcast_flags, #endif NULL, }; diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 2d1a896fb66b..d3222db60fd0 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -52,6 +53,8 @@ #include #include +#include "hard-interface.h" +#include "hash.h" #include "packet.h" #include "translation-table.h" @@ -1129,6 +1132,107 @@ void batadv_mcast_init(struct batadv_priv *bat_priv) BATADV_TVLV_HANDLER_OGM_CIFNOTFND); } +/** + * batadv_mcast_flags_print_header - print own mcast flags to debugfs table + * @bat_priv: the bat priv with all the soft interface information + * @seq: debugfs table seq_file struct + * + * Prints our own multicast flags including a more specific reason why + * they are set, that is prints the bridge and querier state too, to + * the debugfs table specified via @seq. + */ +static void batadv_mcast_flags_print_header(struct batadv_priv *bat_priv, + struct seq_file *seq) +{ + u8 flags = bat_priv->mcast.flags; + char querier4, querier6, shadowing4, shadowing6; + bool bridged = bat_priv->mcast.bridged; + + if (bridged) { + querier4 = bat_priv->mcast.querier_ipv4.exists ? '.' : '4'; + querier6 = bat_priv->mcast.querier_ipv6.exists ? '.' : '6'; + shadowing4 = bat_priv->mcast.querier_ipv4.shadowing ? '4' : '.'; + shadowing6 = bat_priv->mcast.querier_ipv6.shadowing ? '6' : '.'; + } else { + querier4 = '?'; + querier6 = '?'; + shadowing4 = '?'; + shadowing6 = '?'; + } + + seq_printf(seq, "Multicast flags (own flags: [%c%c%c])\n", + (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', + (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', + (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.'); + seq_printf(seq, "* Bridged [U]\t\t\t\t%c\n", bridged ? 'U' : '.'); + seq_printf(seq, "* No IGMP/MLD Querier [4/6]:\t\t%c/%c\n", + querier4, querier6); + seq_printf(seq, "* Shadowing IGMP/MLD Querier [4/6]:\t%c/%c\n", + shadowing4, shadowing6); + seq_puts(seq, "-------------------------------------------\n"); + seq_printf(seq, " %-10s %s\n", "Originator", "Flags"); +} + +/** + * batadv_mcast_flags_seq_print_text - print the mcast flags of other nodes + * @seq: seq file to print on + * @offset: not used + * + * This prints a table of (primary) originators and their according + * multicast flags, including (in the header) our own. + * + * Return: always 0 + */ +int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset) +{ + struct net_device *net_dev = (struct net_device *)seq->private; + struct batadv_priv *bat_priv = netdev_priv(net_dev); + struct batadv_hard_iface *primary_if; + struct batadv_hashtable *hash = bat_priv->orig_hash; + struct batadv_orig_node *orig_node; + struct hlist_head *head; + u8 flags; + u32 i; + + primary_if = batadv_seq_print_text_primary_if_get(seq); + if (!primary_if) + return 0; + + batadv_mcast_flags_print_header(bat_priv, seq); + + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, head, hash_entry) { + if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, + &orig_node->capa_initialized)) + continue; + + if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, + &orig_node->capabilities)) { + seq_printf(seq, "%pM -\n", orig_node->orig); + continue; + } + + flags = orig_node->mcast_flags; + + seq_printf(seq, "%pM [%c%c%c]\n", orig_node->orig, + (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) + ? 'U' : '.', + (flags & BATADV_MCAST_WANT_ALL_IPV4) + ? '4' : '.', + (flags & BATADV_MCAST_WANT_ALL_IPV6) + ? '6' : '.'); + } + rcu_read_unlock(); + } + + batadv_hardif_put(primary_if); + + return 0; +} + /** * batadv_mcast_free - free the multicast optimizations structures * @bat_priv: the bat priv with all the soft interface information diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 80bceec55592..1fb00ba84907 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -20,6 +20,7 @@ #include "main.h" +struct seq_file; struct sk_buff; /** @@ -46,6 +47,8 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, void batadv_mcast_init(struct batadv_priv *bat_priv); +int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset); + void batadv_mcast_free(struct batadv_priv *bat_priv); void batadv_mcast_purge_orig(struct batadv_orig_node *orig_node); -- cgit v1.2.3 From a2d0816608df1ca69fcdbb9135a2b6df0c65d954 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 15 May 2016 11:07:46 +0200 Subject: batman-adv: Fix bat_(iv|v) function declaration header The bat_algo.h had some functions declared which were not part of the bat_algo.c file. These are instead stored in bat_v.c and bat_iv_ogm.c. The declaration should therefore be also in bat_v.h and bat_iv_ogm,h to make them easier to find. Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_algo.h | 31 ----------------------- net/batman-adv/bat_iv_ogm.c | 3 ++- net/batman-adv/bat_iv_ogm.h | 25 ++++++++++++++++++ net/batman-adv/bat_v.c | 3 ++- net/batman-adv/bat_v.h | 52 ++++++++++++++++++++++++++++++++++++++ net/batman-adv/hard-interface.c | 2 +- net/batman-adv/main.c | 2 ++ net/batman-adv/routing.c | 1 - net/batman-adv/translation-table.c | 1 - 9 files changed, 84 insertions(+), 36 deletions(-) create mode 100644 net/batman-adv/bat_iv_ogm.h create mode 100644 net/batman-adv/bat_v.h (limited to 'net') diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 8c7e761ff23b..860d773dd8fa 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -24,8 +24,6 @@ struct seq_file; -int batadv_iv_init(void); - extern char batadv_routing_algo[]; extern struct list_head batadv_hardif_list; @@ -34,33 +32,4 @@ int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops); int batadv_algo_select(struct batadv_priv *bat_priv, char *name); int batadv_algo_seq_print_text(struct seq_file *seq, void *offset); -#ifdef CONFIG_BATMAN_ADV_BATMAN_V - -int batadv_v_init(void); -void batadv_v_hardif_init(struct batadv_hard_iface *hardif); -int batadv_v_mesh_init(struct batadv_priv *bat_priv); -void batadv_v_mesh_free(struct batadv_priv *bat_priv); - -#else - -static inline int batadv_v_init(void) -{ - return 0; -} - -static inline void batadv_v_hardif_init(struct batadv_hard_iface *hardif) -{ -} - -static inline int batadv_v_mesh_init(struct batadv_priv *bat_priv) -{ - return 0; -} - -static inline void batadv_v_mesh_free(struct batadv_priv *bat_priv) -{ -} - -#endif /* CONFIG_BATMAN_ADV_BATMAN_V */ - #endif /* _NET_BATMAN_ADV_BAT_ALGO_H_ */ diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 805532a95860..e2d8848c32c0 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -15,7 +15,7 @@ * along with this program; if not, see . */ -#include "bat_algo.h" +#include "bat_iv_ogm.h" #include "main.h" #include @@ -49,6 +49,7 @@ #include #include +#include "bat_algo.h" #include "bitarray.h" #include "hard-interface.h" #include "hash.h" diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h new file mode 100644 index 000000000000..b9f3550faaf7 --- /dev/null +++ b/net/batman-adv/bat_iv_ogm.h @@ -0,0 +1,25 @@ +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _BATMAN_ADV_BATADV_IV_OGM_H_ +#define _BATMAN_ADV_BATADV_IV_OGM_H_ + +#include "main.h" + +int batadv_iv_init(void); + +#endif /* _BATMAN_ADV_BATADV_IV_OGM_H_ */ diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index c2fea812fb48..7231440bed51 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -15,7 +15,7 @@ * along with this program; if not, see . */ -#include "bat_algo.h" +#include "bat_v.h" #include "main.h" #include @@ -31,6 +31,7 @@ #include #include +#include "bat_algo.h" #include "bat_v_elp.h" #include "bat_v_ogm.h" #include "hard-interface.h" diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h new file mode 100644 index 000000000000..83b77639729e --- /dev/null +++ b/net/batman-adv/bat_v.h @@ -0,0 +1,52 @@ +/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Linus Lüssing + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _NET_BATMAN_ADV_BAT_V_H_ +#define _NET_BATMAN_ADV_BAT_V_H_ + +#include "main.h" + +#ifdef CONFIG_BATMAN_ADV_BATMAN_V + +int batadv_v_init(void); +void batadv_v_hardif_init(struct batadv_hard_iface *hardif); +int batadv_v_mesh_init(struct batadv_priv *bat_priv); +void batadv_v_mesh_free(struct batadv_priv *bat_priv); + +#else + +static inline int batadv_v_init(void) +{ + return 0; +} + +static inline void batadv_v_hardif_init(struct batadv_hard_iface *hardif) +{ +} + +static inline int batadv_v_mesh_init(struct batadv_priv *bat_priv) +{ + return 0; +} + +static inline void batadv_v_mesh_free(struct batadv_priv *bat_priv) +{ +} + +#endif /* CONFIG_BATMAN_ADV_BATMAN_V */ + +#endif /* _NET_BATMAN_ADV_BAT_V_H_ */ diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index ad2c37c5583b..70841c1e0069 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -37,7 +37,7 @@ #include #include -#include "bat_algo.h" +#include "bat_v.h" #include "bridge_loop_avoidance.h" #include "debugfs.h" #include "distributed-arp-table.h" diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 05e559c86e82..eab9d1b8a6eb 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -46,6 +46,8 @@ #include #include "bat_algo.h" +#include "bat_iv_ogm.h" +#include "bat_v.h" #include "bridge_loop_avoidance.h" #include "debugfs.h" #include "distributed-arp-table.h" diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index a5b53a3fc2ba..5833ab3008a1 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -34,7 +34,6 @@ #include #include -#include "bat_algo.h" #include "bitarray.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 53458d6fb87d..48ce7889a3e8 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -44,7 +44,6 @@ #include #include -#include "bat_algo.h" #include "bridge_loop_avoidance.h" #include "hard-interface.h" #include "hash.h" -- cgit v1.2.3 From 80b48c445797a634d869c7e5a53e182ba2688931 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 28 Jun 2016 12:18:26 +0200 Subject: bpf: don't use raw processor id in generic helper Use smp_processor_id() for the generic helper bpf_get_smp_processor_id() instead of the raw variant. This allows for preemption checks when we have DEBUG_PREEMPT, and otherwise uses the raw variant anyway. We only need to keep the raw variant for socket filters, but we can reuse the helper that is already there from cBPF side. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index cb9fc16cac46..46c88d9cec5c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -150,6 +150,12 @@ static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return raw_smp_processor_id(); } +static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { + .func = __get_raw_cpu_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, struct bpf_insn *insn_buf) { @@ -2037,7 +2043,7 @@ sk_filter_func_proto(enum bpf_func_id func_id) case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: - return &bpf_get_smp_processor_id_proto; + return &bpf_get_raw_smp_processor_id_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: @@ -2086,6 +2092,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_get_route_realm_proto; case BPF_FUNC_perf_event_output: return bpf_get_event_output_proto(); + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; default: return sk_filter_func_proto(func_id); } -- cgit v1.2.3 From 6578171a7ff0c31dc73258f93da7407510abf085 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 28 Jun 2016 12:18:27 +0200 Subject: bpf: add bpf_skb_change_proto helper This patch adds a minimal helper for doing the groundwork of changing the skb->protocol in a controlled way. Currently supported is v4 to v6 and vice versa transitions, which allows f.e. for a minimal, static nat64 implementation where applications in containers that still require IPv4 can be transparently operated in an IPv6-only environment. For example, host facing veth of the container can transparently do the transitions in a programmatic way with the help of clsact qdisc and cls_bpf. Idea is to separate concerns for keeping complexity of the helper lower, which means that the programs utilize bpf_skb_change_proto(), bpf_skb_store_bytes() and bpf_lX_csum_replace() to get the job done, instead of doing everything in a single helper (and thus partially duplicating helper functionality). Also, bpf_skb_change_proto() shouldn't need to deal with raw packet data as this is done by other helpers. bpf_skb_proto_6_to_4() and bpf_skb_proto_4_to_6() unclone the skb to operate on a private one, push or pop additionally required header space and migrate the gso/gro meta data from the shared info. We do mark the gso type as dodgy so that headers are checked and segs recalculated by the gso/gro engine. The gso_size target is adapted as well. The flags argument added is currently reserved and can be used for future extensions. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 200 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 46c88d9cec5c..d983e765787a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1783,6 +1783,202 @@ const struct bpf_func_proto bpf_skb_vlan_pop_proto = { }; EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto); +static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) +{ + /* Caller already did skb_cow() with len as headroom, + * so no need to do it here. + */ + skb_push(skb, len); + memmove(skb->data, skb->data + len, off); + memset(skb->data + off, 0, len); + + /* No skb_postpush_rcsum(skb, skb->data + off, len) + * needed here as it does not change the skb->csum + * result for checksum complete when summing over + * zeroed blocks. + */ + return 0; +} + +static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len) +{ + /* skb_ensure_writable() is not needed here, as we're + * already working on an uncloned skb. + */ + if (unlikely(!pskb_may_pull(skb, off + len))) + return -ENOMEM; + + skb_postpull_rcsum(skb, skb->data + off, len); + memmove(skb->data + len, skb->data, off); + __skb_pull(skb, len); + + return 0; +} + +static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len) +{ + bool trans_same = skb->transport_header == skb->network_header; + int ret; + + /* There's no need for __skb_push()/__skb_pull() pair to + * get to the start of the mac header as we're guaranteed + * to always start from here under eBPF. + */ + ret = bpf_skb_generic_push(skb, off, len); + if (likely(!ret)) { + skb->mac_header -= len; + skb->network_header -= len; + if (trans_same) + skb->transport_header = skb->network_header; + } + + return ret; +} + +static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) +{ + bool trans_same = skb->transport_header == skb->network_header; + int ret; + + /* Same here, __skb_push()/__skb_pull() pair not needed. */ + ret = bpf_skb_generic_pop(skb, off, len); + if (likely(!ret)) { + skb->mac_header += len; + skb->network_header += len; + if (trans_same) + skb->transport_header = skb->network_header; + } + + return ret; +} + +static int bpf_skb_proto_4_to_6(struct sk_buff *skb) +{ + const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); + u32 off = skb->network_header - skb->mac_header; + int ret; + + ret = skb_cow(skb, len_diff); + if (unlikely(ret < 0)) + return ret; + + ret = bpf_skb_net_hdr_push(skb, off, len_diff); + if (unlikely(ret < 0)) + return ret; + + if (skb_is_gso(skb)) { + /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV4 needs to + * be changed into SKB_GSO_TCPV6. + */ + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { + skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; + } + + /* Due to IPv6 header, MSS needs to be downgraded. */ + skb_shinfo(skb)->gso_size -= len_diff; + /* Header must be checked, and gso_segs recomputed. */ + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; + skb_shinfo(skb)->gso_segs = 0; + } + + skb->protocol = htons(ETH_P_IPV6); + skb_clear_hash(skb); + + return 0; +} + +static int bpf_skb_proto_6_to_4(struct sk_buff *skb) +{ + const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); + u32 off = skb->network_header - skb->mac_header; + int ret; + + ret = skb_unclone(skb, GFP_ATOMIC); + if (unlikely(ret < 0)) + return ret; + + ret = bpf_skb_net_hdr_pop(skb, off, len_diff); + if (unlikely(ret < 0)) + return ret; + + if (skb_is_gso(skb)) { + /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV6 needs to + * be changed into SKB_GSO_TCPV4. + */ + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) { + skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV6; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; + } + + /* Due to IPv4 header, MSS can be upgraded. */ + skb_shinfo(skb)->gso_size += len_diff; + /* Header must be checked, and gso_segs recomputed. */ + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; + skb_shinfo(skb)->gso_segs = 0; + } + + skb->protocol = htons(ETH_P_IP); + skb_clear_hash(skb); + + return 0; +} + +static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) +{ + __be16 from_proto = skb->protocol; + + if (from_proto == htons(ETH_P_IP) && + to_proto == htons(ETH_P_IPV6)) + return bpf_skb_proto_4_to_6(skb); + + if (from_proto == htons(ETH_P_IPV6) && + to_proto == htons(ETH_P_IP)) + return bpf_skb_proto_6_to_4(skb); + + return -ENOTSUPP; +} + +static u64 bpf_skb_change_proto(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + __be16 proto = (__force __be16) r2; + int ret; + + if (unlikely(flags)) + return -EINVAL; + + /* General idea is that this helper does the basic groundwork + * needed for changing the protocol, and eBPF program fills the + * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace() + * and other helpers, rather than passing a raw buffer here. + * + * The rationale is to keep this minimal and without a need to + * deal with raw packet data. F.e. even if we would pass buffers + * here, the program still needs to call the bpf_lX_csum_replace() + * helpers anyway. Plus, this way we keep also separation of + * concerns, since f.e. bpf_skb_store_bytes() should only take + * care of stores. + * + * Currently, additional options and extension header space are + * not supported, but flags register is reserved so we can adapt + * that. For offloads, we mark packet as dodgy, so that headers + * need to be verified first. + */ + ret = bpf_skb_proto_xlat(skb, proto); + bpf_compute_data_end(skb); + return ret; +} + +static const struct bpf_func_proto bpf_skb_change_proto_proto = { + .func = bpf_skb_change_proto, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + bool bpf_helper_changes_skb_data(void *func) { if (func == bpf_skb_vlan_push) @@ -1791,6 +1987,8 @@ bool bpf_helper_changes_skb_data(void *func) return true; if (func == bpf_skb_store_bytes) return true; + if (func == bpf_skb_change_proto) + return true; if (func == bpf_l3_csum_replace) return true; if (func == bpf_l4_csum_replace) @@ -2078,6 +2276,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_vlan_push_proto; case BPF_FUNC_skb_vlan_pop: return &bpf_skb_vlan_pop_proto; + case BPF_FUNC_skb_change_proto: + return &bpf_skb_change_proto_proto; case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: -- cgit v1.2.3 From d2485c4242a826fdf493fd3a27b8b792965b9b9e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 28 Jun 2016 12:18:28 +0200 Subject: bpf: add bpf_skb_change_type helper This work adds a helper for changing skb->pkt_type in a controlled way. We only allow a subset of possible values and can extend that in future should other use cases come up. Doing this as a helper has the advantage that errors can be handeled gracefully and thus helper kept extensible. It's a write counterpart to pkt_type member we can already read from struct __sk_buff context. Major use case is to change incoming skbs to PACKET_HOST in a programmatic way instead of having to recirculate via redirect(..., BPF_F_INGRESS), for example. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index d983e765787a..76f9a4938be4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1979,6 +1979,28 @@ static const struct bpf_func_proto bpf_skb_change_proto_proto = { .arg3_type = ARG_ANYTHING, }; +static u64 bpf_skb_change_type(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + u32 pkt_type = r2; + + /* We only allow a restricted subset to be changed for now. */ + if (unlikely(skb->pkt_type > PACKET_OTHERHOST || + pkt_type > PACKET_OTHERHOST)) + return -EINVAL; + + skb->pkt_type = pkt_type; + return 0; +} + +static const struct bpf_func_proto bpf_skb_change_type_proto = { + .func = bpf_skb_change_type, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + bool bpf_helper_changes_skb_data(void *func) { if (func == bpf_skb_vlan_push) @@ -2278,6 +2300,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_vlan_pop_proto; case BPF_FUNC_skb_change_proto: return &bpf_skb_change_proto_proto; + case BPF_FUNC_skb_change_type: + return &bpf_skb_change_type_proto; case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: -- cgit v1.2.3 From f151d9db4c1e7f7ac202ae75f4cbc62cfc784156 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 15 Jun 2016 22:29:41 +0200 Subject: nl80211: improve nl80211_parse_mesh_config type checking When building a kernel with W=1, the nl80211.c file causes a number of warnings, all about the same problem: net/wireless/nl80211.c: In function 'nl80211_parse_mesh_config': net/wireless/nl80211.c:5287:103: error: comparison is always false due to limited range of data type [-Werror=type-limits] net/wireless/nl80211.c:5290:96: error: comparison is always false due to limited range of data type [-Werror=type-limits] net/wireless/nl80211.c:5293:124: error: comparison is always false due to limited range of data type [-Werror=type-limits] net/wireless/nl80211.c:5295:148: error: comparison is always false due to limited range of data type [-Werror=type-limits] net/wireless/nl80211.c:5298:106: error: comparison is always false due to limited range of data type [-Werror=type-limits] net/wireless/nl80211.c:5305:116: error: comparison is always false due to limited range of data type [-Werror=type-limits] The problem is that gcc does not notice that the check is generate by a macro, so it complains about comparing an unsigned type against 0. I've tried to come up with a way to rephrase that code in a way that avoids the warnings and otherwise improves the code as well. This uses a set of new helper functions that perform the range checking, and should provide slightly better type safety than the older patch, at the expense of adding 44 lines to the code. Binary code size is basically unchanged though (20 bytes added to 126561 bytes .text). Signed-off-by: Arnd Bergmann Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 104 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 74 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index c503e96bfd5a..244d552d5647 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5287,6 +5287,51 @@ static const struct nla_policy [NL80211_MESH_SETUP_USERSPACE_AMPE] = { .type = NLA_FLAG }, }; +static int nl80211_check_bool(const struct nlattr *nla, u8 min, u8 max, bool *out) +{ + u8 val = nla_get_u8(nla); + if (val < min || val > max) + return -EINVAL; + *out = val; + return 0; +} + +static int nl80211_check_u8(const struct nlattr *nla, u8 min, u8 max, u8 *out) +{ + u8 val = nla_get_u8(nla); + if (val < min || val > max) + return -EINVAL; + *out = val; + return 0; +} + +static int nl80211_check_u16(const struct nlattr *nla, u16 min, u16 max, u16 *out) +{ + u16 val = nla_get_u16(nla); + if (val < min || val > max) + return -EINVAL; + *out = val; + return 0; +} + +static int nl80211_check_u32(const struct nlattr *nla, u32 min, u32 max, u32 *out) +{ + u32 val = nla_get_u32(nla); + if (val < min || val > max) + return -EINVAL; + *out = val; + return 0; +} + +static int nl80211_check_s32(const struct nlattr *nla, s32 min, s32 max, s32 *out) +{ + s32 val = nla_get_s32(nla); + if (val < min || val > max) + return -EINVAL; + *out = val; + return 0; +} + static int nl80211_parse_mesh_config(struct genl_info *info, struct mesh_config *cfg, u32 *mask_out) @@ -5297,9 +5342,8 @@ static int nl80211_parse_mesh_config(struct genl_info *info, #define FILL_IN_MESH_PARAM_IF_SET(tb, cfg, param, min, max, mask, attr, fn) \ do { \ if (tb[attr]) { \ - if (fn(tb[attr]) < min || fn(tb[attr]) > max) \ + if (fn(tb[attr], min, max, &cfg->param)) \ return -EINVAL; \ - cfg->param = fn(tb[attr]); \ mask |= (1 << (attr - 1)); \ } \ } while (0) @@ -5318,99 +5362,99 @@ do { \ /* Fill in the params struct */ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshRetryTimeout, 1, 255, mask, NL80211_MESHCONF_RETRY_TIMEOUT, - nla_get_u16); + nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConfirmTimeout, 1, 255, mask, NL80211_MESHCONF_CONFIRM_TIMEOUT, - nla_get_u16); + nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHoldingTimeout, 1, 255, mask, NL80211_MESHCONF_HOLDING_TIMEOUT, - nla_get_u16); + nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxPeerLinks, 0, 255, mask, NL80211_MESHCONF_MAX_PEER_LINKS, - nla_get_u16); + nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxRetries, 0, 16, mask, NL80211_MESHCONF_MAX_RETRIES, - nla_get_u8); + nl80211_check_u8); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshTTL, 1, 255, - mask, NL80211_MESHCONF_TTL, nla_get_u8); + mask, NL80211_MESHCONF_TTL, nl80211_check_u8); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, element_ttl, 1, 255, mask, NL80211_MESHCONF_ELEMENT_TTL, - nla_get_u8); + nl80211_check_u8); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, auto_open_plinks, 0, 1, mask, NL80211_MESHCONF_AUTO_OPEN_PLINKS, - nla_get_u8); + nl80211_check_bool); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshNbrOffsetMaxNeighbor, 1, 255, mask, NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR, - nla_get_u32); + nl80211_check_u32); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPmaxPREQretries, 0, 255, mask, NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES, - nla_get_u8); + nl80211_check_u8); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, path_refresh_time, 1, 65535, mask, NL80211_MESHCONF_PATH_REFRESH_TIME, - nla_get_u32); + nl80211_check_u32); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, min_discovery_timeout, 1, 65535, mask, NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT, - nla_get_u16); + nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathTimeout, 1, 65535, mask, NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT, - nla_get_u32); + nl80211_check_u32); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPpreqMinInterval, 1, 65535, mask, NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL, - nla_get_u16); + nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPperrMinInterval, 1, 65535, mask, NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL, - nla_get_u16); + nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPnetDiameterTraversalTime, 1, 65535, mask, NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME, - nla_get_u16); + nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRootMode, 0, 4, mask, NL80211_MESHCONF_HWMP_ROOTMODE, - nla_get_u8); + nl80211_check_u8); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRannInterval, 1, 65535, mask, NL80211_MESHCONF_HWMP_RANN_INTERVAL, - nla_get_u16); + nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshGateAnnouncementProtocol, 0, 1, mask, NL80211_MESHCONF_GATE_ANNOUNCEMENTS, - nla_get_u8); + nl80211_check_bool); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshForwarding, 0, 1, mask, NL80211_MESHCONF_FORWARDING, - nla_get_u8); + nl80211_check_bool); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, rssi_threshold, -255, 0, mask, NL80211_MESHCONF_RSSI_THRESHOLD, - nla_get_s32); + nl80211_check_s32); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, ht_opmode, 0, 16, mask, NL80211_MESHCONF_HT_OPMODE, - nla_get_u16); + nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathToRootTimeout, 1, 65535, mask, NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT, - nla_get_u32); + nl80211_check_u32); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMProotInterval, 1, 65535, mask, NL80211_MESHCONF_HWMP_ROOT_INTERVAL, - nla_get_u16); + nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPconfirmationInterval, 1, 65535, mask, NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL, - nla_get_u16); + nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, power_mode, NL80211_MESH_POWER_ACTIVE, NL80211_MESH_POWER_MAX, mask, NL80211_MESHCONF_POWER_MODE, - nla_get_u32); + nl80211_check_u32); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshAwakeWindowDuration, 0, 65535, mask, - NL80211_MESHCONF_AWAKE_WINDOW, nla_get_u16); + NL80211_MESHCONF_AWAKE_WINDOW, nl80211_check_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, plink_timeout, 0, 0xffffffff, mask, NL80211_MESHCONF_PLINK_TIMEOUT, - nla_get_u32); + nl80211_check_u32); if (mask_out) *mask_out = mask; -- cgit v1.2.3 From 49708e3772ce648be425778702a266b207e89d4e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 27 Jun 2016 17:31:18 +0300 Subject: mac80211: silence an uninitialized variable warning We normally return an uninitialized value, but no one checks it so it doesn't matter. Anyway, let's silence the static checker warning. Signed-off-by: Dan Carpenter Signed-off-by: Johannes Berg --- net/mac80211/tdls.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 1c7d45a6d93e..b5d28f14b9cf 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -1747,6 +1747,7 @@ ieee80211_process_tdls_channel_switch_resp(struct ieee80211_sub_if_data *sdata, goto out; } + ret = 0; call_drv: drv_tdls_recv_channel_switch(sdata->local, sdata, ¶ms); -- cgit v1.2.3 From 46f6b06050b736dab4d41494dae27b883cddc365 Mon Sep 17 00:00:00 2001 From: Masashi Honma Date: Wed, 22 Jun 2016 19:55:20 +0900 Subject: mac80211: Encrypt "Group addressed privacy" action frames Previously, the action frames to group address was not encrypted. But [1] "Table 8-38 Category values" indicates "Mesh" and "Multihop" category action frames should be encrypted (Group addressed privacy == yes). And the encyption key should be MGTK ([1] 10.13 Group addressed robust management frame procedures). So this patch modifies the code to make it suitable for spec. [1] IEEE Std 802.11-2012 Signed-off-by: Masashi Honma Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 7 ++++++- net/mac80211/tx.c | 6 +++++- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 9a1eb70cb120..2e8a9024625a 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1624,8 +1624,13 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) if (mmie_keyidx < NUM_DEFAULT_KEYS || mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS) return RX_DROP_MONITOR; /* unexpected BIP keyidx */ - if (rx->sta) + if (rx->sta) { + if (ieee80211_is_group_privacy_action(skb) && + test_sta_flag(rx->sta, WLAN_STA_MFP)) + return RX_DROP_MONITOR; + rx->key = rcu_dereference(rx->sta->gtk[mmie_keyidx]); + } if (!rx->key) rx->key = rcu_dereference(rx->sdata->keys[mmie_keyidx]); } else if (!ieee80211_has_protected(fc)) { diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 44ec605a5682..fa8d38eb9236 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -593,6 +593,9 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) else if (tx->sta && (key = rcu_dereference(tx->sta->ptk[tx->sta->ptk_idx]))) tx->key = key; + else if (ieee80211_is_group_privacy_action(tx->skb) && + (key = rcu_dereference(tx->sdata->default_multicast_key))) + tx->key = key; else if (ieee80211_is_mgmt(hdr->frame_control) && is_multicast_ether_addr(hdr->addr1) && ieee80211_is_robust_mgmt_frame(tx->skb) && @@ -625,7 +628,8 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) case WLAN_CIPHER_SUITE_GCMP_256: if (!ieee80211_is_data_present(hdr->frame_control) && !ieee80211_use_mfp(hdr->frame_control, tx->sta, - tx->skb)) + tx->skb) && + !ieee80211_is_group_privacy_action(tx->skb)) tx->key = NULL; else skip_hw = (tx->key->conf.flags & -- cgit v1.2.3 From efc401f49adf9c53a95f0430496c7a5433612e74 Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Sat, 25 Jun 2016 19:14:16 -0400 Subject: mac80211: use common cleanup for user/!user_mpm We've accumulated a couple of different fixes now to mesh_sta_cleanup() due to the different paths that user_mpm and !user_mpm cases take -- one fix to flush nexthop paths and one to fix the counting. The only caller of mesh_plink_deactivate() is mesh_sta_cleanup(), so we can push the user_mpm checks down into there in order to share more code. In doing so, we can remove an extra call to mesh_path_flush_by_nexthop() and the (unnecessary) call to mesh_accept_plinks_update(). This will also ensure the powersaving state code gets called in the user_mpm case. The only cleanup tasks we need to avoid when MPM is in user-space are sending the peering frames and stopping the plink timer, so wrap those in the appropriate check. Signed-off-by: Bob Copeland Signed-off-by: Johannes Berg --- net/mac80211/mesh.c | 20 +------------------- net/mac80211/mesh_plink.c | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 6a1603bcdced..c66411df9863 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -148,25 +148,7 @@ u32 mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata) void mesh_sta_cleanup(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; - u32 changed = 0; - - /* - * maybe userspace handles peer allocation and peering, but in either - * case the beacon is still generated by the kernel and we might need - * an update. - */ - if (sdata->u.mesh.user_mpm && - sta->mesh->plink_state == NL80211_PLINK_ESTAB) - changed |= mesh_plink_dec_estab_count(sdata); - changed |= mesh_accept_plinks_update(sdata); - if (!sdata->u.mesh.user_mpm) { - changed |= mesh_plink_deactivate(sta); - del_timer_sync(&sta->mesh->plink_timer); - } - - /* make sure no readers can access nexthop sta from here on */ - mesh_path_flush_by_nexthop(sta); - synchronize_net(); + u32 changed = mesh_plink_deactivate(sta); if (changed) ieee80211_mbss_info_change_notify(sdata, changed); diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 79f2a0a13db8..7fcdcf622655 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -370,13 +370,21 @@ u32 mesh_plink_deactivate(struct sta_info *sta) spin_lock_bh(&sta->mesh->plink_lock); changed = __mesh_plink_deactivate(sta); - sta->mesh->reason = WLAN_REASON_MESH_PEER_CANCELED; - mesh_plink_frame_tx(sdata, sta, WLAN_SP_MESH_PEERING_CLOSE, - sta->sta.addr, sta->mesh->llid, sta->mesh->plid, - sta->mesh->reason); + + if (!sdata->u.mesh.user_mpm) { + sta->mesh->reason = WLAN_REASON_MESH_PEER_CANCELED; + mesh_plink_frame_tx(sdata, sta, WLAN_SP_MESH_PEERING_CLOSE, + sta->sta.addr, sta->mesh->llid, + sta->mesh->plid, sta->mesh->reason); + } spin_unlock_bh(&sta->mesh->plink_lock); + if (!sdata->u.mesh.user_mpm) + del_timer_sync(&sta->mesh->plink_timer); mesh_path_flush_by_nexthop(sta); + /* make sure no readers can access nexthop sta from here on */ + synchronize_net(); + return changed; } -- cgit v1.2.3 From 59a7c828d7e7d5a1be224a0d68a41ca2302843ea Mon Sep 17 00:00:00 2001 From: Michal Kazior Date: Wed, 29 Jun 2016 14:00:34 +0200 Subject: mac80211: fix fq lockdep warnings Some lockdep assertions were not fulfilled and resulted in a kernel warning/call trace if driver used intermediate software queues (e.g. ath10k). Existing code sequences should've guaranteed safety but it's always good to be extra careful. The call trace could look like this: [ 237.335805] ------------[ cut here ]------------ [ 237.335852] WARNING: CPU: 3 PID: 1921 at include/net/fq_impl.h:22 fq_flow_dequeue+0xed/0x140 [mac80211] [ 237.335855] Modules linked in: ath10k_pci(E-) ath10k_core(E) ath(E) mac80211(E) cfg80211(E) [ 237.335913] CPU: 3 PID: 1921 Comm: rmmod Tainted: G W E 4.7.0-rc4-wt-ath+ #1377 [ 237.335916] Hardware name: Hewlett-Packard HP ProBook 6540b/1722, BIOS 68CDD Ver. F.04 01/27/2010 [ 237.335918] 00200286 00200286 eff85dac c14151e2 f901574e 00000000 eff85de0 c1081075 [ 237.335928] c1ab91f0 00000003 00000781 f901574e 00000016 f8fbabad f8fbabad 00000016 [ 237.335938] eb24ff60 00000000 ef3886c0 eff85df4 c10810ba 00000009 00000000 00000000 [ 237.335948] Call Trace: [ 237.335953] [] dump_stack+0x76/0xb4 [ 237.335957] [] __warn+0xe5/0x100 [ 237.336002] [] ? fq_flow_dequeue+0xed/0x140 [mac80211] [ 237.336046] [] ? fq_flow_dequeue+0xed/0x140 [mac80211] [ 237.336053] [] warn_slowpath_null+0x2a/0x30 [ 237.336095] [] fq_flow_dequeue+0xed/0x140 [mac80211] [ 237.336137] [] fq_flow_reset.constprop.56+0x2a/0x90 [mac80211] [ 237.336180] [] fq_reset.constprop.59+0x2a/0x50 [mac80211] [ 237.336222] [] ieee80211_txq_teardown_flows+0x38/0x40 [mac80211] [ 237.336258] [] ieee80211_unregister_hw+0xe4/0x120 [mac80211] [ 237.336275] [] ath10k_mac_unregister+0x16/0x50 [ath10k_core] [ 237.336292] [] ath10k_core_unregister+0x3d/0x90 [ath10k_core] [ 237.336301] [] ath10k_pci_remove+0x36/0xa0 [ath10k_pci] [ 237.336307] [] pci_device_remove+0x38/0xb0 ... Fixes: 5caa328e3811 ("mac80211: implement codel on fair queuing flows") Fixes: fa962b92120b ("mac80211: implement fair queueing per txq") Tested-by: Kalle Valo Reported-by: Kalle Valo Signed-off-by: Michal Kazior Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index fa8d38eb9236..91461c415525 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1449,7 +1449,9 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local) local->cvars = kcalloc(fq->flows_cnt, sizeof(local->cvars[0]), GFP_KERNEL); if (!local->cvars) { + spin_lock_bh(&fq->lock); fq_reset(fq, fq_skb_free_func); + spin_unlock_bh(&fq->lock); return -ENOMEM; } @@ -1469,7 +1471,9 @@ void ieee80211_txq_teardown_flows(struct ieee80211_local *local) kfree(local->cvars); local->cvars = NULL; + spin_lock_bh(&fq->lock); fq_reset(fq, fq_skb_free_func); + spin_unlock_bh(&fq->lock); } struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, -- cgit v1.2.3 From 80e73cc563c4359be809a03bcb8e7e28141a813a Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 28 Jun 2016 16:57:05 +0200 Subject: net: rtnetlink: add support for the IFLA_STATS_LINK_XSTATS_SLAVE attribute This patch adds support for the IFLA_STATS_LINK_XSTATS_SLAVE attribute which allows to export per-slave statistics if the master device supports the linkxstats callback. The attribute is passed down to the linkxstats callback and it is up to the callback user to use it (an example has been added to the only current user - the bridge). This allows us to query only specific slaves of master devices like bridge ports and export only what we're interested in instead of having to dump all ports and searching only for a single one. This will be used to export per-port IGMP/MLD stats and also per-port vlan stats in the future, possibly other statistics as well. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++--- net/core/rtnetlink.c | 50 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 104 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 85e89f693589..ed75ff9ff9e6 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1234,7 +1234,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) return 0; } -static size_t br_get_linkxstats_size(const struct net_device *dev) +static size_t bridge_get_linkxstats_size(const struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_vlan_group *vg; @@ -1254,8 +1254,30 @@ static size_t br_get_linkxstats_size(const struct net_device *dev) nla_total_size(0); } -static int br_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev, - int *prividx) +static size_t brport_get_linkxstats_size(const struct net_device *dev) +{ + return nla_total_size(0); +} + +static size_t br_get_linkxstats_size(const struct net_device *dev, int attr) +{ + size_t retsize = 0; + + switch (attr) { + case IFLA_STATS_LINK_XSTATS: + retsize = bridge_get_linkxstats_size(dev); + break; + case IFLA_STATS_LINK_XSTATS_SLAVE: + retsize = brport_get_linkxstats_size(dev); + break; + } + + return retsize; +} + +static int bridge_fill_linkxstats(struct sk_buff *skb, + const struct net_device *dev, + int *prividx) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_vlan_group *vg; @@ -1298,6 +1320,37 @@ nla_put_failure: return -EMSGSIZE; } +static int brport_fill_linkxstats(struct sk_buff *skb, + const struct net_device *dev, + int *prividx) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BRIDGE); + if (!nest) + return -EMSGSIZE; + nla_nest_end(skb, nest); + + return 0; +} + +static int br_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev, + int *prividx, int attr) +{ + int ret = -EINVAL; + + switch (attr) { + case IFLA_STATS_LINK_XSTATS: + ret = bridge_fill_linkxstats(skb, dev, prividx); + break; + case IFLA_STATS_LINK_XSTATS_SLAVE: + ret = brport_fill_linkxstats(skb, dev, prividx); + break; + } + + return ret; +} + static struct rtnl_af_ops br_af_ops __read_mostly = { .family = AF_BRIDGE, .get_link_af_size = br_get_link_af_size_filtered, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index eb49ca24274a..cfed7bc14ee6 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3519,7 +3519,32 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, if (!attr) goto nla_put_failure; - err = ops->fill_linkxstats(skb, dev, prividx); + err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); + nla_nest_end(skb, attr); + if (err) + goto nla_put_failure; + *idxattr = 0; + } + } + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, + *idxattr)) { + const struct rtnl_link_ops *ops = NULL; + const struct net_device *master; + + master = netdev_master_upper_dev_get(dev); + if (master) + ops = master->rtnl_link_ops; + if (ops && ops->fill_linkxstats) { + int err; + + *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE; + attr = nla_nest_start(skb, + IFLA_STATS_LINK_XSTATS_SLAVE); + if (!attr) + goto nla_put_failure; + + err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); nla_nest_end(skb, attr); if (err) goto nla_put_failure; @@ -3555,14 +3580,35 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, 0)) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; + int attr = IFLA_STATS_LINK_XSTATS; if (ops && ops->get_linkxstats_size) { - size += nla_total_size(ops->get_linkxstats_size(dev)); + size += nla_total_size(ops->get_linkxstats_size(dev, + attr)); /* for IFLA_STATS_LINK_XSTATS */ size += nla_total_size(0); } } + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, 0)) { + struct net_device *_dev = (struct net_device *)dev; + const struct rtnl_link_ops *ops = NULL; + const struct net_device *master; + + /* netdev_master_upper_dev_get can't take const */ + master = netdev_master_upper_dev_get(_dev); + if (master) + ops = master->rtnl_link_ops; + if (ops && ops->get_linkxstats_size) { + int attr = IFLA_STATS_LINK_XSTATS_SLAVE; + + size += nla_total_size(ops->get_linkxstats_size(dev, + attr)); + /* for IFLA_STATS_LINK_XSTATS_SLAVE */ + size += nla_total_size(0); + } + } + return size; } -- cgit v1.2.3 From 1080ab95e3c7bdd77870e209aff83c763fdcf439 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 28 Jun 2016 16:57:06 +0200 Subject: net: bridge: add support for IGMP/MLD stats and export them via netlink This patch adds stats support for the currently used IGMP/MLD types by the bridge. The stats are per-port (plus one stat per-bridge) and per-direction (RX/TX). The stats are exported via netlink via the new linkxstats API (RTM_GETSTATS). In order to minimize the performance impact, a new option is used to enable/disable the stats - multicast_stats_enabled, similar to the recent vlan stats. Also in order to avoid multiple IGMP/MLD type lookups and checks, we make use of the current "igmp" member of the bridge private skb->cb region to record the type on Rx (both host-generated and external packets pass by multicast_rcv()). We can do that since the igmp member was used as a boolean and all the valid IGMP/MLD types are positive values. The normal bridge fast-path is not affected at all, the only affected paths are the flooding ones and since we make use of the IGMP/MLD type, we can quickly determine if the packet should be counted using cache-hot data (cb's igmp member). We add counters for: * IGMP Queries * IGMP Leaves * IGMP v1/v2/v3 reports * MLD Queries * MLD Leaves * MLD v1/v2 reports These are invaluable when monitoring or debugging complex multicast setups with bridges. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_device.c | 10 ++- net/bridge/br_forward.c | 13 ++- net/bridge/br_if.c | 9 +- net/bridge/br_input.c | 3 + net/bridge/br_multicast.c | 217 +++++++++++++++++++++++++++++++++++++++++++--- net/bridge/br_netlink.c | 91 +++++++++++++------ net/bridge/br_private.h | 41 ++++++++- net/bridge/br_sysfs_br.c | 25 ++++++ 8 files changed, 363 insertions(+), 46 deletions(-) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 2c8095a5d824..0c39e0f6da09 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -104,8 +104,16 @@ static int br_dev_init(struct net_device *dev) return -ENOMEM; err = br_vlan_init(br); - if (err) + if (err) { free_percpu(br->stats); + return err; + } + + err = br_multicast_init_stats(br); + if (err) { + free_percpu(br->stats); + br_vlan_flush(br); + } br_set_lockdep_class(dev); return err; diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index f47759f05b6d..6c196037d818 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -198,8 +198,10 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb, struct sk_buff *skb), bool unicast) { - struct net_bridge_port *p; + u8 igmp_type = br_multicast_igmp_type(skb); + __be16 proto = skb->protocol; struct net_bridge_port *prev; + struct net_bridge_port *p; prev = NULL; @@ -218,6 +220,9 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb, prev = maybe_deliver(prev, p, skb, __packet_hook); if (IS_ERR(prev)) goto out; + if (prev == p) + br_multicast_count(p->br, p, proto, igmp_type, + BR_MCAST_DIR_TX); } if (!prev) @@ -257,9 +262,12 @@ static void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb)) { struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev; + u8 igmp_type = br_multicast_igmp_type(skb); struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *prev = NULL; struct net_bridge_port_group *p; + __be16 proto = skb->protocol; + struct hlist_node *rp; rp = rcu_dereference(hlist_first_rcu(&br->router_list)); @@ -277,6 +285,9 @@ static void br_multicast_flood(struct net_bridge_mdb_entry *mdst, prev = maybe_deliver(prev, port, skb, __packet_hook); if (IS_ERR(prev)) goto out; + if (prev == port) + br_multicast_count(port->br, port, proto, igmp_type, + BR_MCAST_DIR_TX); if ((unsigned long)lport >= (unsigned long)port) p = rcu_dereference(p->next); diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 8217aecf025b..f2fede05d32c 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -345,8 +345,8 @@ static int find_portno(struct net_bridge *br) static struct net_bridge_port *new_nbp(struct net_bridge *br, struct net_device *dev) { - int index; struct net_bridge_port *p; + int index, err; index = find_portno(br); if (index < 0) @@ -366,7 +366,12 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br, br_init_port(p); br_set_state(p, BR_STATE_DISABLED); br_stp_port_timer_init(p); - br_multicast_add_port(p); + err = br_multicast_add_port(p); + if (err) { + dev_put(dev); + kfree(p); + p = ERR_PTR(err); + } return p; } diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 43d2cd862bc2..786602bc0567 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -60,6 +60,9 @@ static int br_pass_frame_up(struct sk_buff *skb) skb = br_handle_vlan(br, vg, skb); if (!skb) return NET_RX_DROP; + /* update the multicast stats if the packet is IGMP/MLD */ + br_multicast_count(br, NULL, skb->protocol, br_multicast_igmp_type(skb), + BR_MCAST_DIR_TX); return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(indev), NULL, skb, indev, NULL, diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 43844144c9c4..e405eef0ae2e 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -361,7 +361,8 @@ out: } static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, - __be32 group) + __be32 group, + u8 *igmp_type) { struct sk_buff *skb; struct igmphdr *ih; @@ -411,6 +412,7 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, skb_set_transport_header(skb, skb->len); ih = igmp_hdr(skb); + *igmp_type = IGMP_HOST_MEMBERSHIP_QUERY; ih->type = IGMP_HOST_MEMBERSHIP_QUERY; ih->code = (group ? br->multicast_last_member_interval : br->multicast_query_response_interval) / @@ -428,7 +430,8 @@ out: #if IS_ENABLED(CONFIG_IPV6) static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, - const struct in6_addr *group) + const struct in6_addr *grp, + u8 *igmp_type) { struct sk_buff *skb; struct ipv6hdr *ip6h; @@ -487,16 +490,17 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, skb_set_transport_header(skb, skb->len); mldq = (struct mld_msg *) icmp6_hdr(skb); - interval = ipv6_addr_any(group) ? + interval = ipv6_addr_any(grp) ? br->multicast_query_response_interval : br->multicast_last_member_interval; + *igmp_type = ICMPV6_MGM_QUERY; mldq->mld_type = ICMPV6_MGM_QUERY; mldq->mld_code = 0; mldq->mld_cksum = 0; mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval)); mldq->mld_reserved = 0; - mldq->mld_mca = *group; + mldq->mld_mca = *grp; /* checksum */ mldq->mld_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, @@ -513,14 +517,16 @@ out: #endif static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br, - struct br_ip *addr) + struct br_ip *addr, + u8 *igmp_type) { switch (addr->proto) { case htons(ETH_P_IP): - return br_ip4_multicast_alloc_query(br, addr->u.ip4); + return br_ip4_multicast_alloc_query(br, addr->u.ip4, igmp_type); #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): - return br_ip6_multicast_alloc_query(br, &addr->u.ip6); + return br_ip6_multicast_alloc_query(br, &addr->u.ip6, + igmp_type); #endif } return NULL; @@ -829,18 +835,23 @@ static void __br_multicast_send_query(struct net_bridge *br, struct br_ip *ip) { struct sk_buff *skb; + u8 igmp_type; - skb = br_multicast_alloc_query(br, ip); + skb = br_multicast_alloc_query(br, ip, &igmp_type); if (!skb) return; if (port) { skb->dev = port->dev; + br_multicast_count(br, port, skb->protocol, igmp_type, + BR_MCAST_DIR_TX); NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, dev_net(port->dev), NULL, skb, NULL, skb->dev, br_dev_queue_push_xmit); } else { br_multicast_select_own_querier(br, ip, skb); + br_multicast_count(br, port, skb->protocol, igmp_type, + BR_MCAST_DIR_RX); netif_rx(skb); } } @@ -918,7 +929,7 @@ static void br_ip6_multicast_port_query_expired(unsigned long data) } #endif -void br_multicast_add_port(struct net_bridge_port *port) +int br_multicast_add_port(struct net_bridge_port *port) { port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; @@ -930,6 +941,11 @@ void br_multicast_add_port(struct net_bridge_port *port) setup_timer(&port->ip6_own_query.timer, br_ip6_multicast_port_query_expired, (unsigned long)port); #endif + port->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats); + if (!port->mcast_stats) + return -ENOMEM; + + return 0; } void br_multicast_del_port(struct net_bridge_port *port) @@ -944,6 +960,7 @@ void br_multicast_del_port(struct net_bridge_port *port) br_multicast_del_pg(br, pg); spin_unlock_bh(&br->multicast_lock); del_timer_sync(&port->multicast_router_timer); + free_percpu(port->mcast_stats); } static void br_multicast_enable(struct bridge_mcast_own_query *query) @@ -1583,6 +1600,39 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br, } #endif +static void br_multicast_err_count(const struct net_bridge *br, + const struct net_bridge_port *p, + __be16 proto) +{ + struct bridge_mcast_stats __percpu *stats; + struct bridge_mcast_stats *pstats; + + if (!br->multicast_stats_enabled) + return; + + if (p) + stats = p->mcast_stats; + else + stats = br->mcast_stats; + if (WARN_ON(!stats)) + return; + + pstats = this_cpu_ptr(stats); + + u64_stats_update_begin(&pstats->syncp); + switch (proto) { + case htons(ETH_P_IP): + pstats->mstats.igmp_parse_errors++; + break; +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + pstats->mstats.mld_parse_errors++; + break; +#endif + } + u64_stats_update_end(&pstats->syncp); +} + static int br_multicast_ipv4_rcv(struct net_bridge *br, struct net_bridge_port *port, struct sk_buff *skb, @@ -1599,11 +1649,12 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, BR_INPUT_SKB_CB(skb)->mrouters_only = 1; return 0; } else if (err < 0) { + br_multicast_err_count(br, port, skb->protocol); return err; } - BR_INPUT_SKB_CB(skb)->igmp = 1; ih = igmp_hdr(skb); + BR_INPUT_SKB_CB(skb)->igmp = ih->type; switch (ih->type) { case IGMP_HOST_MEMBERSHIP_REPORT: @@ -1625,6 +1676,9 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, if (skb_trimmed && skb_trimmed != skb) kfree_skb(skb_trimmed); + br_multicast_count(br, port, skb->protocol, BR_INPUT_SKB_CB(skb)->igmp, + BR_MCAST_DIR_RX); + return err; } @@ -1645,11 +1699,12 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, BR_INPUT_SKB_CB(skb)->mrouters_only = 1; return 0; } else if (err < 0) { + br_multicast_err_count(br, port, skb->protocol); return err; } - BR_INPUT_SKB_CB(skb)->igmp = 1; mld = (struct mld_msg *)skb_transport_header(skb); + BR_INPUT_SKB_CB(skb)->igmp = mld->mld_type; switch (mld->mld_type) { case ICMPV6_MGM_REPORT: @@ -1670,6 +1725,9 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, if (skb_trimmed && skb_trimmed != skb) kfree_skb(skb_trimmed); + br_multicast_count(br, port, skb->protocol, BR_INPUT_SKB_CB(skb)->igmp, + BR_MCAST_DIR_RX); + return err; } #endif @@ -1677,6 +1735,8 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, struct sk_buff *skb, u16 vid) { + int ret = 0; + BR_INPUT_SKB_CB(skb)->igmp = 0; BR_INPUT_SKB_CB(skb)->mrouters_only = 0; @@ -1685,14 +1745,16 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, switch (skb->protocol) { case htons(ETH_P_IP): - return br_multicast_ipv4_rcv(br, port, skb, vid); + ret = br_multicast_ipv4_rcv(br, port, skb, vid); + break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): - return br_multicast_ipv6_rcv(br, port, skb, vid); + ret = br_multicast_ipv6_rcv(br, port, skb, vid); + break; #endif } - return 0; + return ret; } static void br_multicast_query_expired(struct net_bridge *br, @@ -1831,6 +1893,8 @@ void br_multicast_dev_del(struct net_bridge *br) out: spin_unlock_bh(&br->multicast_lock); + + free_percpu(br->mcast_stats); } int br_multicast_set_router(struct net_bridge *br, unsigned long val) @@ -2185,3 +2249,128 @@ unlock: return ret; } EXPORT_SYMBOL_GPL(br_multicast_has_querier_adjacent); + +static void br_mcast_stats_add(struct bridge_mcast_stats __percpu *stats, + __be16 proto, u8 type, u8 dir) +{ + struct bridge_mcast_stats *pstats = this_cpu_ptr(stats); + + u64_stats_update_begin(&pstats->syncp); + switch (proto) { + case htons(ETH_P_IP): + switch (type) { + case IGMP_HOST_MEMBERSHIP_REPORT: + pstats->mstats.igmp_v1reports[dir]++; + break; + case IGMPV2_HOST_MEMBERSHIP_REPORT: + pstats->mstats.igmp_v2reports[dir]++; + break; + case IGMPV3_HOST_MEMBERSHIP_REPORT: + pstats->mstats.igmp_v3reports[dir]++; + break; + case IGMP_HOST_MEMBERSHIP_QUERY: + pstats->mstats.igmp_queries[dir]++; + break; + case IGMP_HOST_LEAVE_MESSAGE: + pstats->mstats.igmp_leaves[dir]++; + break; + } + break; +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + switch (type) { + case ICMPV6_MGM_REPORT: + pstats->mstats.mld_v1reports[dir]++; + break; + case ICMPV6_MLD2_REPORT: + pstats->mstats.mld_v2reports[dir]++; + break; + case ICMPV6_MGM_QUERY: + pstats->mstats.mld_queries[dir]++; + break; + case ICMPV6_MGM_REDUCTION: + pstats->mstats.mld_leaves[dir]++; + break; + } + break; +#endif /* CONFIG_IPV6 */ + } + u64_stats_update_end(&pstats->syncp); +} + +void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, + __be16 proto, u8 type, u8 dir) +{ + struct bridge_mcast_stats __percpu *stats; + + /* if multicast_disabled is true then igmp type can't be set */ + if (!type || !br->multicast_stats_enabled) + return; + + if (p) + stats = p->mcast_stats; + else + stats = br->mcast_stats; + if (WARN_ON(!stats)) + return; + + br_mcast_stats_add(stats, proto, type, dir); +} + +int br_multicast_init_stats(struct net_bridge *br) +{ + br->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats); + if (!br->mcast_stats) + return -ENOMEM; + + return 0; +} + +static void mcast_stats_add_dir(u64 *dst, u64 *src) +{ + dst[BR_MCAST_DIR_RX] += src[BR_MCAST_DIR_RX]; + dst[BR_MCAST_DIR_TX] += src[BR_MCAST_DIR_TX]; +} + +void br_multicast_get_stats(const struct net_bridge *br, + const struct net_bridge_port *p, + struct br_mcast_stats *dest) +{ + struct bridge_mcast_stats __percpu *stats; + struct br_mcast_stats tdst; + int i; + + memset(dest, 0, sizeof(*dest)); + if (p) + stats = p->mcast_stats; + else + stats = br->mcast_stats; + if (WARN_ON(!stats)) + return; + + memset(&tdst, 0, sizeof(tdst)); + for_each_possible_cpu(i) { + struct bridge_mcast_stats *cpu_stats = per_cpu_ptr(stats, i); + struct br_mcast_stats temp; + unsigned int start; + + do { + start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + memcpy(&temp, &cpu_stats->mstats, sizeof(temp)); + } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); + + mcast_stats_add_dir(tdst.igmp_queries, temp.igmp_queries); + mcast_stats_add_dir(tdst.igmp_leaves, temp.igmp_leaves); + mcast_stats_add_dir(tdst.igmp_v1reports, temp.igmp_v1reports); + mcast_stats_add_dir(tdst.igmp_v2reports, temp.igmp_v2reports); + mcast_stats_add_dir(tdst.igmp_v3reports, temp.igmp_v3reports); + tdst.igmp_parse_errors += temp.igmp_parse_errors; + + mcast_stats_add_dir(tdst.mld_queries, temp.mld_queries); + mcast_stats_add_dir(tdst.mld_leaves, temp.mld_leaves); + mcast_stats_add_dir(tdst.mld_v1reports, temp.mld_v1reports); + mcast_stats_add_dir(tdst.mld_v2reports, temp.mld_v2reports); + tdst.mld_parse_errors += temp.mld_parse_errors; + } + memcpy(dest, &tdst, sizeof(*dest)); +} diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index ed75ff9ff9e6..f2a29e467e78 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -851,6 +851,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_NF_CALL_ARPTABLES] = { .type = NLA_U8 }, [IFLA_BR_VLAN_DEFAULT_PVID] = { .type = NLA_U16 }, [IFLA_BR_VLAN_STATS_ENABLED] = { .type = NLA_U8 }, + [IFLA_BR_MCAST_STATS_ENABLED] = { .type = NLA_U8 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -1055,6 +1056,13 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], br->multicast_startup_query_interval = clock_t_to_jiffies(val); } + + if (data[IFLA_BR_MCAST_STATS_ENABLED]) { + __u8 mcast_stats; + + mcast_stats = nla_get_u8(data[IFLA_BR_MCAST_STATS_ENABLED]); + br->multicast_stats_enabled = !!mcast_stats; + } #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (data[IFLA_BR_NF_CALL_IPTABLES]) { @@ -1110,6 +1118,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_SNOOPING */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERY_USE_IFADDR */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_QUERIER */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_STATS_ENABLED */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_ELASTICITY */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_MAX */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_LAST_MEMBER_CNT */ @@ -1187,6 +1196,8 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR, br->multicast_query_use_ifaddr) || nla_put_u8(skb, IFLA_BR_MCAST_QUERIER, br->multicast_querier) || + nla_put_u8(skb, IFLA_BR_MCAST_STATS_ENABLED, + br->multicast_stats_enabled) || nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, br->hash_elasticity) || nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) || @@ -1242,21 +1253,21 @@ static size_t bridge_get_linkxstats_size(const struct net_device *dev) int numvls = 0; vg = br_vlan_group(br); - if (!vg) - return 0; - - /* we need to count all, even placeholder entries */ - list_for_each_entry(v, &vg->vlan_list, vlist) - numvls++; + if (vg) { + /* we need to count all, even placeholder entries */ + list_for_each_entry(v, &vg->vlan_list, vlist) + numvls++; + } - /* account for the vlans and the link xstats type nest attribute */ return numvls * nla_total_size(sizeof(struct bridge_vlan_xstats)) + + nla_total_size(sizeof(struct br_mcast_stats)) + nla_total_size(0); } static size_t brport_get_linkxstats_size(const struct net_device *dev) { - return nla_total_size(0); + return nla_total_size(sizeof(struct br_mcast_stats)) + + nla_total_size(0); } static size_t br_get_linkxstats_size(const struct net_device *dev, int attr) @@ -1280,37 +1291,50 @@ static int bridge_fill_linkxstats(struct sk_buff *skb, int *prividx) { struct net_bridge *br = netdev_priv(dev); + struct nlattr *nla __maybe_unused; struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; struct nlattr *nest; int vl_idx = 0; - vg = br_vlan_group(br); - if (!vg) - goto out; nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BRIDGE); if (!nest) return -EMSGSIZE; - list_for_each_entry(v, &vg->vlan_list, vlist) { - struct bridge_vlan_xstats vxi; - struct br_vlan_stats stats; - if (++vl_idx < *prividx) - continue; - memset(&vxi, 0, sizeof(vxi)); - vxi.vid = v->vid; - br_vlan_get_stats(v, &stats); - vxi.rx_bytes = stats.rx_bytes; - vxi.rx_packets = stats.rx_packets; - vxi.tx_bytes = stats.tx_bytes; - vxi.tx_packets = stats.tx_packets; - - if (nla_put(skb, BRIDGE_XSTATS_VLAN, sizeof(vxi), &vxi)) + vg = br_vlan_group(br); + if (vg) { + list_for_each_entry(v, &vg->vlan_list, vlist) { + struct bridge_vlan_xstats vxi; + struct br_vlan_stats stats; + + if (++vl_idx < *prividx) + continue; + memset(&vxi, 0, sizeof(vxi)); + vxi.vid = v->vid; + br_vlan_get_stats(v, &stats); + vxi.rx_bytes = stats.rx_bytes; + vxi.rx_packets = stats.rx_packets; + vxi.tx_bytes = stats.tx_bytes; + vxi.tx_packets = stats.tx_packets; + + if (nla_put(skb, BRIDGE_XSTATS_VLAN, sizeof(vxi), &vxi)) + goto nla_put_failure; + } + } + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (++vl_idx >= *prividx) { + nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_MCAST, + sizeof(struct br_mcast_stats), + BRIDGE_XSTATS_PAD); + if (!nla) goto nla_put_failure; + br_multicast_get_stats(br, NULL, nla_data(nla)); } +#endif nla_nest_end(skb, nest); *prividx = 0; -out: + return 0; nla_put_failure: @@ -1324,11 +1348,26 @@ static int brport_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev, int *prividx) { + struct net_bridge_port *p = br_port_get_rtnl(dev); + struct nlattr *nla __maybe_unused; struct nlattr *nest; + if (!p) + return 0; + nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BRIDGE); if (!nest) return -EMSGSIZE; +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_MCAST, + sizeof(struct br_mcast_stats), + BRIDGE_XSTATS_PAD); + if (!nla) { + nla_nest_end(skb, nest); + return -EMSGSIZE; + } + br_multicast_get_stats(p->br, p, nla_data(nla)); +#endif nla_nest_end(skb, nest); return 0; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 52edecf3c294..4dc851166ad1 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -75,6 +75,12 @@ struct bridge_mcast_querier { struct br_ip addr; struct net_bridge_port __rcu *port; }; + +/* IGMP/MLD statistics */ +struct bridge_mcast_stats { + struct br_mcast_stats mstats; + struct u64_stats_sync syncp; +}; #endif struct br_vlan_stats { @@ -229,6 +235,7 @@ struct net_bridge_port struct bridge_mcast_own_query ip6_own_query; #endif /* IS_ENABLED(CONFIG_IPV6) */ unsigned char multicast_router; + struct bridge_mcast_stats __percpu *mcast_stats; struct timer_list multicast_router_timer; struct hlist_head mglist; struct hlist_node rlist; @@ -315,6 +322,7 @@ struct net_bridge u8 multicast_querier:1; u8 multicast_query_use_ifaddr:1; u8 has_ipv6_addr:1; + u8 multicast_stats_enabled:1; u32 hash_elasticity; u32 hash_max; @@ -337,6 +345,7 @@ struct net_bridge struct bridge_mcast_other_query ip4_other_query; struct bridge_mcast_own_query ip4_own_query; struct bridge_mcast_querier ip4_querier; + struct bridge_mcast_stats __percpu *mcast_stats; #if IS_ENABLED(CONFIG_IPV6) struct bridge_mcast_other_query ip6_other_query; struct bridge_mcast_own_query ip6_own_query; @@ -543,7 +552,7 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, struct sk_buff *skb, u16 vid); struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, struct sk_buff *skb, u16 vid); -void br_multicast_add_port(struct net_bridge_port *port); +int br_multicast_add_port(struct net_bridge_port *port); void br_multicast_del_port(struct net_bridge_port *port); void br_multicast_enable_port(struct net_bridge_port *port); void br_multicast_disable_port(struct net_bridge_port *port); @@ -576,6 +585,12 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, struct br_ip *group, int type, u8 flags); void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port, int type); +void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, + __be16 proto, u8 type, u8 dir); +int br_multicast_init_stats(struct net_bridge *br); +void br_multicast_get_stats(const struct net_bridge *br, + const struct net_bridge_port *p, + struct br_mcast_stats *dest); #define mlock_dereference(X, br) \ rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) @@ -623,6 +638,11 @@ static inline bool br_multicast_querier_exists(struct net_bridge *br, return false; } } + +static inline int br_multicast_igmp_type(const struct sk_buff *skb) +{ + return BR_INPUT_SKB_CB(skb)->igmp; +} #else static inline int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, @@ -638,8 +658,9 @@ static inline struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, return NULL; } -static inline void br_multicast_add_port(struct net_bridge_port *port) +static inline int br_multicast_add_port(struct net_bridge_port *port) { + return 0; } static inline void br_multicast_del_port(struct net_bridge_port *port) @@ -695,6 +716,22 @@ static inline void br_mdb_init(void) static inline void br_mdb_uninit(void) { } + +static inline void br_multicast_count(struct net_bridge *br, + const struct net_bridge_port *p, + __be16 proto, u8 type, u8 dir) +{ +} + +static inline int br_multicast_init_stats(struct net_bridge *br) +{ + return 0; +} + +static inline int br_multicast_igmp_type(const struct sk_buff *skb) +{ + return 0; +} #endif /* br_vlan.c */ diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index beb47071e38d..e120307c6e36 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -618,6 +618,30 @@ static ssize_t multicast_startup_query_interval_store( return store_bridge_parm(d, buf, len, set_startup_query_interval); } static DEVICE_ATTR_RW(multicast_startup_query_interval); + +static ssize_t multicast_stats_enabled_show(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + + return sprintf(buf, "%u\n", br->multicast_stats_enabled); +} + +static int set_stats_enabled(struct net_bridge *br, unsigned long val) +{ + br->multicast_stats_enabled = !!val; + return 0; +} + +static ssize_t multicast_stats_enabled_store(struct device *d, + struct device_attribute *attr, + const char *buf, + size_t len) +{ + return store_bridge_parm(d, buf, len, set_stats_enabled); +} +static DEVICE_ATTR_RW(multicast_stats_enabled); #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static ssize_t nf_call_iptables_show( @@ -784,6 +808,7 @@ static struct attribute *bridge_attrs[] = { &dev_attr_multicast_query_interval.attr, &dev_attr_multicast_query_response_interval.attr, &dev_attr_multicast_startup_query_interval.attr, + &dev_attr_multicast_stats_enabled.attr, #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) &dev_attr_nf_call_iptables.attr, -- cgit v1.2.3 From b1ed4c4fa9a5ccf325184fd90edc50978ef6e33a Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Mon, 27 Jun 2016 15:33:56 -0700 Subject: tcp: add an ability to dump and restore window parameters We found that sometimes a restored tcp socket doesn't work. A reason of this bug is incorrect window parameters and in this case tcp_acceptable_seq() returns tcp_wnd_end(tp) instead of tp->snd_nxt. The other side drops packets with this seq, because seq is less than tp->rcv_nxt ( tcp_sequence() ). Data from a send queue is sent only if there is enough space in a window, so when we restore unacked data, we need to expand a window to fit this data. This was in a first version of this patch: "tcp: extend window to fit all restored unacked data in a send queue" Then Alexey recommended me to restore window parameters instead of adjusted them according with data in a sent queue. This sounds resonable. rcv_wnd has to be restored, because it was reported to another side and the offered window is never shrunk. One of reasons why we need to restore snd_wnd was described above. Cc: Pavel Emelyanov Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Signed-off-by: Andrey Vagin Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5c7ed147449c..108ef2a6665c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2277,6 +2277,38 @@ static inline bool tcp_can_repair_sock(const struct sock *sk) ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED)); } +static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len) +{ + struct tcp_repair_window opt; + + if (!tp->repair) + return -EPERM; + + if (len != sizeof(opt)) + return -EINVAL; + + if (copy_from_user(&opt, optbuf, sizeof(opt))) + return -EFAULT; + + if (opt.max_window < opt.snd_wnd) + return -EINVAL; + + if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd)) + return -EINVAL; + + if (after(opt.rcv_wup, tp->rcv_nxt)) + return -EINVAL; + + tp->snd_wl1 = opt.snd_wl1; + tp->snd_wnd = opt.snd_wnd; + tp->max_window = opt.max_window; + + tp->rcv_wnd = opt.rcv_wnd; + tp->rcv_wup = opt.rcv_wup; + + return 0; +} + static int tcp_repair_options_est(struct tcp_sock *tp, struct tcp_repair_opt __user *optbuf, unsigned int len) { @@ -2604,6 +2636,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level, else tp->tsoffset = val - tcp_time_stamp; break; + case TCP_REPAIR_WINDOW: + err = tcp_repair_set_window(tp, optval, optlen); + break; case TCP_NOTSENT_LOWAT: tp->notsent_lowat = val; sk->sk_write_space(sk); @@ -2860,6 +2895,28 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EINVAL; break; + case TCP_REPAIR_WINDOW: { + struct tcp_repair_window opt; + + if (get_user(len, optlen)) + return -EFAULT; + + if (len != sizeof(opt)) + return -EINVAL; + + if (!tp->repair) + return -EPERM; + + opt.snd_wl1 = tp->snd_wl1; + opt.snd_wnd = tp->snd_wnd; + opt.max_window = tp->max_window; + opt.rcv_wnd = tp->rcv_wnd; + opt.rcv_wup = tp->rcv_wup; + + if (copy_to_user(optval, &opt, len)) + return -EFAULT; + return 0; + } case TCP_QUEUE_SEQ: if (tp->repair_queue == TCP_SEND_QUEUE) val = tp->write_seq; -- cgit v1.2.3 From 153380ec4b9b6802bba61ebd34da432a54994e9d Mon Sep 17 00:00:00 2001 From: Mateusz Bajorski Date: Wed, 29 Jun 2016 09:22:10 +0200 Subject: fib_rules: Added NLM_F_EXCL support to fib_nl_newrule When adding rule with NLM_F_EXCL flag then check if the same rule exist. If yes then exit with -EEXIST. This is already implemented in iproute2: if (cmd == RTM_NEWRULE) { req.n.nlmsg_flags |= NLM_F_CREATE|NLM_F_EXCL; req.r.rtm_type = RTN_UNICAST; } Tested ipv4 and ipv6 with net-next linux on qemu x86 expected behavior after patch: localhost ~ # ip rule 0: from all lookup local 32766: from all lookup main 32767: from all lookup default localhost ~ # ip rule add from 10.46.177.97 lookup 104 pref 1005 localhost ~ # ip rule add from 10.46.177.97 lookup 104 pref 1005 RTNETLINK answers: File exists localhost ~ # ip rule 0: from all lookup local 1005: from 10.46.177.97 lookup 104 32766: from all lookup main 32767: from all lookup default There was already topic regarding this but I don't see any changes merged and problem still occurs. https://lkml.kernel.org/r/1135778809.5944.7.camel+%28%29+localhost+%21+localdomain Signed-off-by: Mateusz Bajorski Acked-by: David Ahern Signed-off-by: David S. Miller --- net/core/fib_rules.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'net') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 98298b11f534..be4629c344a6 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -269,6 +269,49 @@ errout: return err; } +static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, + struct nlattr **tb, struct fib_rule *rule) +{ + struct fib_rule *r; + + list_for_each_entry(r, &ops->rules_list, list) { + if (r->action != rule->action) + continue; + + if (r->table != rule->table) + continue; + + if (r->pref != rule->pref) + continue; + + if (memcmp(r->iifname, rule->iifname, IFNAMSIZ)) + continue; + + if (memcmp(r->oifname, rule->oifname, IFNAMSIZ)) + continue; + + if (r->mark != rule->mark) + continue; + + if (r->mark_mask != rule->mark_mask) + continue; + + if (r->tun_id != rule->tun_id) + continue; + + if (r->fr_net != rule->fr_net) + continue; + + if (r->l3mdev != rule->l3mdev) + continue; + + if (!ops->compare(r, frh, tb)) + continue; + return 1; + } + return 0; +} + int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); @@ -386,6 +429,12 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh) if (rule->l3mdev && rule->table) goto errout_free; + if ((nlh->nlmsg_flags & NLM_F_EXCL) && + rule_exists(ops, frh, tb, rule)) { + err = -EEXIST; + goto errout_free; + } + err = ops->configure(rule, skb, frh, tb); if (err < 0) goto errout_free; -- cgit v1.2.3 From ac5d26836cb6c01505d186180a79b4362ee7b4ac Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 1 Jul 2016 08:35:02 +0100 Subject: rxrpc: Fix processing of authenticated/encrypted jumbo packets When a jumbo packet is being split up and processed, the crypto checksum for each split-out packet is in the jumbo header and needs placing in the reconstructed packet header. When the code was changed to keep the stored copy of the packet header in host byte order, this reconstruction was missed. Found with sparse with CF=-D__CHECK_ENDIAN__: ../net/rxrpc/input.c:479:33: warning: incorrect type in assignment (different base types) ../net/rxrpc/input.c:479:33: expected unsigned short [unsigned] [usertype] _rsvd ../net/rxrpc/input.c:479:33: got restricted __be16 [addressable] [usertype] _rsvd Fixes: 0d12f8a4027d021c9cc942f09f38d28288020c5d ("rxrpc: Keep the skb private record of the Rx header in host byte order") Signed-off-by: David Howells --- net/rxrpc/input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index f4bd57b77b93..5f26cae43069 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -476,7 +476,7 @@ static void rxrpc_process_jumbo_packet(struct rxrpc_call *call, sp->hdr.seq += 1; sp->hdr.serial += 1; sp->hdr.flags = jhdr.flags; - sp->hdr._rsvd = jhdr._rsvd; + sp->hdr._rsvd = ntohs(jhdr._rsvd); _proto("Rx DATA Jumbo %%%u", sp->hdr.serial - 1); -- cgit v1.2.3 From 19689e38eca5d7b32755182d4e62efd7a5376c45 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 Jun 2016 18:51:53 +0200 Subject: tcp: md5: use kmalloc() backed scratch areas Some arches have virtually mapped kernel stacks, or will soon have. tcp_md5_hash_header() uses an automatic variable to copy tcp header before mangling th->check and calling crypto function, which might be problematic on such arches. David says that using percpu storage is also problematic on non SMP builds. Just use kmalloc() to allocate scratch areas. Signed-off-by: Eric Dumazet Reported-by: Andy Lutomirski Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 10 ++++++++++ net/ipv4/tcp_ipv4.c | 31 ++++++++++++++----------------- net/ipv6/tcp_ipv6.c | 29 ++++++++++++++++------------- 3 files changed, 40 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 108ef2a6665c..032a96d78c99 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3026,8 +3026,18 @@ static void __tcp_alloc_md5sig_pool(void) return; for_each_possible_cpu(cpu) { + void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch; struct ahash_request *req; + if (!scratch) { + scratch = kmalloc_node(sizeof(union tcp_md5sum_block) + + sizeof(struct tcphdr), + GFP_KERNEL, + cpu_to_node(cpu)); + if (!scratch) + return; + per_cpu(tcp_md5sig_pool, cpu).scratch = scratch; + } if (per_cpu(tcp_md5sig_pool, cpu).md5_req) continue; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3708de2a6683..32b048e524d6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1018,27 +1018,28 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, GFP_KERNEL); } -static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, - __be32 daddr, __be32 saddr, int nbytes) +static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, + __be32 daddr, __be32 saddr, + const struct tcphdr *th, int nbytes) { struct tcp4_pseudohdr *bp; struct scatterlist sg; + struct tcphdr *_th; - bp = &hp->md5_blk.ip4; - - /* - * 1. the TCP pseudo-header (in the order: source IP address, - * destination IP address, zero-padded protocol number, and - * segment length) - */ + bp = hp->scratch; bp->saddr = saddr; bp->daddr = daddr; bp->pad = 0; bp->protocol = IPPROTO_TCP; bp->len = cpu_to_be16(nbytes); - sg_init_one(&sg, bp, sizeof(*bp)); - ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp)); + _th = (struct tcphdr *)(bp + 1); + memcpy(_th, th, sizeof(*th)); + _th->check = 0; + + sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); + ahash_request_set_crypt(hp->md5_req, &sg, NULL, + sizeof(*bp) + sizeof(*th)); return crypto_ahash_update(hp->md5_req); } @@ -1055,9 +1056,7 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, if (crypto_ahash_init(req)) goto clear_hash; - if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2)) - goto clear_hash; - if (tcp_md5_hash_header(hp, th)) + if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) goto clear_hash; if (tcp_md5_hash_key(hp, key)) goto clear_hash; @@ -1101,9 +1100,7 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, if (crypto_ahash_init(req)) goto clear_hash; - if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len)) - goto clear_hash; - if (tcp_md5_hash_header(hp, th)) + if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len)) goto clear_hash; if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) goto clear_hash; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2255d2bf5f6b..37cf91323319 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -526,26 +526,33 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, char __user *optval, AF_INET6, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); } -static int tcp_v6_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, - const struct in6_addr *daddr, - const struct in6_addr *saddr, int nbytes) +static int tcp_v6_md5_hash_headers(struct tcp_md5sig_pool *hp, + const struct in6_addr *daddr, + const struct in6_addr *saddr, + const struct tcphdr *th, int nbytes) { struct tcp6_pseudohdr *bp; struct scatterlist sg; + struct tcphdr *_th; - bp = &hp->md5_blk.ip6; + bp = hp->scratch; /* 1. TCP pseudo-header (RFC2460) */ bp->saddr = *saddr; bp->daddr = *daddr; bp->protocol = cpu_to_be32(IPPROTO_TCP); bp->len = cpu_to_be32(nbytes); - sg_init_one(&sg, bp, sizeof(*bp)); - ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp)); + _th = (struct tcphdr *)(bp + 1); + memcpy(_th, th, sizeof(*th)); + _th->check = 0; + + sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); + ahash_request_set_crypt(hp->md5_req, &sg, NULL, + sizeof(*bp) + sizeof(*th)); return crypto_ahash_update(hp->md5_req); } -static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, +static int tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, const struct in6_addr *daddr, struct in6_addr *saddr, const struct tcphdr *th) { @@ -559,9 +566,7 @@ static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, if (crypto_ahash_init(req)) goto clear_hash; - if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2)) - goto clear_hash; - if (tcp_md5_hash_header(hp, th)) + if (tcp_v6_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) goto clear_hash; if (tcp_md5_hash_key(hp, key)) goto clear_hash; @@ -606,9 +611,7 @@ static int tcp_v6_md5_hash_skb(char *md5_hash, if (crypto_ahash_init(req)) goto clear_hash; - if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, skb->len)) - goto clear_hash; - if (tcp_md5_hash_header(hp, th)) + if (tcp_v6_md5_hash_headers(hp, daddr, saddr, th, skb->len)) goto clear_hash; if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) goto clear_hash; -- cgit v1.2.3 From 12d0ad3be9c3854e52ec74bb83bb6f43612827c7 Mon Sep 17 00:00:00 2001 From: Michal Soltys Date: Thu, 30 Jun 2016 02:26:44 +0200 Subject: net/sched/sch_hfsc.c: handle corner cases where head may change invalidating calculated deadline Realtime scheduling implemented in HFSC uses head of the queue to make the decision about which packet to schedule next. But in case of any head drop, the deadline calculated for the previous head is not necessarily correct for the next head (unless both packets have the same length). Thanks to peek() function used during dequeue - which internally is a dequeue operation - hfsc is almost safe from this issue, as peek() dequeues and isolates the head storing it temporarily until the real dequeue happens. But there is one exception: if after the class activation a drop happens before the first dequeue operation, there's never a chance to do the peek(). Adding peek() call in enqueue - if this is the first packet in a new backlog period AND the scheduler has realtime curve defined - fixes that one corner case. The 1st hfsc_dequeue() will use that peeked packet, similarly as every subsequent hfsc_dequeue() call uses packet peeked by the previous call. Signed-off-by: Michal Soltys Signed-off-by: David S. Miller --- net/sched/sch_hfsc.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 8cb5eff7b79c..6d6df6b2d38f 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1594,8 +1594,17 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) return err; } - if (cl->qdisc->q.qlen == 1) + if (cl->qdisc->q.qlen == 1) { set_active(cl, qdisc_pkt_len(skb)); + /* + * If this is the first packet, isolate the head so an eventual + * head drop before the first dequeue operation has no chance + * to invalidate the deadline. + */ + if (cl->cl_flags & HFSC_RSC) + cl->qdisc->ops->peek(cl->qdisc); + + } qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; -- cgit v1.2.3 From d1d0fc5e4c6822c5dadd9389297c7c1b8eea314f Mon Sep 17 00:00:00 2001 From: Michal Soltys Date: Thu, 30 Jun 2016 02:26:45 +0200 Subject: net/sched/sch_hfsc.c: add unlikely() in qdisc_peek_len() The condition can only succeed on wrong configurations. Signed-off-by: Michal Soltys Signed-off-by: David S. Miller --- net/sched/sch_hfsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 6d6df6b2d38f..e2244bb78130 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -882,7 +882,7 @@ qdisc_peek_len(struct Qdisc *sch) unsigned int len; skb = sch->ops->peek(sch); - if (skb == NULL) { + if (unlikely(skb == NULL)) { qdisc_warn_nonwc("qdisc_peek_len", sch); return 0; } -- cgit v1.2.3 From 2354f056f6847125a95b42732ab481730389c099 Mon Sep 17 00:00:00 2001 From: Michal Soltys Date: Thu, 30 Jun 2016 02:26:46 +0200 Subject: net/sched/sch_hfsc.c: remove leftover dlist and droplist This is update to: commit a09ceb0e08140a ("sched: remove qdisc->drop") That commit removed qdisc->drop, but left alone dlist and droplist that no longer serve any meaningful purpose. Signed-off-by: Michal Soltys Signed-off-by: David S. Miller --- net/sched/sch_hfsc.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'net') diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index e2244bb78130..df07f060c0e9 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -130,7 +130,6 @@ struct hfsc_class { struct rb_node vt_node; /* parent's vt_tree member */ struct rb_root cf_tree; /* active children sorted by cl_f */ struct rb_node cf_node; /* parent's cf_heap member */ - struct list_head dlist; /* drop list member */ u64 cl_total; /* total work in bytes */ u64 cl_cumul; /* cumulative work in bytes done by @@ -177,8 +176,6 @@ struct hfsc_sched { struct hfsc_class root; /* root class */ struct Qdisc_class_hash clhash; /* class hash */ struct rb_root eligible; /* eligible tree */ - struct list_head droplist; /* active leaf class list (for - dropping) */ struct qdisc_watchdog watchdog; /* watchdog timer */ }; @@ -858,7 +855,6 @@ set_active(struct hfsc_class *cl, unsigned int len) if (cl->cl_flags & HFSC_FSC) init_vf(cl, len); - list_add_tail(&cl->dlist, &cl->sched->droplist); } static void @@ -867,8 +863,6 @@ set_passive(struct hfsc_class *cl) if (cl->cl_flags & HFSC_RSC) eltree_remove(cl); - list_del(&cl->dlist); - /* * vttree is now handled in update_vf() so that update_vf(cl, 0, 0) * needs to be called explicitly to remove a class from vttree. @@ -1443,7 +1437,6 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) if (err < 0) return err; q->eligible = RB_ROOT; - INIT_LIST_HEAD(&q->droplist); q->root.cl_common.classid = sch->handle; q->root.refcnt = 1; @@ -1527,7 +1520,6 @@ hfsc_reset_qdisc(struct Qdisc *sch) hfsc_reset_class(cl); } q->eligible = RB_ROOT; - INIT_LIST_HEAD(&q->droplist); qdisc_watchdog_cancel(&q->watchdog); sch->qstats.backlog = 0; sch->q.qlen = 0; -- cgit v1.2.3 From ab12cb4742cf608cfee43c84fe07fa56bd473dcb Mon Sep 17 00:00:00 2001 From: Michal Soltys Date: Thu, 30 Jun 2016 02:26:47 +0200 Subject: net/sched/sch_hfsc.c: go passive after vt update When a class is going passive, it should update its cl_vt first to be consistent with the last dequeue operation. Otherwise its cl_vt will be one packet behind and parent's cvtmax might not be updated as well. One possible side effect is if some class goes passive and subsequently goes active /without/ its parent going passive - with cl_vt lagging one packet behind - comparison made in init_vf() will be affected (same period). Signed-off-by: Michal Soltys Signed-off-by: David S. Miller --- net/sched/sch_hfsc.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index df07f060c0e9..4eef857bac4d 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -778,6 +778,20 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) else go_passive = 0; + /* update vt */ + cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total) + - cl->cl_vtoff + cl->cl_vtadj; + + /* + * if vt of the class is smaller than cvtmin, + * the class was skipped in the past due to non-fit. + * if so, we need to adjust vtadj. + */ + if (cl->cl_vt < cl->cl_parent->cl_cvtmin) { + cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt; + cl->cl_vt = cl->cl_parent->cl_cvtmin; + } + if (go_passive) { /* no more active child, going passive */ @@ -794,25 +808,10 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) continue; } - /* - * update vt and f - */ - cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total) - - cl->cl_vtoff + cl->cl_vtadj; - - /* - * if vt of the class is smaller than cvtmin, - * the class was skipped in the past due to non-fit. - * if so, we need to adjust vtadj. - */ - if (cl->cl_vt < cl->cl_parent->cl_cvtmin) { - cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt; - cl->cl_vt = cl->cl_parent->cl_cvtmin; - } - /* update the vt tree */ vttree_update(cl); + /* update f */ if (cl->cl_flags & HFSC_USC) { cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit, cl->cl_total); -- cgit v1.2.3 From 33ef84a77d7502359abd097a28dbeb67d5466a7c Mon Sep 17 00:00:00 2001 From: Michal Soltys Date: Thu, 30 Jun 2016 02:26:48 +0200 Subject: net/sched/sch_hfsc.c: anchor virtual curve at proper vt in hfsc_change_fsc() cl->cl_vt alone is relative only to the current backlog period, while the curve operates on cumulative virtual time. This patch adds missing cl->cl_vtoff. Signed-off-by: Michal Soltys Signed-off-by: David S. Miller --- net/sched/sch_hfsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 4eef857bac4d..dff92ea772fe 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -940,7 +940,7 @@ static void hfsc_change_fsc(struct hfsc_class *cl, struct tc_service_curve *fsc) { sc2isc(fsc, &cl->cl_fsc); - rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total); + rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vtoff + cl->cl_vt, cl->cl_total); cl->cl_flags |= HFSC_FSC; } -- cgit v1.2.3 From 08294a26e15d7baf1e14ee569e9f2bc82a7ae768 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 30 Jun 2016 14:45:35 +0800 Subject: net: introduce NETDEV_CHANGE_TX_QUEUE_LEN This patch introduces a new event - NETDEV_CHANGE_TX_QUEUE_LEN, this will be triggered when tx_queue_len. It could be used by net device who want to do some processing at that time. An example is tun who may want to resize tx array when tx_queue_len is changed. Cc: John Fastabend Signed-off-by: Jason Wang Acked-by: John Fastabend Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 15 ++++++++++++++- net/core/rtnetlink.c | 16 ++++++++++++---- 2 files changed, 26 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 7a0b616557ab..6e4f34721080 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -322,7 +322,20 @@ NETDEVICE_SHOW_RW(flags, fmt_hex); static int change_tx_queue_len(struct net_device *dev, unsigned long new_len) { - dev->tx_queue_len = new_len; + int res, orig_len = dev->tx_queue_len; + + if (new_len != orig_len) { + dev->tx_queue_len = new_len; + res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); + res = notifier_to_errno(res); + if (res) { + netdev_err(dev, + "refused to change device tx_queue_len\n"); + dev->tx_queue_len = orig_len; + return -EFAULT; + } + } + return 0; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index cfed7bc14ee6..a9e3805af739 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1927,11 +1927,19 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_TXQLEN]) { unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]); - - if (dev->tx_queue_len ^ value) + unsigned long orig_len = dev->tx_queue_len; + + if (dev->tx_queue_len ^ value) { + dev->tx_queue_len = value; + err = call_netdevice_notifiers( + NETDEV_CHANGE_TX_QUEUE_LEN, dev); + err = notifier_to_errno(err); + if (err) { + dev->tx_queue_len = orig_len; + goto errout; + } status |= DO_SETLINK_NOTIFY; - - dev->tx_queue_len = value; + } } if (tb[IFLA_OPERSTATE]) -- cgit v1.2.3 From 468b021b944922e8fe0a30b6b6e0532bb95e4edc Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 24 Jun 2016 19:48:30 +0200 Subject: netfilter: x_tables: simplify ip{6}table_mangle_hook() No need for a special case to handle NF_INET_POST_ROUTING, this is basically the same handling as for prerouting, input, forward. Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/iptable_mangle.c | 4 ---- net/ipv6/netfilter/ip6table_mangle.c | 4 ---- 2 files changed, 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 57fc97cdac70..aebdb337fd7e 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -87,10 +87,6 @@ iptable_mangle_hook(void *priv, { if (state->hook == NF_INET_LOCAL_OUT) return ipt_mangle_out(skb, state); - if (state->hook == NF_INET_POST_ROUTING) - return ipt_do_table(skb, state, - state->net->ipv4.iptable_mangle); - /* PREROUTING/INPUT/FORWARD: */ return ipt_do_table(skb, state, state->net->ipv4.iptable_mangle); } diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index cb2b28883252..2b1a9dcdbcb3 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -83,10 +83,6 @@ ip6table_mangle_hook(void *priv, struct sk_buff *skb, { if (state->hook == NF_INET_LOCAL_OUT) return ip6t_mangle_out(skb, state); - if (state->hook == NF_INET_POST_ROUTING) - return ip6t_do_table(skb, state, - state->net->ipv6.ip6table_mangle); - /* INPUT/FORWARD */ return ip6t_do_table(skb, state, state->net->ipv6.ip6table_mangle); } -- cgit v1.2.3 From 4ae89ad92477219b504a49966ee010fe8dcb85af Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 24 Jun 2016 11:32:26 -0700 Subject: etherdevice.h & bridge: netfilter: Add and use ether_addr_equal_masked There are code duplications of a masked ethernet address comparison here so make it a separate function instead. Miscellanea: o Neaten alignment of FWINV macro uses to make it clearer for the reader Signed-off-by: Joe Perches Acked-by: David S. Miller Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebt_arp.c | 17 +++++--------- net/bridge/netfilter/ebt_stp.c | 49 ++++++++++++++++++----------------------- net/bridge/netfilter/ebtables.c | 17 +++++--------- 3 files changed, 34 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebt_arp.c b/net/bridge/netfilter/ebt_arp.c index cd457b891b27..cca0a899ee15 100644 --- a/net/bridge/netfilter/ebt_arp.c +++ b/net/bridge/netfilter/ebt_arp.c @@ -65,7 +65,6 @@ ebt_arp_mt(const struct sk_buff *skb, struct xt_action_param *par) if (info->bitmask & (EBT_ARP_SRC_MAC | EBT_ARP_DST_MAC)) { const unsigned char *mp; unsigned char _mac[ETH_ALEN]; - uint8_t verdict, i; if (ah->ar_hln != ETH_ALEN || ah->ar_hrd != htons(ARPHRD_ETHER)) return false; @@ -74,11 +73,9 @@ ebt_arp_mt(const struct sk_buff *skb, struct xt_action_param *par) sizeof(_mac), &_mac); if (mp == NULL) return false; - verdict = 0; - for (i = 0; i < 6; i++) - verdict |= (mp[i] ^ info->smaddr[i]) & - info->smmsk[i]; - if (FWINV(verdict != 0, EBT_ARP_SRC_MAC)) + if (FWINV(!ether_addr_equal_masked(mp, info->smaddr, + info->smmsk), + EBT_ARP_SRC_MAC)) return false; } @@ -88,11 +85,9 @@ ebt_arp_mt(const struct sk_buff *skb, struct xt_action_param *par) sizeof(_mac), &_mac); if (mp == NULL) return false; - verdict = 0; - for (i = 0; i < 6; i++) - verdict |= (mp[i] ^ info->dmaddr[i]) & - info->dmmsk[i]; - if (FWINV(verdict != 0, EBT_ARP_DST_MAC)) + if (FWINV(!ether_addr_equal_masked(mp, info->dmaddr, + info->dmmsk), + EBT_ARP_DST_MAC)) return false; } } diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c index e77f90bf8db3..45f73d55422f 100644 --- a/net/bridge/netfilter/ebt_stp.c +++ b/net/bridge/netfilter/ebt_stp.c @@ -46,7 +46,6 @@ static bool ebt_filter_config(const struct ebt_stp_info *info, const struct ebt_stp_config_info *c; u16 v16; u32 v32; - int verdict, i; c = &info->config; if ((info->bitmask & EBT_STP_FLAGS) && @@ -54,66 +53,62 @@ static bool ebt_filter_config(const struct ebt_stp_info *info, return false; if (info->bitmask & EBT_STP_ROOTPRIO) { v16 = NR16(stpc->root); - if (FWINV(v16 < c->root_priol || - v16 > c->root_priou, EBT_STP_ROOTPRIO)) + if (FWINV(v16 < c->root_priol || v16 > c->root_priou, + EBT_STP_ROOTPRIO)) return false; } if (info->bitmask & EBT_STP_ROOTADDR) { - verdict = 0; - for (i = 0; i < 6; i++) - verdict |= (stpc->root[2+i] ^ c->root_addr[i]) & - c->root_addrmsk[i]; - if (FWINV(verdict != 0, EBT_STP_ROOTADDR)) + if (FWINV(!ether_addr_equal_masked(&stpc->root[2], c->root_addr, + c->root_addrmsk), + EBT_STP_ROOTADDR)) return false; } if (info->bitmask & EBT_STP_ROOTCOST) { v32 = NR32(stpc->root_cost); - if (FWINV(v32 < c->root_costl || - v32 > c->root_costu, EBT_STP_ROOTCOST)) + if (FWINV(v32 < c->root_costl || v32 > c->root_costu, + EBT_STP_ROOTCOST)) return false; } if (info->bitmask & EBT_STP_SENDERPRIO) { v16 = NR16(stpc->sender); - if (FWINV(v16 < c->sender_priol || - v16 > c->sender_priou, EBT_STP_SENDERPRIO)) + if (FWINV(v16 < c->sender_priol || v16 > c->sender_priou, + EBT_STP_SENDERPRIO)) return false; } if (info->bitmask & EBT_STP_SENDERADDR) { - verdict = 0; - for (i = 0; i < 6; i++) - verdict |= (stpc->sender[2+i] ^ c->sender_addr[i]) & - c->sender_addrmsk[i]; - if (FWINV(verdict != 0, EBT_STP_SENDERADDR)) + if (FWINV(!ether_addr_equal_masked(&stpc->sender[2], + c->sender_addr, + c->sender_addrmsk), + EBT_STP_SENDERADDR)) return false; } if (info->bitmask & EBT_STP_PORT) { v16 = NR16(stpc->port); - if (FWINV(v16 < c->portl || - v16 > c->portu, EBT_STP_PORT)) + if (FWINV(v16 < c->portl || v16 > c->portu, EBT_STP_PORT)) return false; } if (info->bitmask & EBT_STP_MSGAGE) { v16 = NR16(stpc->msg_age); - if (FWINV(v16 < c->msg_agel || - v16 > c->msg_ageu, EBT_STP_MSGAGE)) + if (FWINV(v16 < c->msg_agel || v16 > c->msg_ageu, + EBT_STP_MSGAGE)) return false; } if (info->bitmask & EBT_STP_MAXAGE) { v16 = NR16(stpc->max_age); - if (FWINV(v16 < c->max_agel || - v16 > c->max_ageu, EBT_STP_MAXAGE)) + if (FWINV(v16 < c->max_agel || v16 > c->max_ageu, + EBT_STP_MAXAGE)) return false; } if (info->bitmask & EBT_STP_HELLOTIME) { v16 = NR16(stpc->hello_time); - if (FWINV(v16 < c->hello_timel || - v16 > c->hello_timeu, EBT_STP_HELLOTIME)) + if (FWINV(v16 < c->hello_timel || v16 > c->hello_timeu, + EBT_STP_HELLOTIME)) return false; } if (info->bitmask & EBT_STP_FWDD) { v16 = NR16(stpc->forward_delay); - if (FWINV(v16 < c->forward_delayl || - v16 > c->forward_delayu, EBT_STP_FWDD)) + if (FWINV(v16 < c->forward_delayl || v16 > c->forward_delayu, + EBT_STP_FWDD)) return false; } return true; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 5a61f35412a0..5721a25be860 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -130,7 +130,6 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, const struct ethhdr *h = eth_hdr(skb); const struct net_bridge_port *p; __be16 ethproto; - int verdict, i; if (skb_vlan_tag_present(skb)) ethproto = htons(ETH_P_8021Q); @@ -157,19 +156,15 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, return 1; if (e->bitmask & EBT_SOURCEMAC) { - verdict = 0; - for (i = 0; i < 6; i++) - verdict |= (h->h_source[i] ^ e->sourcemac[i]) & - e->sourcemsk[i]; - if (FWINV2(verdict != 0, EBT_ISOURCE)) + if (FWINV2(!ether_addr_equal_masked(h->h_source, + e->sourcemac, e->sourcemsk), + EBT_ISOURCE)) return 1; } if (e->bitmask & EBT_DESTMAC) { - verdict = 0; - for (i = 0; i < 6; i++) - verdict |= (h->h_dest[i] ^ e->destmac[i]) & - e->destmsk[i]; - if (FWINV2(verdict != 0, EBT_IDEST)) + if (FWINV2(!ether_addr_equal_masked(h->h_dest, + e->destmac, e->destmsk), + EBT_IDEST)) return 1; } return 0; -- cgit v1.2.3 From f1504307b9ab60e73ba31eece4be8298ebc9c1b7 Mon Sep 17 00:00:00 2001 From: Moritz Sichert Date: Thu, 30 Jun 2016 11:46:28 +0200 Subject: netfilter: Remove references to obsolete CONFIG_IP_ROUTE_FWMARK This option was removed in commit 47dcf0cb1005 ("[NET]: Rethink mark field in struct flowi"). Signed-off-by: Moritz Sichert Signed-off-by: Pablo Neira Ayuso --- net/netfilter/Kconfig | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 95e757c377f9..9266ceebd112 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -609,9 +609,8 @@ config NETFILTER_XT_MARK The target allows you to create rules in the "mangle" table which alter the netfilter mark (nfmark) field associated with the packet. - Prior to routing, the nfmark can influence the routing method (see - "Use netfilter MARK value as routing key") and can also be used by - other subsystems to change their behavior. + Prior to routing, the nfmark can influence the routing method and can + also be used by other subsystems to change their behavior. config NETFILTER_XT_CONNMARK tristate 'ctmark target and match support' @@ -753,9 +752,8 @@ config NETFILTER_XT_TARGET_HMARK The target allows you to create rules in the "raw" and "mangle" tables which set the skbuff mark by means of hash calculation within a given - range. The nfmark can influence the routing method (see "Use netfilter - MARK value as routing key") and can also be used by other subsystems to - change their behaviour. + range. The nfmark can influence the routing method and can also be used + by other subsystems to change their behaviour. To compile it as a module, choose M here. If unsure, say N. -- cgit v1.2.3 From 113214be7f6c98dd6d0435e4765aea8dea91662c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 30 Jun 2016 17:24:44 +0200 Subject: bpf: refactor bpf_prog_get and type check into helper Since bpf_prog_get() and program type check is used in a couple of places, refactor this into a small helper function that we can make use of. Since the non RO prog->aux part is not used in performance critical paths and a program destruction via RCU is rather very unlikley when doing the put, we shouldn't have an issue just doing the bpf_prog_get() + prog->type != type check, but actually not taking the ref at all (due to being in fdget() / fdput() section of the bpf fd) is even cleaner and makes the diff smaller as well, so just go for that. Callsites are changed to make use of the new helper where possible. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 13 +------------ net/kcm/kcmsock.c | 8 +------- net/packet/af_packet.c | 6 +----- net/sched/act_bpf.c | 7 +------ net/sched/cls_bpf.c | 7 +------ 5 files changed, 5 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 76f9a4938be4..76fee35da244 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1301,21 +1301,10 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) { - struct bpf_prog *prog; - if (sock_flag(sk, SOCK_FILTER_LOCKED)) return ERR_PTR(-EPERM); - prog = bpf_prog_get(ufd); - if (IS_ERR(prog)) - return prog; - - if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) { - bpf_prog_put(prog); - return ERR_PTR(-EINVAL); - } - - return prog; + return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); } int sk_attach_bpf(u32 ufd, struct sock *sk) diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 0b68ba730a06..cb39e05b166c 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1765,18 +1765,12 @@ static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info) if (!csock) return -ENOENT; - prog = bpf_prog_get(info->bpf_fd); + prog = bpf_prog_get_type(info->bpf_fd, BPF_PROG_TYPE_SOCKET_FILTER); if (IS_ERR(prog)) { err = PTR_ERR(prog); goto out; } - if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) { - bpf_prog_put(prog); - err = -EINVAL; - goto out; - } - err = kcm_attach(sock, csock, prog); if (err) { bpf_prog_put(prog); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d1f3b9e977e5..48b58957adf4 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1588,13 +1588,9 @@ static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data, if (copy_from_user(&fd, data, len)) return -EFAULT; - new = bpf_prog_get(fd); + new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); if (IS_ERR(new)) return PTR_ERR(new); - if (new->type != BPF_PROG_TYPE_SOCKET_FILTER) { - bpf_prog_put(new); - return -EINVAL; - } __fanout_set_data_bpf(po->fanout, new); return 0; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index f7b6cf49ea6f..ef74bffa6101 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -223,15 +223,10 @@ static int tcf_bpf_init_from_efd(struct nlattr **tb, struct tcf_bpf_cfg *cfg) bpf_fd = nla_get_u32(tb[TCA_ACT_BPF_FD]); - fp = bpf_prog_get(bpf_fd); + fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_ACT); if (IS_ERR(fp)) return PTR_ERR(fp); - if (fp->type != BPF_PROG_TYPE_SCHED_ACT) { - bpf_prog_put(fp); - return -EINVAL; - } - if (tb[TCA_ACT_BPF_NAME]) { name = kmemdup(nla_data(tb[TCA_ACT_BPF_NAME]), nla_len(tb[TCA_ACT_BPF_NAME]), diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 7b342c779da7..c3002c2c68bb 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -272,15 +272,10 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, bpf_fd = nla_get_u32(tb[TCA_BPF_FD]); - fp = bpf_prog_get(bpf_fd); + fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS); if (IS_ERR(fp)) return PTR_ERR(fp); - if (fp->type != BPF_PROG_TYPE_SCHED_CLS) { - bpf_prog_put(fp); - return -EINVAL; - } - if (tb[TCA_BPF_NAME]) { name = kmemdup(nla_data(tb[TCA_BPF_NAME]), nla_len(tb[TCA_BPF_NAME]), -- cgit v1.2.3 From 4a482f34afcc162d8456f449b137ec2a95be60d8 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 30 Jun 2016 10:28:44 -0700 Subject: cgroup: bpf: Add bpf_skb_in_cgroup_proto Adds a bpf helper, bpf_skb_in_cgroup, to decide if a skb->sk belongs to a descendant of a cgroup2. It is similar to the feature added in netfilter: commit c38c4597e4bf ("netfilter: implement xt_cgroup cgroup2 path match") The user is expected to populate a BPF_MAP_TYPE_CGROUP_ARRAY which will be used by the bpf_skb_in_cgroup. Modifications to the bpf verifier is to ensure BPF_MAP_TYPE_CGROUP_ARRAY and bpf_skb_in_cgroup() are always used together. Signed-off-by: Martin KaFai Lau Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Tejun Heo Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 76fee35da244..54071cf70fb5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2239,6 +2239,40 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) } } +#ifdef CONFIG_SOCK_CGROUP_DATA +static u64 bpf_skb_in_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *)(long)r1; + struct bpf_map *map = (struct bpf_map *)(long)r2; + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct cgroup *cgrp; + struct sock *sk; + u32 i = (u32)r3; + + sk = skb->sk; + if (!sk || !sk_fullsock(sk)) + return -ENOENT; + + if (unlikely(i >= array->map.max_entries)) + return -E2BIG; + + cgrp = READ_ONCE(array->ptrs[i]); + if (unlikely(!cgrp)) + return -EAGAIN; + + return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), cgrp); +} + +static const struct bpf_func_proto bpf_skb_in_cgroup_proto = { + .func = bpf_skb_in_cgroup, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; +#endif + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { @@ -2307,6 +2341,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return bpf_get_event_output_proto(); case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; +#ifdef CONFIG_SOCK_CGROUP_DATA + case BPF_FUNC_skb_in_cgroup: + return &bpf_skb_in_cgroup_proto; +#endif default: return sk_filter_func_proto(func_id); } -- cgit v1.2.3 From 226f7a7d97e37220a442f52eb85ebff2cd6fc0d2 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 30 Jun 2016 16:11:10 -0700 Subject: RDS: Rework path specific indirections Refactor code to avoid separate indirections for single-path and multipath transports. All transports (both single and mp-capable) will get a pointer to the rds_conn_path, and can trivially derive the rds_connection from the ->cp_conn. Acked-by: Santosh Shilimkar Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/connection.c | 5 +---- net/rds/ib.c | 4 ++-- net/rds/ib.h | 4 ++-- net/rds/ib_cm.c | 3 ++- net/rds/ib_send.c | 3 ++- net/rds/loop.c | 4 ++-- net/rds/rds.h | 3 --- net/rds/send.c | 16 ++++------------ net/rds/tcp.c | 6 +++--- net/rds/tcp.h | 6 +++--- net/rds/tcp_connect.c | 7 ++++--- net/rds/tcp_send.c | 8 ++++---- 12 files changed, 29 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index a4b07c899d89..17c2f2591ac4 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -326,10 +326,7 @@ void rds_conn_shutdown(struct rds_conn_path *cp) wait_event(cp->cp_waitq, !test_bit(RDS_RECV_REFILL, &cp->cp_flags)); - if (!conn->c_trans->t_mp_capable) - conn->c_trans->conn_shutdown(conn); - else - conn->c_trans->conn_path_shutdown(cp); + conn->c_trans->conn_path_shutdown(cp); rds_conn_path_reset(cp); if (!rds_conn_path_transition(cp, RDS_CONN_DISCONNECTING, diff --git a/net/rds/ib.c b/net/rds/ib.c index 44946a681a8c..1b29ec9445fc 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -381,7 +381,7 @@ void rds_ib_exit(void) struct rds_transport rds_ib_transport = { .laddr_check = rds_ib_laddr_check, - .xmit_complete = rds_ib_xmit_complete, + .xmit_path_complete = rds_ib_xmit_path_complete, .xmit = rds_ib_xmit, .xmit_rdma = rds_ib_xmit_rdma, .xmit_atomic = rds_ib_xmit_atomic, @@ -389,7 +389,7 @@ struct rds_transport rds_ib_transport = { .conn_alloc = rds_ib_conn_alloc, .conn_free = rds_ib_conn_free, .conn_connect = rds_ib_conn_connect, - .conn_shutdown = rds_ib_conn_shutdown, + .conn_path_shutdown = rds_ib_conn_path_shutdown, .inc_copy_to_user = rds_ib_inc_copy_to_user, .inc_free = rds_ib_inc_free, .cm_initiate_connect = rds_ib_cm_initiate_connect, diff --git a/net/rds/ib.h b/net/rds/ib.h index 627fb79aee65..2051f4bd7a66 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -329,7 +329,7 @@ extern struct list_head ib_nodev_conns; int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp); void rds_ib_conn_free(void *arg); int rds_ib_conn_connect(struct rds_connection *conn); -void rds_ib_conn_shutdown(struct rds_connection *conn); +void rds_ib_conn_path_shutdown(struct rds_conn_path *cp); void rds_ib_state_change(struct sock *sk); int rds_ib_listen_init(void); void rds_ib_listen_stop(void); @@ -384,7 +384,7 @@ u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest); extern wait_queue_head_t rds_ib_ring_empty_wait; /* ib_send.c */ -void rds_ib_xmit_complete(struct rds_connection *conn); +void rds_ib_xmit_path_complete(struct rds_conn_path *cp); int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index e48bb1ba3dfc..e34ea0b5c16a 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -731,8 +731,9 @@ out: * so that it can be called at any point during startup. In fact it * can be called multiple times for a given connection. */ -void rds_ib_conn_shutdown(struct rds_connection *conn) +void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) { + struct rds_connection *conn = cp->cp_conn; struct rds_ib_connection *ic = conn->c_transport_data; int err = 0; diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 6e4110aa5135..84d90c97332f 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -980,8 +980,9 @@ out: return ret; } -void rds_ib_xmit_complete(struct rds_connection *conn) +void rds_ib_xmit_path_complete(struct rds_conn_path *cp) { + struct rds_connection *conn = cp->cp_conn; struct rds_ib_connection *ic = conn->c_transport_data; /* We may have a pending ACK or window update we were unable diff --git a/net/rds/loop.c b/net/rds/loop.c index 15f83db78f0c..318c21d7d8d4 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -156,7 +156,7 @@ static int rds_loop_conn_connect(struct rds_connection *conn) return 0; } -static void rds_loop_conn_shutdown(struct rds_connection *conn) +static void rds_loop_conn_path_shutdown(struct rds_conn_path *cp) { } @@ -189,7 +189,7 @@ struct rds_transport rds_loop_transport = { .conn_alloc = rds_loop_conn_alloc, .conn_free = rds_loop_conn_free, .conn_connect = rds_loop_conn_connect, - .conn_shutdown = rds_loop_conn_shutdown, + .conn_path_shutdown = rds_loop_conn_path_shutdown, .inc_copy_to_user = rds_message_inc_copy_to_user, .inc_free = rds_loop_inc_free, .t_name = "loopback", diff --git a/net/rds/rds.h b/net/rds/rds.h index 2e35b738176f..5bbad08262f5 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -455,11 +455,8 @@ struct rds_transport { int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); void (*conn_free)(void *data); int (*conn_connect)(struct rds_connection *conn); - void (*conn_shutdown)(struct rds_connection *conn); void (*conn_path_shutdown)(struct rds_conn_path *conn); - void (*xmit_prepare)(struct rds_connection *conn); void (*xmit_path_prepare)(struct rds_conn_path *cp); - void (*xmit_complete)(struct rds_connection *conn); void (*xmit_path_complete)(struct rds_conn_path *cp); int (*xmit)(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); diff --git a/net/rds/send.c b/net/rds/send.c index ee43d6b2ea8f..5a9caf1da896 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -183,12 +183,8 @@ restart: goto out; } - if (conn->c_trans->t_mp_capable) { - if (conn->c_trans->xmit_path_prepare) - conn->c_trans->xmit_path_prepare(cp); - } else if (conn->c_trans->xmit_prepare) { - conn->c_trans->xmit_prepare(conn); - } + if (conn->c_trans->xmit_path_prepare) + conn->c_trans->xmit_path_prepare(cp); /* * spin trying to push headers and data down the connection until @@ -403,12 +399,8 @@ restart: } over_batch: - if (conn->c_trans->t_mp_capable) { - if (conn->c_trans->xmit_path_complete) - conn->c_trans->xmit_path_complete(cp); - } else if (conn->c_trans->xmit_complete) { - conn->c_trans->xmit_complete(conn); - } + if (conn->c_trans->xmit_path_complete) + conn->c_trans->xmit_path_complete(cp); release_in_xmit(cp); /* Nuke any messages we decided not to retransmit. */ diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 5217d49ce6d6..b139630daaa4 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -340,14 +340,14 @@ static void rds_tcp_exit(void); struct rds_transport rds_tcp_transport = { .laddr_check = rds_tcp_laddr_check, - .xmit_prepare = rds_tcp_xmit_prepare, - .xmit_complete = rds_tcp_xmit_complete, + .xmit_path_prepare = rds_tcp_xmit_path_prepare, + .xmit_path_complete = rds_tcp_xmit_path_complete, .xmit = rds_tcp_xmit, .recv = rds_tcp_recv, .conn_alloc = rds_tcp_conn_alloc, .conn_free = rds_tcp_conn_free, .conn_connect = rds_tcp_conn_connect, - .conn_shutdown = rds_tcp_conn_shutdown, + .conn_path_shutdown = rds_tcp_conn_path_shutdown, .inc_copy_to_user = rds_tcp_inc_copy_to_user, .inc_free = rds_tcp_inc_free, .stats_info_copy = rds_tcp_stats_info_copy, diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 7940babf6c71..728abe22c9a3 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -61,7 +61,7 @@ void rds_tcp_accept_work(struct sock *sk); /* tcp_connect.c */ int rds_tcp_conn_connect(struct rds_connection *conn); -void rds_tcp_conn_shutdown(struct rds_connection *conn); +void rds_tcp_conn_path_shutdown(struct rds_conn_path *conn); void rds_tcp_state_change(struct sock *sk); /* tcp_listen.c */ @@ -80,8 +80,8 @@ void rds_tcp_inc_free(struct rds_incoming *inc); int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to); /* tcp_send.c */ -void rds_tcp_xmit_prepare(struct rds_connection *conn); -void rds_tcp_xmit_complete(struct rds_connection *conn); +void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp); +void rds_tcp_xmit_path_complete(struct rds_conn_path *cp); int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); void rds_tcp_write_space(struct sock *sk); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 96c2c4d17909..aa65c1631c4b 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -144,12 +144,13 @@ out: * callbacks to those set by TCP. Our callbacks won't execute again once we * hold the sock lock. */ -void rds_tcp_conn_shutdown(struct rds_connection *conn) +void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp) { - struct rds_tcp_connection *tc = conn->c_transport_data; + struct rds_tcp_connection *tc = cp->cp_transport_data; struct socket *sock = tc->t_sock; - rdsdebug("shutting down conn %p tc %p sock %p\n", conn, tc, sock); + rdsdebug("shutting down conn %p tc %p sock %p\n", + cp->cp_conn, tc, sock); if (sock) { sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN); diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 710f1aae97ad..52cda947457b 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -49,16 +49,16 @@ static void rds_tcp_cork(struct socket *sock, int val) set_fs(oldfs); } -void rds_tcp_xmit_prepare(struct rds_connection *conn) +void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp) { - struct rds_tcp_connection *tc = conn->c_transport_data; + struct rds_tcp_connection *tc = cp->cp_transport_data; rds_tcp_cork(tc->t_sock, 1); } -void rds_tcp_xmit_complete(struct rds_connection *conn) +void rds_tcp_xmit_path_complete(struct rds_conn_path *cp) { - struct rds_tcp_connection *tc = conn->c_transport_data; + struct rds_tcp_connection *tc = cp->cp_transport_data; rds_tcp_cork(tc->t_sock, 0); } -- cgit v1.2.3 From 26e4e6bb683028546f339018ab4cd394300a92a4 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 30 Jun 2016 16:11:11 -0700 Subject: RDS: TCP: Remove dead logic around c_passive in rds-tcp The c_passive bit is only intended for the IB transport and will never be encountered in rds-tcp, so remove the dead logic that predicates on this bit. Acked-by: Santosh Shilimkar Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/tcp.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'net') diff --git a/net/rds/tcp.c b/net/rds/tcp.c index b139630daaa4..c56fff28084f 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -329,11 +329,8 @@ static void rds_tcp_destroy_conns(void) INIT_LIST_HEAD(&rds_tcp_conn_list); spin_unlock_irq(&rds_tcp_conn_lock); - list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) { - if (tc->conn->c_passive) - rds_conn_destroy(tc->conn->c_passive); + list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) rds_conn_destroy(tc->conn); - } } static void rds_tcp_exit(void); @@ -512,8 +509,6 @@ static void rds_tcp_kill_sock(struct net *net) sk = tc->t_sock->sk; sk->sk_prot->disconnect(sk, 0); tcp_done(sk); - if (tc->conn->c_passive) - rds_conn_destroy(tc->conn->c_passive); rds_conn_destroy(tc->conn); } } -- cgit v1.2.3 From 02105b2ccdd6344146e0296172a9e0f17ff624ef Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 30 Jun 2016 16:11:12 -0700 Subject: RDS: TCP: Make rds_tcp_connection track the rds_conn_path The struct rds_tcp_connection is the transport-specific private data structure that tracks TCP information per rds_conn_path. Modify this structure to have a back-pointer to the rds_conn_path for which it is the ->cp_transport_data. Acked-by: Santosh Shilimkar Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/connection.c | 30 +++++++++++++++--------------- net/rds/tcp.c | 44 +++++++++++++++++++++++++------------------- net/rds/tcp.h | 6 +++--- net/rds/tcp_connect.c | 6 +++--- net/rds/tcp_listen.c | 4 ++-- 5 files changed, 48 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 17c2f2591ac4..1b0c2a783b5e 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -253,9 +253,12 @@ static struct rds_connection *__rds_conn_create(struct net *net, for (i = 0; i < RDS_MPATH_WORKERS; i++) { cp = &conn->c_path[i]; - trans->conn_free(cp->cp_transport_data); - if (!trans->t_mp_capable) - break; + /* The ->conn_alloc invocation may have + * allocated resource for all paths, so all + * of them may have to be freed here. + */ + if (cp->cp_transport_data) + trans->conn_free(cp->cp_transport_data); } kmem_cache_free(rds_conn_slab, conn); conn = found; @@ -367,6 +370,9 @@ static void rds_conn_path_destroy(struct rds_conn_path *cp) { struct rds_message *rm, *rtmp; + if (!cp->cp_transport_data) + return; + rds_conn_path_drop(cp); flush_work(&cp->cp_down_w); @@ -398,6 +404,8 @@ static void rds_conn_path_destroy(struct rds_conn_path *cp) void rds_conn_destroy(struct rds_connection *conn) { unsigned long flags; + int i; + struct rds_conn_path *cp; rdsdebug("freeing conn %p for %pI4 -> " "%pI4\n", conn, &conn->c_laddr, @@ -410,18 +418,10 @@ void rds_conn_destroy(struct rds_connection *conn) synchronize_rcu(); /* shut the connection down */ - if (!conn->c_trans->t_mp_capable) { - rds_conn_path_destroy(&conn->c_path[0]); - BUG_ON(!list_empty(&conn->c_path[0].cp_retrans)); - } else { - int i; - struct rds_conn_path *cp; - - for (i = 0; i < RDS_MPATH_WORKERS; i++) { - cp = &conn->c_path[i]; - rds_conn_path_destroy(cp); - BUG_ON(!list_empty(&cp->cp_retrans)); - } + for (i = 0; i < RDS_MPATH_WORKERS; i++) { + cp = &conn->c_path[i]; + rds_conn_path_destroy(cp); + BUG_ON(!list_empty(&cp->cp_retrans)); } /* diff --git a/net/rds/tcp.c b/net/rds/tcp.c index c56fff28084f..c6b47f670990 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -221,7 +221,7 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn) sock->sk->sk_data_ready = sock->sk->sk_user_data; tc->t_sock = sock; - tc->conn = conn; + tc->t_cpath = &conn->c_path[0]; tc->t_orig_data_ready = sock->sk->sk_data_ready; tc->t_orig_write_space = sock->sk->sk_write_space; tc->t_orig_state_change = sock->sk->sk_state_change; @@ -284,24 +284,29 @@ static int rds_tcp_laddr_check(struct net *net, __be32 addr) static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) { struct rds_tcp_connection *tc; + int i; - tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); - if (!tc) - return -ENOMEM; + for (i = 0; i < RDS_MPATH_WORKERS; i++) { + tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); + if (!tc) + return -ENOMEM; - mutex_init(&tc->t_conn_lock); - tc->t_sock = NULL; - tc->t_tinc = NULL; - tc->t_tinc_hdr_rem = sizeof(struct rds_header); - tc->t_tinc_data_rem = 0; + mutex_init(&tc->t_conn_path_lock); + tc->t_sock = NULL; + tc->t_tinc = NULL; + tc->t_tinc_hdr_rem = sizeof(struct rds_header); + tc->t_tinc_data_rem = 0; - conn->c_transport_data = tc; + conn->c_path[i].cp_transport_data = tc; + tc->t_cpath = &conn->c_path[i]; - spin_lock_irq(&rds_tcp_conn_lock); - list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list); - spin_unlock_irq(&rds_tcp_conn_lock); + spin_lock_irq(&rds_tcp_conn_lock); + list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list); + spin_unlock_irq(&rds_tcp_conn_lock); + rdsdebug("rds_conn_path [%d] tc %p\n", i, + conn->c_path[i].cp_transport_data); + } - rdsdebug("alloced tc %p\n", conn->c_transport_data); return 0; } @@ -330,7 +335,7 @@ static void rds_tcp_destroy_conns(void) spin_unlock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) - rds_conn_destroy(tc->conn); + rds_conn_destroy(tc->t_cpath->cp_conn); } static void rds_tcp_exit(void); @@ -498,7 +503,7 @@ static void rds_tcp_kill_sock(struct net *net) flush_work(&rtn->rds_tcp_accept_w); spin_lock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { - struct net *c_net = read_pnet(&tc->conn->c_net); + struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); if (net != c_net || !tc->t_sock) continue; @@ -509,7 +514,7 @@ static void rds_tcp_kill_sock(struct net *net) sk = tc->t_sock->sk; sk->sk_prot->disconnect(sk, 0); tcp_done(sk); - rds_conn_destroy(tc->conn); + rds_conn_destroy(tc->t_cpath->cp_conn); } } @@ -547,12 +552,13 @@ static void rds_tcp_sysctl_reset(struct net *net) spin_lock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { - struct net *c_net = read_pnet(&tc->conn->c_net); + struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); if (net != c_net || !tc->t_sock) continue; - rds_conn_drop(tc->conn); /* reconnect with new parameters */ + /* reconnect with new parameters */ + rds_conn_path_drop(tc->t_cpath); } spin_unlock_irq(&rds_tcp_conn_lock); } diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 728abe22c9a3..e1ff16908c5e 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -11,11 +11,11 @@ struct rds_tcp_incoming { struct rds_tcp_connection { struct list_head t_tcp_node; - struct rds_connection *conn; - /* t_conn_lock synchronizes the connection establishment between + struct rds_conn_path *t_cpath; + /* t_conn_path_lock synchronizes the connection establishment between * rds_tcp_accept_one and rds_tcp_conn_connect */ - struct mutex t_conn_lock; + struct mutex t_conn_path_lock; struct socket *t_sock; void *t_orig_write_space; void *t_orig_data_ready; diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index aa65c1631c4b..146692c8afac 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -82,10 +82,10 @@ int rds_tcp_conn_connect(struct rds_connection *conn) int ret; struct rds_tcp_connection *tc = conn->c_transport_data; - mutex_lock(&tc->t_conn_lock); + mutex_lock(&tc->t_conn_path_lock); if (rds_conn_up(conn)) { - mutex_unlock(&tc->t_conn_lock); + mutex_unlock(&tc->t_conn_path_lock); return 0; } ret = sock_create_kern(rds_conn_net(conn), PF_INET, @@ -129,7 +129,7 @@ int rds_tcp_conn_connect(struct rds_connection *conn) } out: - mutex_unlock(&tc->t_conn_lock); + mutex_unlock(&tc->t_conn_path_lock); if (sock) sock_release(sock); return ret; diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index f9cc945a77b3..d8933469ab13 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -121,7 +121,7 @@ int rds_tcp_accept_one(struct socket *sock) */ rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data; rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING); - mutex_lock(&rs_tcp->t_conn_lock); + mutex_lock(&rs_tcp->t_conn_path_lock); conn_state = rds_conn_state(conn); if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP) goto rst_nsk; @@ -156,7 +156,7 @@ rst_nsk: ret = 0; out: if (rs_tcp) - mutex_unlock(&rs_tcp->t_conn_lock); + mutex_unlock(&rs_tcp->t_conn_path_lock); if (new_sock) sock_release(new_sock); return ret; -- cgit v1.2.3 From afb4164d91c7486a1d4ab098a1b88e27b5e25772 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 30 Jun 2016 16:11:13 -0700 Subject: RDS: TCP: Refactor connection destruction to handle multiple paths A single rds_connection may have multiple rds_conn_paths that have to be carefully and correctly destroyed, for both rmmod and netns-delete cases. For both cases, we extract a single rds_tcp_connection for each conn into a temporary list, and then invoke rds_conn_destroy() which iteratively dismantles every path in the rds_connection. For the netns deletion case, we additionally have to make sure that we do not leave a socket in TIME_WAIT state, as this will hold up the netns deletion. Thus we call rds_tcp_conn_paths_destroy() to reset state quickly. Acked-by: Santosh Shilimkar Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/tcp.c | 46 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/rds/tcp.c b/net/rds/tcp.c index c6b47f670990..b32772759c9d 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -323,6 +323,17 @@ static void rds_tcp_conn_free(void *arg) kmem_cache_free(rds_tcp_conn_slab, tc); } +static bool list_has_conn(struct list_head *list, struct rds_connection *conn) +{ + struct rds_tcp_connection *tc, *_tc; + + list_for_each_entry_safe(tc, _tc, list, t_tcp_node) { + if (tc->t_cpath->cp_conn == conn) + return true; + } + return false; +} + static void rds_tcp_destroy_conns(void) { struct rds_tcp_connection *tc, *_tc; @@ -330,8 +341,10 @@ static void rds_tcp_destroy_conns(void) /* avoid calling conn_destroy with irqs off */ spin_lock_irq(&rds_tcp_conn_lock); - list_splice(&rds_tcp_conn_list, &tmp_list); - INIT_LIST_HEAD(&rds_tcp_conn_list); + list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { + if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn)) + list_move_tail(&tc->t_tcp_node, &tmp_list); + } spin_unlock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) @@ -491,10 +504,30 @@ static struct pernet_operations rds_tcp_net_ops = { .size = sizeof(struct rds_tcp_net), }; +/* explicitly send a RST on each socket, thereby releasing any socket refcnts + * that may otherwise hold up netns deletion. + */ +static void rds_tcp_conn_paths_destroy(struct rds_connection *conn) +{ + struct rds_conn_path *cp; + struct rds_tcp_connection *tc; + int i; + struct sock *sk; + + for (i = 0; i < RDS_MPATH_WORKERS; i++) { + cp = &conn->c_path[i]; + tc = cp->cp_transport_data; + if (!tc->t_sock) + continue; + sk = tc->t_sock->sk; + sk->sk_prot->disconnect(sk, 0); + tcp_done(sk); + } +} + static void rds_tcp_kill_sock(struct net *net) { struct rds_tcp_connection *tc, *_tc; - struct sock *sk; LIST_HEAD(tmp_list); struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); @@ -507,13 +540,12 @@ static void rds_tcp_kill_sock(struct net *net) if (net != c_net || !tc->t_sock) continue; - list_move_tail(&tc->t_tcp_node, &tmp_list); + if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn)) + list_move_tail(&tc->t_tcp_node, &tmp_list); } spin_unlock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) { - sk = tc->t_sock->sk; - sk->sk_prot->disconnect(sk, 0); - tcp_done(sk); + rds_tcp_conn_paths_destroy(tc->t_cpath->cp_conn); rds_conn_destroy(tc->t_cpath->cp_conn); } } -- cgit v1.2.3 From ea3b1ea53930879c9847044f5cb9c97411cae797 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 30 Jun 2016 16:11:14 -0700 Subject: RDS: TCP: make ->sk_user_data point to a rds_conn_path The socket callbacks should all operate on a struct rds_conn_path, in preparation for a MP capable RDS-TCP. Acked-by: Santosh Shilimkar Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/tcp.c | 25 +++++++++++++------------ net/rds/tcp.h | 4 ++-- net/rds/tcp_connect.c | 16 ++++++++-------- net/rds/tcp_listen.c | 12 ++++++------ net/rds/tcp_recv.c | 12 ++++++------ net/rds/tcp_send.c | 12 ++++++------ 6 files changed, 41 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/rds/tcp.c b/net/rds/tcp.c index b32772759c9d..5658f3e9f601 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -136,9 +136,9 @@ void rds_tcp_restore_callbacks(struct socket *sock, * from being called while it isn't set. */ void rds_tcp_reset_callbacks(struct socket *sock, - struct rds_connection *conn) + struct rds_conn_path *cp) { - struct rds_tcp_connection *tc = conn->c_transport_data; + struct rds_tcp_connection *tc = cp->cp_transport_data; struct socket *osock = tc->t_sock; if (!osock) @@ -148,8 +148,8 @@ void rds_tcp_reset_callbacks(struct socket *sock, * We have an outstanding SYN to this peer, which may * potentially have transitioned to the RDS_CONN_UP state, * so we must quiesce any send threads before resetting - * c_transport_data. We quiesce these threads by setting - * c_state to something other than RDS_CONN_UP, and then + * cp_transport_data. We quiesce these threads by setting + * cp_state to something other than RDS_CONN_UP, and then * waiting for any existing threads in rds_send_xmit to * complete release_in_xmit(). (Subsequent threads entering * rds_send_xmit() will bail on !rds_conn_up(). @@ -164,8 +164,8 @@ void rds_tcp_reset_callbacks(struct socket *sock, * RDS_CONN_RESETTTING, to ensure that rds_tcp_state_change * cannot mark rds_conn_path_up() in the window before lock_sock() */ - atomic_set(&conn->c_state, RDS_CONN_RESETTING); - wait_event(conn->c_waitq, !test_bit(RDS_IN_XMIT, &conn->c_flags)); + atomic_set(&cp->cp_state, RDS_CONN_RESETTING); + wait_event(cp->cp_waitq, !test_bit(RDS_IN_XMIT, &cp->cp_flags)); lock_sock(osock->sk); /* reset receive side state for rds_tcp_data_recv() for osock */ if (tc->t_tinc) { @@ -186,11 +186,12 @@ void rds_tcp_reset_callbacks(struct socket *sock, release_sock(osock->sk); sock_release(osock); newsock: - rds_send_path_reset(&conn->c_path[0]); + rds_send_path_reset(cp); lock_sock(sock->sk); write_lock_bh(&sock->sk->sk_callback_lock); tc->t_sock = sock; - sock->sk->sk_user_data = conn; + tc->t_cpath = cp; + sock->sk->sk_user_data = cp; sock->sk->sk_data_ready = rds_tcp_data_ready; sock->sk->sk_write_space = rds_tcp_write_space; sock->sk->sk_state_change = rds_tcp_state_change; @@ -203,9 +204,9 @@ newsock: * above rds_tcp_reset_callbacks for notes about synchronization * with data path */ -void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn) +void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp) { - struct rds_tcp_connection *tc = conn->c_transport_data; + struct rds_tcp_connection *tc = cp->cp_transport_data; rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc); write_lock_bh(&sock->sk->sk_callback_lock); @@ -221,12 +222,12 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn) sock->sk->sk_data_ready = sock->sk->sk_user_data; tc->t_sock = sock; - tc->t_cpath = &conn->c_path[0]; + tc->t_cpath = cp; tc->t_orig_data_ready = sock->sk->sk_data_ready; tc->t_orig_write_space = sock->sk->sk_write_space; tc->t_orig_state_change = sock->sk->sk_state_change; - sock->sk->sk_user_data = conn; + sock->sk->sk_user_data = cp; sock->sk->sk_data_ready = rds_tcp_data_ready; sock->sk->sk_write_space = rds_tcp_write_space; sock->sk->sk_state_change = rds_tcp_state_change; diff --git a/net/rds/tcp.h b/net/rds/tcp.h index e1ff16908c5e..151b09d1af43 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -49,8 +49,8 @@ struct rds_tcp_statistics { /* tcp.c */ void rds_tcp_tune(struct socket *sock); void rds_tcp_nonagle(struct socket *sock); -void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn); -void rds_tcp_reset_callbacks(struct socket *sock, struct rds_connection *conn); +void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp); +void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp); void rds_tcp_restore_callbacks(struct socket *sock, struct rds_tcp_connection *tc); u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 146692c8afac..7eddce50e7a3 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -41,16 +41,16 @@ void rds_tcp_state_change(struct sock *sk) { void (*state_change)(struct sock *sk); - struct rds_connection *conn; + struct rds_conn_path *cp; struct rds_tcp_connection *tc; read_lock_bh(&sk->sk_callback_lock); - conn = sk->sk_user_data; - if (!conn) { + cp = sk->sk_user_data; + if (!cp) { state_change = sk->sk_state_change; goto out; } - tc = conn->c_transport_data; + tc = cp->cp_transport_data; state_change = tc->t_orig_state_change; rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state); @@ -61,12 +61,11 @@ void rds_tcp_state_change(struct sock *sk) case TCP_SYN_RECV: break; case TCP_ESTABLISHED: - rds_connect_path_complete(&conn->c_path[0], - RDS_CONN_CONNECTING); + rds_connect_path_complete(cp, RDS_CONN_CONNECTING); break; case TCP_CLOSE_WAIT: case TCP_CLOSE: - rds_conn_drop(conn); + rds_conn_path_drop(cp); default: break; } @@ -81,6 +80,7 @@ int rds_tcp_conn_connect(struct rds_connection *conn) struct sockaddr_in src, dest; int ret; struct rds_tcp_connection *tc = conn->c_transport_data; + struct rds_conn_path *cp = &conn->c_path[0]; mutex_lock(&tc->t_conn_path_lock); @@ -114,7 +114,7 @@ int rds_tcp_conn_connect(struct rds_connection *conn) * once we call connect() we can start getting callbacks and they * own the socket */ - rds_tcp_set_callbacks(sock, conn); + rds_tcp_set_callbacks(sock, cp); ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest), O_NONBLOCK); diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index d8933469ab13..ca975a217a49 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -79,6 +79,7 @@ int rds_tcp_accept_one(struct socket *sock) struct inet_sock *inet; struct rds_tcp_connection *rs_tcp = NULL; int conn_state; + struct rds_conn_path *cp; if (!sock) /* module unload or netns delete in progress */ return -ENETUNREACH; @@ -120,6 +121,7 @@ int rds_tcp_accept_one(struct socket *sock) * rds_tcp_state_change() will do that cleanup */ rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data; + cp = &conn->c_path[0]; rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING); mutex_lock(&rs_tcp->t_conn_path_lock); conn_state = rds_conn_state(conn); @@ -136,16 +138,14 @@ int rds_tcp_accept_one(struct socket *sock) !conn->c_path[0].cp_outgoing) { goto rst_nsk; } else { - rds_tcp_reset_callbacks(new_sock, conn); + rds_tcp_reset_callbacks(new_sock, cp); conn->c_path[0].cp_outgoing = 0; /* rds_connect_path_complete() marks RDS_CONN_UP */ - rds_connect_path_complete(&conn->c_path[0], - RDS_CONN_RESETTING); + rds_connect_path_complete(cp, RDS_CONN_RESETTING); } } else { - rds_tcp_set_callbacks(new_sock, conn); - rds_connect_path_complete(&conn->c_path[0], - RDS_CONN_CONNECTING); + rds_tcp_set_callbacks(new_sock, cp); + rds_connect_path_complete(cp, RDS_CONN_CONNECTING); } new_sock = NULL; ret = 0; diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index 4a87d9ef3084..aa7a79a00ef7 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -297,24 +297,24 @@ int rds_tcp_recv(struct rds_connection *conn) void rds_tcp_data_ready(struct sock *sk) { void (*ready)(struct sock *sk); - struct rds_connection *conn; + struct rds_conn_path *cp; struct rds_tcp_connection *tc; rdsdebug("data ready sk %p\n", sk); read_lock_bh(&sk->sk_callback_lock); - conn = sk->sk_user_data; - if (!conn) { /* check for teardown race */ + cp = sk->sk_user_data; + if (!cp) { /* check for teardown race */ ready = sk->sk_data_ready; goto out; } - tc = conn->c_transport_data; + tc = cp->cp_transport_data; ready = tc->t_orig_data_ready; rds_tcp_stats_inc(s_tcp_data_ready_calls); - if (rds_tcp_read_sock(conn, GFP_ATOMIC) == -ENOMEM) - queue_delayed_work(rds_wq, &conn->c_recv_w, 0); + if (rds_tcp_read_sock(cp->cp_conn, GFP_ATOMIC) == -ENOMEM) + queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); out: read_unlock_bh(&sk->sk_callback_lock); ready(sk); diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 52cda947457b..57e0f5826406 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -178,27 +178,27 @@ static int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack) void rds_tcp_write_space(struct sock *sk) { void (*write_space)(struct sock *sk); - struct rds_connection *conn; + struct rds_conn_path *cp; struct rds_tcp_connection *tc; read_lock_bh(&sk->sk_callback_lock); - conn = sk->sk_user_data; - if (!conn) { + cp = sk->sk_user_data; + if (!cp) { write_space = sk->sk_write_space; goto out; } - tc = conn->c_transport_data; + tc = cp->cp_transport_data; rdsdebug("write_space for tc %p\n", tc); write_space = tc->t_orig_write_space; rds_tcp_stats_inc(s_tcp_write_space_calls); rdsdebug("tcp una %u\n", rds_tcp_snd_una(tc)); tc->t_last_seen_una = rds_tcp_snd_una(tc); - rds_send_drop_acked(conn, rds_tcp_snd_una(tc), rds_tcp_is_acked); + rds_send_path_drop_acked(cp, rds_tcp_snd_una(tc), rds_tcp_is_acked); if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) - queue_delayed_work(rds_wq, &conn->c_send_w, 0); + queue_delayed_work(rds_wq, &cp->cp_send_w, 0); out: read_unlock_bh(&sk->sk_callback_lock); -- cgit v1.2.3 From 2da43c4a1b517d02e71d9611a2242273e7d399ba Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 30 Jun 2016 16:11:15 -0700 Subject: RDS: TCP: make receive path use the rds_conn_path The ->sk_user_data contains a pointer to the rds_conn_path for the socket. Use this consistently in the rds_tcp_data_ready callbacks to get the rds_conn_path for rds_recv_incoming. Acked-by: Santosh Shilimkar Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/ib.c | 2 +- net/rds/ib.h | 2 +- net/rds/ib_recv.c | 3 ++- net/rds/loop.c | 4 ++-- net/rds/rds.h | 2 +- net/rds/tcp.c | 2 +- net/rds/tcp.h | 2 +- net/rds/tcp_recv.c | 29 ++++++++++++++++------------- net/rds/threads.c | 2 +- 9 files changed, 26 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/rds/ib.c b/net/rds/ib.c index 1b29ec9445fc..e6ba85671004 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -385,7 +385,7 @@ struct rds_transport rds_ib_transport = { .xmit = rds_ib_xmit, .xmit_rdma = rds_ib_xmit_rdma, .xmit_atomic = rds_ib_xmit_atomic, - .recv = rds_ib_recv, + .recv_path = rds_ib_recv_path, .conn_alloc = rds_ib_conn_alloc, .conn_free = rds_ib_conn_free, .conn_connect = rds_ib_conn_connect, diff --git a/net/rds/ib.h b/net/rds/ib.h index 2051f4bd7a66..579de7e6369c 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -354,7 +354,7 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc); /* ib_recv.c */ int rds_ib_recv_init(void); void rds_ib_recv_exit(void); -int rds_ib_recv(struct rds_connection *conn); +int rds_ib_recv_path(struct rds_conn_path *conn); int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic); void rds_ib_recv_free_caches(struct rds_ib_connection *ic); void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp); diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 4ea8cb17cc7a..606a11f681d2 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -1009,8 +1009,9 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, rds_ib_recv_refill(conn, 0, GFP_NOWAIT); } -int rds_ib_recv(struct rds_connection *conn) +int rds_ib_recv_path(struct rds_conn_path *cp) { + struct rds_connection *conn = cp->cp_conn; struct rds_ib_connection *ic = conn->c_transport_data; int ret = 0; diff --git a/net/rds/loop.c b/net/rds/loop.c index 318c21d7d8d4..20284a4dca91 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -102,7 +102,7 @@ static void rds_loop_inc_free(struct rds_incoming *inc) } /* we need to at least give the thread something to succeed */ -static int rds_loop_recv(struct rds_connection *conn) +static int rds_loop_recv_path(struct rds_conn_path *cp) { return 0; } @@ -185,7 +185,7 @@ void rds_loop_exit(void) */ struct rds_transport rds_loop_transport = { .xmit = rds_loop_xmit, - .recv = rds_loop_recv, + .recv_path = rds_loop_recv_path, .conn_alloc = rds_loop_conn_alloc, .conn_free = rds_loop_conn_free, .conn_connect = rds_loop_conn_connect, diff --git a/net/rds/rds.h b/net/rds/rds.h index 5bbad08262f5..0faca3011370 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -462,7 +462,7 @@ struct rds_transport { unsigned int hdr_off, unsigned int sg, unsigned int off); int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op); int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op); - int (*recv)(struct rds_connection *conn); + int (*recv_path)(struct rds_conn_path *cp); int (*inc_copy_to_user)(struct rds_incoming *inc, struct iov_iter *to); void (*inc_free)(struct rds_incoming *inc); diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 5658f3e9f601..7bc136c66dbe 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -359,7 +359,7 @@ struct rds_transport rds_tcp_transport = { .xmit_path_prepare = rds_tcp_xmit_path_prepare, .xmit_path_complete = rds_tcp_xmit_path_complete, .xmit = rds_tcp_xmit, - .recv = rds_tcp_recv, + .recv_path = rds_tcp_recv_path, .conn_alloc = rds_tcp_conn_alloc, .conn_free = rds_tcp_conn_free, .conn_connect = rds_tcp_conn_connect, diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 151b09d1af43..5a5f91abe1de 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -75,7 +75,7 @@ int rds_tcp_keepalive(struct socket *sock); int rds_tcp_recv_init(void); void rds_tcp_recv_exit(void); void rds_tcp_data_ready(struct sock *sk); -int rds_tcp_recv(struct rds_connection *conn); +int rds_tcp_recv_path(struct rds_conn_path *cp); void rds_tcp_inc_free(struct rds_incoming *inc); int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to); diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index aa7a79a00ef7..ad4892e97f91 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -34,7 +34,6 @@ #include #include -#include "rds_single_path.h" #include "rds.h" #include "tcp.h" @@ -148,7 +147,7 @@ static void rds_tcp_cong_recv(struct rds_connection *conn, } struct rds_tcp_desc_arg { - struct rds_connection *conn; + struct rds_conn_path *conn_path; gfp_t gfp; }; @@ -156,8 +155,8 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, unsigned int offset, size_t len) { struct rds_tcp_desc_arg *arg = desc->arg.data; - struct rds_connection *conn = arg->conn; - struct rds_tcp_connection *tc = conn->c_transport_data; + struct rds_conn_path *cp = arg->conn_path; + struct rds_tcp_connection *tc = cp->cp_transport_data; struct rds_tcp_incoming *tinc = tc->t_tinc; struct sk_buff *clone; size_t left = len, to_copy; @@ -179,7 +178,8 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, } tc->t_tinc = tinc; rdsdebug("alloced tinc %p\n", tinc); - rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr); + rds_inc_path_init(&tinc->ti_inc, cp, + cp->cp_conn->c_faddr); /* * XXX * we might be able to use the __ variants when * we've already serialized at a higher level. @@ -229,6 +229,8 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, } if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) { + struct rds_connection *conn = cp->cp_conn; + if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) rds_tcp_cong_recv(conn, tinc); else @@ -251,15 +253,15 @@ out: } /* the caller has to hold the sock lock */ -static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp) +static int rds_tcp_read_sock(struct rds_conn_path *cp, gfp_t gfp) { - struct rds_tcp_connection *tc = conn->c_transport_data; + struct rds_tcp_connection *tc = cp->cp_transport_data; struct socket *sock = tc->t_sock; read_descriptor_t desc; struct rds_tcp_desc_arg arg; /* It's like glib in the kernel! */ - arg.conn = conn; + arg.conn_path = cp; arg.gfp = gfp; desc.arg.data = &arg; desc.error = 0; @@ -279,16 +281,17 @@ static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp) * if we fail to allocate we're in trouble.. blindly wait some time before * trying again to see if the VM can free up something for us. */ -int rds_tcp_recv(struct rds_connection *conn) +int rds_tcp_recv_path(struct rds_conn_path *cp) { - struct rds_tcp_connection *tc = conn->c_transport_data; + struct rds_tcp_connection *tc = cp->cp_transport_data; struct socket *sock = tc->t_sock; int ret = 0; - rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock); + rdsdebug("recv worker path [%d] tc %p sock %p\n", + cp->cp_index, tc, sock); lock_sock(sock->sk); - ret = rds_tcp_read_sock(conn, GFP_KERNEL); + ret = rds_tcp_read_sock(cp, GFP_KERNEL); release_sock(sock->sk); return ret; @@ -313,7 +316,7 @@ void rds_tcp_data_ready(struct sock *sk) ready = tc->t_orig_data_ready; rds_tcp_stats_inc(s_tcp_data_ready_calls); - if (rds_tcp_read_sock(cp->cp_conn, GFP_ATOMIC) == -ENOMEM) + if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM) queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); out: read_unlock_bh(&sk->sk_callback_lock); diff --git a/net/rds/threads.c b/net/rds/threads.c index 9fbe95bb14a9..f717b69e03f9 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -203,7 +203,7 @@ void rds_recv_worker(struct work_struct *work) int ret; if (rds_conn_path_state(cp) == RDS_CONN_UP) { - ret = cp->cp_conn->c_trans->recv(cp->cp_conn); + ret = cp->cp_conn->c_trans->recv_path(cp); rdsdebug("conn %p ret %d\n", cp->cp_conn, ret); switch (ret) { case -EAGAIN: -- cgit v1.2.3 From b04e8554f7637999af8f54cca4dcfcf49f2ae7c8 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 30 Jun 2016 16:11:16 -0700 Subject: RDS: TCP: Hooks to set up a single connection path This patch adds ->conn_path_connect callbacks in the rds_transport that are used to set up a single connection path. Acked-by: Santosh Shilimkar Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/ib.c | 2 +- net/rds/ib.h | 2 +- net/rds/ib_cm.c | 3 ++- net/rds/loop.c | 6 +++--- net/rds/rds.h | 2 +- net/rds/tcp.c | 2 +- net/rds/tcp.h | 4 ++-- net/rds/tcp_connect.c | 11 ++++++----- net/rds/threads.c | 5 +++-- 9 files changed, 20 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/rds/ib.c b/net/rds/ib.c index e6ba85671004..7eaf887e46f8 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -388,7 +388,7 @@ struct rds_transport rds_ib_transport = { .recv_path = rds_ib_recv_path, .conn_alloc = rds_ib_conn_alloc, .conn_free = rds_ib_conn_free, - .conn_connect = rds_ib_conn_connect, + .conn_path_connect = rds_ib_conn_path_connect, .conn_path_shutdown = rds_ib_conn_path_shutdown, .inc_copy_to_user = rds_ib_inc_copy_to_user, .inc_free = rds_ib_inc_free, diff --git a/net/rds/ib.h b/net/rds/ib.h index 579de7e6369c..046f7508c06b 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -328,7 +328,7 @@ extern struct list_head ib_nodev_conns; /* ib_cm.c */ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp); void rds_ib_conn_free(void *arg); -int rds_ib_conn_connect(struct rds_connection *conn); +int rds_ib_conn_path_connect(struct rds_conn_path *cp); void rds_ib_conn_path_shutdown(struct rds_conn_path *cp); void rds_ib_state_change(struct sock *sk); int rds_ib_listen_init(void); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index e34ea0b5c16a..5b2ab95afa07 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -685,8 +685,9 @@ out: return ret; } -int rds_ib_conn_connect(struct rds_connection *conn) +int rds_ib_conn_path_connect(struct rds_conn_path *cp) { + struct rds_connection *conn = cp->cp_conn; struct rds_ib_connection *ic = conn->c_transport_data; struct sockaddr_in src, dest; int ret; diff --git a/net/rds/loop.c b/net/rds/loop.c index 20284a4dca91..f2bf78de5688 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -150,9 +150,9 @@ static void rds_loop_conn_free(void *arg) kfree(lc); } -static int rds_loop_conn_connect(struct rds_connection *conn) +static int rds_loop_conn_path_connect(struct rds_conn_path *cp) { - rds_connect_complete(conn); + rds_connect_complete(cp->cp_conn); return 0; } @@ -188,7 +188,7 @@ struct rds_transport rds_loop_transport = { .recv_path = rds_loop_recv_path, .conn_alloc = rds_loop_conn_alloc, .conn_free = rds_loop_conn_free, - .conn_connect = rds_loop_conn_connect, + .conn_path_connect = rds_loop_conn_path_connect, .conn_path_shutdown = rds_loop_conn_path_shutdown, .inc_copy_to_user = rds_message_inc_copy_to_user, .inc_free = rds_loop_inc_free, diff --git a/net/rds/rds.h b/net/rds/rds.h index 0faca3011370..6ef07bd27227 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -454,7 +454,7 @@ struct rds_transport { int (*laddr_check)(struct net *net, __be32 addr); int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); void (*conn_free)(void *data); - int (*conn_connect)(struct rds_connection *conn); + int (*conn_path_connect)(struct rds_conn_path *cp); void (*conn_path_shutdown)(struct rds_conn_path *conn); void (*xmit_path_prepare)(struct rds_conn_path *cp); void (*xmit_path_complete)(struct rds_conn_path *cp); diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 7bc136c66dbe..d278432f080b 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -362,7 +362,7 @@ struct rds_transport rds_tcp_transport = { .recv_path = rds_tcp_recv_path, .conn_alloc = rds_tcp_conn_alloc, .conn_free = rds_tcp_conn_free, - .conn_connect = rds_tcp_conn_connect, + .conn_path_connect = rds_tcp_conn_path_connect, .conn_path_shutdown = rds_tcp_conn_path_shutdown, .inc_copy_to_user = rds_tcp_inc_copy_to_user, .inc_free = rds_tcp_inc_free, diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 5a5f91abe1de..1c3160faa963 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -13,7 +13,7 @@ struct rds_tcp_connection { struct list_head t_tcp_node; struct rds_conn_path *t_cpath; /* t_conn_path_lock synchronizes the connection establishment between - * rds_tcp_accept_one and rds_tcp_conn_connect + * rds_tcp_accept_one and rds_tcp_conn_path_connect */ struct mutex t_conn_path_lock; struct socket *t_sock; @@ -60,7 +60,7 @@ extern struct rds_transport rds_tcp_transport; void rds_tcp_accept_work(struct sock *sk); /* tcp_connect.c */ -int rds_tcp_conn_connect(struct rds_connection *conn); +int rds_tcp_conn_path_connect(struct rds_conn_path *cp); void rds_tcp_conn_path_shutdown(struct rds_conn_path *conn); void rds_tcp_state_change(struct sock *sk); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 7eddce50e7a3..c916715fbe61 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -74,17 +74,17 @@ out: state_change(sk); } -int rds_tcp_conn_connect(struct rds_connection *conn) +int rds_tcp_conn_path_connect(struct rds_conn_path *cp) { struct socket *sock = NULL; struct sockaddr_in src, dest; int ret; - struct rds_tcp_connection *tc = conn->c_transport_data; - struct rds_conn_path *cp = &conn->c_path[0]; + struct rds_connection *conn = cp->cp_conn; + struct rds_tcp_connection *tc = cp->cp_transport_data; mutex_lock(&tc->t_conn_path_lock); - if (rds_conn_up(conn)) { + if (rds_conn_path_up(cp)) { mutex_unlock(&tc->t_conn_path_lock); return 0; } @@ -118,6 +118,7 @@ int rds_tcp_conn_connect(struct rds_connection *conn) ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest), O_NONBLOCK); + cp->cp_outgoing = 1; rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret); if (ret == -EINPROGRESS) ret = 0; @@ -125,7 +126,7 @@ int rds_tcp_conn_connect(struct rds_connection *conn) rds_tcp_keepalive(sock); sock = NULL; } else { - rds_tcp_restore_callbacks(sock, conn->c_transport_data); + rds_tcp_restore_callbacks(sock, cp->cp_transport_data); } out: diff --git a/net/rds/threads.c b/net/rds/threads.c index f717b69e03f9..e8f0941f0548 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -152,8 +152,9 @@ void rds_connect_worker(struct work_struct *work) int ret; clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); - if (rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { - ret = conn->c_trans->conn_connect(conn); + ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING); + if (ret) { + ret = conn->c_trans->conn_path_connect(cp); rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n", conn, &conn->c_laddr, &conn->c_faddr, ret); -- cgit v1.2.3 From 8315011ad67670691545ed394968435f0a0bb29e Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 30 Jun 2016 16:11:17 -0700 Subject: RDS: TCP: Simplify reconnect to avoid duelling reconnnect attempts When reconnecting, the peer with the smaller IP address will initiate the reconnect, to avoid needless duelling SYN issues. Acked-by: Santosh Shilimkar Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/connection.c | 4 +--- net/rds/threads.c | 5 +++++ 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 1b0c2a783b5e..19a4fee5f4dd 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -355,9 +355,7 @@ void rds_conn_shutdown(struct rds_conn_path *cp) rcu_read_lock(); if (!hlist_unhashed(&conn->c_hash_node)) { rcu_read_unlock(); - if (conn->c_trans->t_type != RDS_TRANS_TCP || - cp->cp_outgoing == 1) - rds_queue_reconnect(cp); + rds_queue_reconnect(cp); } else { rcu_read_unlock(); } diff --git a/net/rds/threads.c b/net/rds/threads.c index e8f0941f0548..bc97d67f29cc 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -125,6 +125,11 @@ void rds_queue_reconnect(struct rds_conn_path *cp) conn, &conn->c_laddr, &conn->c_faddr, cp->cp_reconnect_jiffies); + /* let peer with smaller addr initiate reconnect, to avoid duels */ + if (conn->c_trans->t_type == RDS_TRANS_TCP && + conn->c_laddr > conn->c_faddr) + return; + set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); if (cp->cp_reconnect_jiffies == 0) { cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies; -- cgit v1.2.3 From 11bb62f7c05240a933dd2e6b3bf3871d99464524 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 30 Jun 2016 16:11:18 -0700 Subject: RDS: Do not send a pong to an incoming ping with 0 src port RDS ping messages are sent with a non-zero src port to a zero dst port, so that the rds pong messages can be sent back to the originators src port. However if a confused/malicious sender sends a ping with a 0 src port, we'd have an infinite ping-pong loop. To avoid this, the receiver should ignore ping messages with a 0 src port. Acked-by: Santosh Shilimkar Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/recv.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/rds/recv.c b/net/rds/recv.c index b58f50571782..fed53a6c2890 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -226,6 +226,10 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, cp->cp_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1; if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) { + if (inc->i_hdr.h_sport == 0) { + rdsdebug("ignore ping with 0 sport from 0x%x\n", saddr); + goto out; + } rds_stats_inc(s_recv_ping); rds_send_pong(cp, inc->i_hdr.h_sport); goto out; -- cgit v1.2.3 From 08f4b5918b2d6b491f0403cc1886f5cdccef89bb Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Fri, 1 Jul 2016 14:51:01 +0300 Subject: net/devlink: Add E-Switch mode control Add the commands to set and show the mode of SRIOV E-Switch, two modes are supported: * legacy: operating in the "old" L2 based mode (DMAC --> VF vport) * switchdev: the E-Switch is referred to as whitebox switch configured using standard tools such as tc, bridge, openvswitch etc. To allow working with the tools, for each VF, a VF representor netdevice is created by the E-Switch manager vendor device driver instance (e.g PF). Signed-off-by: Or Gerlitz Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- net/core/devlink.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 933e8d4d3968..b2e592a198c0 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1394,6 +1394,78 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb, return -EOPNOTSUPP; } +static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, + enum devlink_command cmd, u32 portid, + u32 seq, int flags, u16 mode) +{ + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + const struct devlink_ops *ops = devlink->ops; + struct sk_buff *msg; + u16 mode; + int err; + + if (!ops || !ops->eswitch_mode_get) + return -EOPNOTSUPP; + + err = ops->eswitch_mode_get(devlink, &mode); + if (err) + return err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_MODE_GET, + info->snd_portid, info->snd_seq, 0, mode); + + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_eswitch_mode_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + const struct devlink_ops *ops = devlink->ops; + u16 mode; + + if (!info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) + return -EINVAL; + + mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); + + if (ops && ops->eswitch_mode_set) + return ops->eswitch_mode_set(devlink, mode); + return -EOPNOTSUPP; +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -1407,6 +1479,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = { .type = NLA_U8 }, [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 }, [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 }, + [DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -1525,6 +1598,20 @@ static const struct genl_ops devlink_nl_ops[] = { DEVLINK_NL_FLAG_NEED_SB | DEVLINK_NL_FLAG_LOCK_PORTS, }, + { + .cmd = DEVLINK_CMD_ESWITCH_MODE_GET, + .doit = devlink_nl_cmd_eswitch_mode_get_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_ESWITCH_MODE_SET, + .doit = devlink_nl_cmd_eswitch_mode_set_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; /** -- cgit v1.2.3 From c37a2dfa67f7920b14ea77dc9f9f9660f7a1f6dd Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 24 Jun 2016 13:25:22 -0700 Subject: netfilter: Convert FWINV<[foo]> macros and uses to NF_INVF netfilter uses multiple FWINV #defines with identical form that hide a specific structure variable and dereference it with a invflags member. $ git grep "#define FWINV" include/linux/netfilter_bridge/ebtables.h:#define FWINV(bool,invflg) ((bool) ^ !!(info->invflags & invflg)) net/bridge/netfilter/ebtables.c:#define FWINV2(bool, invflg) ((bool) ^ !!(e->invflags & invflg)) net/ipv4/netfilter/arp_tables.c:#define FWINV(bool, invflg) ((bool) ^ !!(arpinfo->invflags & (invflg))) net/ipv4/netfilter/ip_tables.c:#define FWINV(bool, invflg) ((bool) ^ !!(ipinfo->invflags & (invflg))) net/ipv6/netfilter/ip6_tables.c:#define FWINV(bool, invflg) ((bool) ^ !!(ip6info->invflags & (invflg))) net/netfilter/xt_tcpudp.c:#define FWINVTCP(bool, invflg) ((bool) ^ !!(tcpinfo->invflags & (invflg))) Consolidate these macros into a single NF_INVF macro. Miscellanea: o Neaten the alignment around these uses o A few lines are > 80 columns for intelligibility Signed-off-by: Joe Perches Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebt_802_3.c | 6 ++--- net/bridge/netfilter/ebt_arp.c | 38 +++++++++++++++-------------- net/bridge/netfilter/ebt_ip.c | 28 +++++++++++----------- net/bridge/netfilter/ebt_ip6.c | 41 +++++++++++++++++-------------- net/bridge/netfilter/ebt_stp.c | 52 +++++++++++++++++++++------------------- net/bridge/netfilter/ebtables.c | 27 +++++++++++---------- net/ipv4/netfilter/arp_tables.c | 41 ++++++++++++++++--------------- net/ipv4/netfilter/ip_tables.c | 20 +++++++--------- net/ipv6/netfilter/ip6_tables.c | 16 ++++++------- net/netfilter/xt_tcpudp.c | 7 ++---- 10 files changed, 140 insertions(+), 136 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebt_802_3.c b/net/bridge/netfilter/ebt_802_3.c index 2a449b7ab8fa..5fc4affd9fdb 100644 --- a/net/bridge/netfilter/ebt_802_3.c +++ b/net/bridge/netfilter/ebt_802_3.c @@ -20,16 +20,16 @@ ebt_802_3_mt(const struct sk_buff *skb, struct xt_action_param *par) __be16 type = hdr->llc.ui.ctrl & IS_UI ? hdr->llc.ui.type : hdr->llc.ni.type; if (info->bitmask & EBT_802_3_SAP) { - if (FWINV(info->sap != hdr->llc.ui.ssap, EBT_802_3_SAP)) + if (NF_INVF(info, EBT_802_3_SAP, info->sap != hdr->llc.ui.ssap)) return false; - if (FWINV(info->sap != hdr->llc.ui.dsap, EBT_802_3_SAP)) + if (NF_INVF(info, EBT_802_3_SAP, info->sap != hdr->llc.ui.dsap)) return false; } if (info->bitmask & EBT_802_3_TYPE) { if (!(hdr->llc.ui.dsap == CHECK_TYPE && hdr->llc.ui.ssap == CHECK_TYPE)) return false; - if (FWINV(info->type != type, EBT_802_3_TYPE)) + if (NF_INVF(info, EBT_802_3_TYPE, info->type != type)) return false; } diff --git a/net/bridge/netfilter/ebt_arp.c b/net/bridge/netfilter/ebt_arp.c index cca0a899ee15..227142282b45 100644 --- a/net/bridge/netfilter/ebt_arp.c +++ b/net/bridge/netfilter/ebt_arp.c @@ -25,14 +25,14 @@ ebt_arp_mt(const struct sk_buff *skb, struct xt_action_param *par) ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); if (ah == NULL) return false; - if (info->bitmask & EBT_ARP_OPCODE && FWINV(info->opcode != - ah->ar_op, EBT_ARP_OPCODE)) + if ((info->bitmask & EBT_ARP_OPCODE) && + NF_INVF(info, EBT_ARP_OPCODE, info->opcode != ah->ar_op)) return false; - if (info->bitmask & EBT_ARP_HTYPE && FWINV(info->htype != - ah->ar_hrd, EBT_ARP_HTYPE)) + if ((info->bitmask & EBT_ARP_HTYPE) && + NF_INVF(info, EBT_ARP_HTYPE, info->htype != ah->ar_hrd)) return false; - if (info->bitmask & EBT_ARP_PTYPE && FWINV(info->ptype != - ah->ar_pro, EBT_ARP_PTYPE)) + if ((info->bitmask & EBT_ARP_PTYPE) && + NF_INVF(info, EBT_ARP_PTYPE, info->ptype != ah->ar_pro)) return false; if (info->bitmask & (EBT_ARP_SRC_IP | EBT_ARP_DST_IP | EBT_ARP_GRAT)) { @@ -51,14 +51,16 @@ ebt_arp_mt(const struct sk_buff *skb, struct xt_action_param *par) sizeof(daddr), &daddr); if (dap == NULL) return false; - if (info->bitmask & EBT_ARP_SRC_IP && - FWINV(info->saddr != (*sap & info->smsk), EBT_ARP_SRC_IP)) + if ((info->bitmask & EBT_ARP_SRC_IP) && + NF_INVF(info, EBT_ARP_SRC_IP, + info->saddr != (*sap & info->smsk))) return false; - if (info->bitmask & EBT_ARP_DST_IP && - FWINV(info->daddr != (*dap & info->dmsk), EBT_ARP_DST_IP)) + if ((info->bitmask & EBT_ARP_DST_IP) && + NF_INVF(info, EBT_ARP_DST_IP, + info->daddr != (*dap & info->dmsk))) return false; - if (info->bitmask & EBT_ARP_GRAT && - FWINV(*dap != *sap, EBT_ARP_GRAT)) + if ((info->bitmask & EBT_ARP_GRAT) && + NF_INVF(info, EBT_ARP_GRAT, *dap != *sap)) return false; } @@ -73,9 +75,9 @@ ebt_arp_mt(const struct sk_buff *skb, struct xt_action_param *par) sizeof(_mac), &_mac); if (mp == NULL) return false; - if (FWINV(!ether_addr_equal_masked(mp, info->smaddr, - info->smmsk), - EBT_ARP_SRC_MAC)) + if (NF_INVF(info, EBT_ARP_SRC_MAC, + !ether_addr_equal_masked(mp, info->smaddr, + info->smmsk))) return false; } @@ -85,9 +87,9 @@ ebt_arp_mt(const struct sk_buff *skb, struct xt_action_param *par) sizeof(_mac), &_mac); if (mp == NULL) return false; - if (FWINV(!ether_addr_equal_masked(mp, info->dmaddr, - info->dmmsk), - EBT_ARP_DST_MAC)) + if (NF_INVF(info, EBT_ARP_DST_MAC, + !ether_addr_equal_masked(mp, info->dmaddr, + info->dmmsk))) return false; } } diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c index 23bca62d58d2..d06968bdf5ec 100644 --- a/net/bridge/netfilter/ebt_ip.c +++ b/net/bridge/netfilter/ebt_ip.c @@ -36,19 +36,19 @@ ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par) ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); if (ih == NULL) return false; - if (info->bitmask & EBT_IP_TOS && - FWINV(info->tos != ih->tos, EBT_IP_TOS)) + if ((info->bitmask & EBT_IP_TOS) && + NF_INVF(info, EBT_IP_TOS, info->tos != ih->tos)) return false; - if (info->bitmask & EBT_IP_SOURCE && - FWINV((ih->saddr & info->smsk) != - info->saddr, EBT_IP_SOURCE)) + if ((info->bitmask & EBT_IP_SOURCE) && + NF_INVF(info, EBT_IP_SOURCE, + (ih->saddr & info->smsk) != info->saddr)) return false; if ((info->bitmask & EBT_IP_DEST) && - FWINV((ih->daddr & info->dmsk) != - info->daddr, EBT_IP_DEST)) + NF_INVF(info, EBT_IP_DEST, + (ih->daddr & info->dmsk) != info->daddr)) return false; if (info->bitmask & EBT_IP_PROTO) { - if (FWINV(info->protocol != ih->protocol, EBT_IP_PROTO)) + if (NF_INVF(info, EBT_IP_PROTO, info->protocol != ih->protocol)) return false; if (!(info->bitmask & EBT_IP_DPORT) && !(info->bitmask & EBT_IP_SPORT)) @@ -61,16 +61,16 @@ ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par) return false; if (info->bitmask & EBT_IP_DPORT) { u32 dst = ntohs(pptr->dst); - if (FWINV(dst < info->dport[0] || - dst > info->dport[1], - EBT_IP_DPORT)) + if (NF_INVF(info, EBT_IP_DPORT, + dst < info->dport[0] || + dst > info->dport[1])) return false; } if (info->bitmask & EBT_IP_SPORT) { u32 src = ntohs(pptr->src); - if (FWINV(src < info->sport[0] || - src > info->sport[1], - EBT_IP_SPORT)) + if (NF_INVF(info, EBT_IP_SPORT, + src < info->sport[0] || + src > info->sport[1])) return false; } } diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c index 98de6e7fd86d..4617491be41e 100644 --- a/net/bridge/netfilter/ebt_ip6.c +++ b/net/bridge/netfilter/ebt_ip6.c @@ -45,15 +45,18 @@ ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par) ih6 = skb_header_pointer(skb, 0, sizeof(_ip6h), &_ip6h); if (ih6 == NULL) return false; - if (info->bitmask & EBT_IP6_TCLASS && - FWINV(info->tclass != ipv6_get_dsfield(ih6), EBT_IP6_TCLASS)) + if ((info->bitmask & EBT_IP6_TCLASS) && + NF_INVF(info, EBT_IP6_TCLASS, + info->tclass != ipv6_get_dsfield(ih6))) return false; - if ((info->bitmask & EBT_IP6_SOURCE && - FWINV(ipv6_masked_addr_cmp(&ih6->saddr, &info->smsk, - &info->saddr), EBT_IP6_SOURCE)) || - (info->bitmask & EBT_IP6_DEST && - FWINV(ipv6_masked_addr_cmp(&ih6->daddr, &info->dmsk, - &info->daddr), EBT_IP6_DEST))) + if (((info->bitmask & EBT_IP6_SOURCE) && + NF_INVF(info, EBT_IP6_SOURCE, + ipv6_masked_addr_cmp(&ih6->saddr, &info->smsk, + &info->saddr))) || + ((info->bitmask & EBT_IP6_DEST) && + NF_INVF(info, EBT_IP6_DEST, + ipv6_masked_addr_cmp(&ih6->daddr, &info->dmsk, + &info->daddr)))) return false; if (info->bitmask & EBT_IP6_PROTO) { uint8_t nexthdr = ih6->nexthdr; @@ -63,7 +66,7 @@ ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par) offset_ph = ipv6_skip_exthdr(skb, sizeof(_ip6h), &nexthdr, &frag_off); if (offset_ph == -1) return false; - if (FWINV(info->protocol != nexthdr, EBT_IP6_PROTO)) + if (NF_INVF(info, EBT_IP6_PROTO, info->protocol != nexthdr)) return false; if (!(info->bitmask & (EBT_IP6_DPORT | EBT_IP6_SPORT | EBT_IP6_ICMP6))) @@ -76,22 +79,24 @@ ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par) return false; if (info->bitmask & EBT_IP6_DPORT) { u16 dst = ntohs(pptr->tcpudphdr.dst); - if (FWINV(dst < info->dport[0] || - dst > info->dport[1], EBT_IP6_DPORT)) + if (NF_INVF(info, EBT_IP6_DPORT, + dst < info->dport[0] || + dst > info->dport[1])) return false; } if (info->bitmask & EBT_IP6_SPORT) { u16 src = ntohs(pptr->tcpudphdr.src); - if (FWINV(src < info->sport[0] || - src > info->sport[1], EBT_IP6_SPORT)) + if (NF_INVF(info, EBT_IP6_SPORT, + src < info->sport[0] || + src > info->sport[1])) return false; } if ((info->bitmask & EBT_IP6_ICMP6) && - FWINV(pptr->icmphdr.type < info->icmpv6_type[0] || - pptr->icmphdr.type > info->icmpv6_type[1] || - pptr->icmphdr.code < info->icmpv6_code[0] || - pptr->icmphdr.code > info->icmpv6_code[1], - EBT_IP6_ICMP6)) + NF_INVF(info, EBT_IP6_ICMP6, + pptr->icmphdr.type < info->icmpv6_type[0] || + pptr->icmphdr.type > info->icmpv6_type[1] || + pptr->icmphdr.code < info->icmpv6_code[0] || + pptr->icmphdr.code > info->icmpv6_code[1])) return false; } return true; diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c index 45f73d55422f..3140eb912d7e 100644 --- a/net/bridge/netfilter/ebt_stp.c +++ b/net/bridge/netfilter/ebt_stp.c @@ -49,66 +49,68 @@ static bool ebt_filter_config(const struct ebt_stp_info *info, c = &info->config; if ((info->bitmask & EBT_STP_FLAGS) && - FWINV(c->flags != stpc->flags, EBT_STP_FLAGS)) + NF_INVF(info, EBT_STP_FLAGS, c->flags != stpc->flags)) return false; if (info->bitmask & EBT_STP_ROOTPRIO) { v16 = NR16(stpc->root); - if (FWINV(v16 < c->root_priol || v16 > c->root_priou, - EBT_STP_ROOTPRIO)) + if (NF_INVF(info, EBT_STP_ROOTPRIO, + v16 < c->root_priol || v16 > c->root_priou)) return false; } if (info->bitmask & EBT_STP_ROOTADDR) { - if (FWINV(!ether_addr_equal_masked(&stpc->root[2], c->root_addr, - c->root_addrmsk), - EBT_STP_ROOTADDR)) + if (NF_INVF(info, EBT_STP_ROOTADDR, + !ether_addr_equal_masked(&stpc->root[2], + c->root_addr, + c->root_addrmsk))) return false; } if (info->bitmask & EBT_STP_ROOTCOST) { v32 = NR32(stpc->root_cost); - if (FWINV(v32 < c->root_costl || v32 > c->root_costu, - EBT_STP_ROOTCOST)) + if (NF_INVF(info, EBT_STP_ROOTCOST, + v32 < c->root_costl || v32 > c->root_costu)) return false; } if (info->bitmask & EBT_STP_SENDERPRIO) { v16 = NR16(stpc->sender); - if (FWINV(v16 < c->sender_priol || v16 > c->sender_priou, - EBT_STP_SENDERPRIO)) + if (NF_INVF(info, EBT_STP_SENDERPRIO, + v16 < c->sender_priol || v16 > c->sender_priou)) return false; } if (info->bitmask & EBT_STP_SENDERADDR) { - if (FWINV(!ether_addr_equal_masked(&stpc->sender[2], - c->sender_addr, - c->sender_addrmsk), - EBT_STP_SENDERADDR)) + if (NF_INVF(info, EBT_STP_SENDERADDR, + !ether_addr_equal_masked(&stpc->sender[2], + c->sender_addr, + c->sender_addrmsk))) return false; } if (info->bitmask & EBT_STP_PORT) { v16 = NR16(stpc->port); - if (FWINV(v16 < c->portl || v16 > c->portu, EBT_STP_PORT)) + if (NF_INVF(info, EBT_STP_PORT, + v16 < c->portl || v16 > c->portu)) return false; } if (info->bitmask & EBT_STP_MSGAGE) { v16 = NR16(stpc->msg_age); - if (FWINV(v16 < c->msg_agel || v16 > c->msg_ageu, - EBT_STP_MSGAGE)) + if (NF_INVF(info, EBT_STP_MSGAGE, + v16 < c->msg_agel || v16 > c->msg_ageu)) return false; } if (info->bitmask & EBT_STP_MAXAGE) { v16 = NR16(stpc->max_age); - if (FWINV(v16 < c->max_agel || v16 > c->max_ageu, - EBT_STP_MAXAGE)) + if (NF_INVF(info, EBT_STP_MAXAGE, + v16 < c->max_agel || v16 > c->max_ageu)) return false; } if (info->bitmask & EBT_STP_HELLOTIME) { v16 = NR16(stpc->hello_time); - if (FWINV(v16 < c->hello_timel || v16 > c->hello_timeu, - EBT_STP_HELLOTIME)) + if (NF_INVF(info, EBT_STP_HELLOTIME, + v16 < c->hello_timel || v16 > c->hello_timeu)) return false; } if (info->bitmask & EBT_STP_FWDD) { v16 = NR16(stpc->forward_delay); - if (FWINV(v16 < c->forward_delayl || v16 > c->forward_delayu, - EBT_STP_FWDD)) + if (NF_INVF(info, EBT_STP_FWDD, + v16 < c->forward_delayl || v16 > c->forward_delayu)) return false; } return true; @@ -130,8 +132,8 @@ ebt_stp_mt(const struct sk_buff *skb, struct xt_action_param *par) if (memcmp(sp, header, sizeof(header))) return false; - if (info->bitmask & EBT_STP_TYPE && - FWINV(info->type != sp->type, EBT_STP_TYPE)) + if ((info->bitmask & EBT_STP_TYPE) && + NF_INVF(info, EBT_STP_TYPE, info->type != sp->type)) return false; if (sp->type == BPDU_TYPE_CONFIG && diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 5721a25be860..cceac5bb658f 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -121,7 +121,6 @@ ebt_dev_check(const char *entry, const struct net_device *device) return devname[i] != entry[i] && entry[i] != 1; } -#define FWINV2(bool, invflg) ((bool) ^ !!(e->invflags & invflg)) /* process standard matches */ static inline int ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, @@ -137,34 +136,36 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, ethproto = h->h_proto; if (e->bitmask & EBT_802_3) { - if (FWINV2(eth_proto_is_802_3(ethproto), EBT_IPROTO)) + if (NF_INVF(e, EBT_IPROTO, eth_proto_is_802_3(ethproto))) return 1; } else if (!(e->bitmask & EBT_NOPROTO) && - FWINV2(e->ethproto != ethproto, EBT_IPROTO)) + NF_INVF(e, EBT_IPROTO, e->ethproto != ethproto)) return 1; - if (FWINV2(ebt_dev_check(e->in, in), EBT_IIN)) + if (NF_INVF(e, EBT_IIN, ebt_dev_check(e->in, in))) return 1; - if (FWINV2(ebt_dev_check(e->out, out), EBT_IOUT)) + if (NF_INVF(e, EBT_IOUT, ebt_dev_check(e->out, out))) return 1; /* rcu_read_lock()ed by nf_hook_slow */ if (in && (p = br_port_get_rcu(in)) != NULL && - FWINV2(ebt_dev_check(e->logical_in, p->br->dev), EBT_ILOGICALIN)) + NF_INVF(e, EBT_ILOGICALIN, + ebt_dev_check(e->logical_in, p->br->dev))) return 1; if (out && (p = br_port_get_rcu(out)) != NULL && - FWINV2(ebt_dev_check(e->logical_out, p->br->dev), EBT_ILOGICALOUT)) + NF_INVF(e, EBT_ILOGICALOUT, + ebt_dev_check(e->logical_out, p->br->dev))) return 1; if (e->bitmask & EBT_SOURCEMAC) { - if (FWINV2(!ether_addr_equal_masked(h->h_source, - e->sourcemac, e->sourcemsk), - EBT_ISOURCE)) + if (NF_INVF(e, EBT_ISOURCE, + !ether_addr_equal_masked(h->h_source, e->sourcemac, + e->sourcemsk))) return 1; } if (e->bitmask & EBT_DESTMAC) { - if (FWINV2(!ether_addr_equal_masked(h->h_dest, - e->destmac, e->destmsk), - EBT_IDEST)) + if (NF_INVF(e, EBT_IDEST, + !ether_addr_equal_masked(h->h_dest, e->destmac, + e->destmsk))) return 1; } return 0; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 2033f929aa66..c8dd9e26b185 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -89,22 +89,20 @@ static inline int arp_packet_match(const struct arphdr *arphdr, __be32 src_ipaddr, tgt_ipaddr; long ret; -#define FWINV(bool, invflg) ((bool) ^ !!(arpinfo->invflags & (invflg))) - - if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop, - ARPT_INV_ARPOP)) + if (NF_INVF(arpinfo, ARPT_INV_ARPOP, + (arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop)) return 0; - if (FWINV((arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd, - ARPT_INV_ARPHRD)) + if (NF_INVF(arpinfo, ARPT_INV_ARPHRD, + (arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd)) return 0; - if (FWINV((arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro, - ARPT_INV_ARPPRO)) + if (NF_INVF(arpinfo, ARPT_INV_ARPPRO, + (arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro)) return 0; - if (FWINV((arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln, - ARPT_INV_ARPHLN)) + if (NF_INVF(arpinfo, ARPT_INV_ARPHLN, + (arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln)) return 0; src_devaddr = arpptr; @@ -115,31 +113,32 @@ static inline int arp_packet_match(const struct arphdr *arphdr, arpptr += dev->addr_len; memcpy(&tgt_ipaddr, arpptr, sizeof(u32)); - if (FWINV(arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len), - ARPT_INV_SRCDEVADDR) || - FWINV(arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len), - ARPT_INV_TGTDEVADDR)) + if (NF_INVF(arpinfo, ARPT_INV_SRCDEVADDR, + arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, + dev->addr_len)) || + NF_INVF(arpinfo, ARPT_INV_TGTDEVADDR, + arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, + dev->addr_len))) return 0; - if (FWINV((src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr, - ARPT_INV_SRCIP) || - FWINV(((tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr), - ARPT_INV_TGTIP)) + if (NF_INVF(arpinfo, ARPT_INV_SRCIP, + (src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr) || + NF_INVF(arpinfo, ARPT_INV_TGTIP, + (tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr)) return 0; /* Look for ifname matches. */ ret = ifname_compare(indev, arpinfo->iniface, arpinfo->iniface_mask); - if (FWINV(ret != 0, ARPT_INV_VIA_IN)) + if (NF_INVF(arpinfo, ARPT_INV_VIA_IN, ret != 0)) return 0; ret = ifname_compare(outdev, arpinfo->outiface, arpinfo->outiface_mask); - if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) + if (NF_INVF(arpinfo, ARPT_INV_VIA_OUT, ret != 0)) return 0; return 1; -#undef FWINV } static inline int arp_checkentry(const struct arpt_arp *arp) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 54906e0e8e0c..f0df66f54ce6 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -58,32 +58,31 @@ ip_packet_match(const struct iphdr *ip, { unsigned long ret; -#define FWINV(bool, invflg) ((bool) ^ !!(ipinfo->invflags & (invflg))) - - if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr, - IPT_INV_SRCIP) || - FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr, - IPT_INV_DSTIP)) + if (NF_INVF(ipinfo, IPT_INV_SRCIP, + (ip->saddr & ipinfo->smsk.s_addr) != ipinfo->src.s_addr) || + NF_INVF(ipinfo, IPT_INV_DSTIP, + (ip->daddr & ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr)) return false; ret = ifname_compare_aligned(indev, ipinfo->iniface, ipinfo->iniface_mask); - if (FWINV(ret != 0, IPT_INV_VIA_IN)) + if (NF_INVF(ipinfo, IPT_INV_VIA_IN, ret != 0)) return false; ret = ifname_compare_aligned(outdev, ipinfo->outiface, ipinfo->outiface_mask); - if (FWINV(ret != 0, IPT_INV_VIA_OUT)) + if (NF_INVF(ipinfo, IPT_INV_VIA_OUT, ret != 0)) return false; /* Check specific protocol */ if (ipinfo->proto && - FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) + NF_INVF(ipinfo, IPT_INV_PROTO, ip->protocol != ipinfo->proto)) return false; /* If we have a fragment rule but the packet is not a fragment * then we return zero */ - if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) + if (NF_INVF(ipinfo, IPT_INV_FRAG, + (ipinfo->flags & IPT_F_FRAG) && !isfrag)) return false; return true; @@ -122,7 +121,6 @@ static inline bool unconditional(const struct ipt_entry *e) return e->target_offset == sizeof(struct ipt_entry) && memcmp(&e->ip, &uncond, sizeof(uncond)) == 0; -#undef FWINV } /* for const-correctness */ diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 63e06c3dd319..61ed95054efa 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -73,22 +73,22 @@ ip6_packet_match(const struct sk_buff *skb, unsigned long ret; const struct ipv6hdr *ipv6 = ipv6_hdr(skb); -#define FWINV(bool, invflg) ((bool) ^ !!(ip6info->invflags & (invflg))) - - if (FWINV(ipv6_masked_addr_cmp(&ipv6->saddr, &ip6info->smsk, - &ip6info->src), IP6T_INV_SRCIP) || - FWINV(ipv6_masked_addr_cmp(&ipv6->daddr, &ip6info->dmsk, - &ip6info->dst), IP6T_INV_DSTIP)) + if (NF_INVF(ip6info, IP6T_INV_SRCIP, + ipv6_masked_addr_cmp(&ipv6->saddr, &ip6info->smsk, + &ip6info->src)) || + NF_INVF(ip6info, IP6T_INV_DSTIP, + ipv6_masked_addr_cmp(&ipv6->daddr, &ip6info->dmsk, + &ip6info->dst))) return false; ret = ifname_compare_aligned(indev, ip6info->iniface, ip6info->iniface_mask); - if (FWINV(ret != 0, IP6T_INV_VIA_IN)) + if (NF_INVF(ip6info, IP6T_INV_VIA_IN, ret != 0)) return false; ret = ifname_compare_aligned(outdev, ip6info->outiface, ip6info->outiface_mask); - if (FWINV(ret != 0, IP6T_INV_VIA_OUT)) + if (NF_INVF(ip6info, IP6T_INV_VIA_OUT, ret != 0)) return false; /* ... might want to do something with class and flowlabel here ... */ diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c index c14d4645daa3..ade024c90f4f 100644 --- a/net/netfilter/xt_tcpudp.c +++ b/net/netfilter/xt_tcpudp.c @@ -83,8 +83,6 @@ static bool tcp_mt(const struct sk_buff *skb, struct xt_action_param *par) return false; } -#define FWINVTCP(bool, invflg) ((bool) ^ !!(tcpinfo->invflags & (invflg))) - th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph); if (th == NULL) { /* We've been asked to examine this packet, and we @@ -102,9 +100,8 @@ static bool tcp_mt(const struct sk_buff *skb, struct xt_action_param *par) ntohs(th->dest), !!(tcpinfo->invflags & XT_TCP_INV_DSTPT))) return false; - if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask) - == tcpinfo->flg_cmp, - XT_TCP_INV_FLAGS)) + if (!NF_INVF(tcpinfo, XT_TCP_INV_FLAGS, + (((unsigned char *)th)[13] & tcpinfo->flg_mask) == tcpinfo->flg_cmp)) return false; if (tcpinfo->option) { if (th->doff * 4 < sizeof(_tcph)) { -- cgit v1.2.3 From f86dec94e3a86c992a637df1c301a4df25a85801 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 15 Apr 2016 18:14:25 +0200 Subject: NFC: hci: delete unused nfc_llc_get_rx_head_tail_room() It used to be EXPORTed, but then EXPORT usage was cleaned up (in 2012), without noticing that the function has no users at all (and curiously, never had any users). Delete it. While at it, remove non-static "inline" hints on nearby functions: these hints don't work across compilation units anyway, and these functions are not used in their .c file, thus they are never inlined. IOW: "inline" here does not help in any way. Signed-off-by: Denys Vlasenko CC: Samuel Ortiz CC: Christophe Ricard CC: linux-wireless@vger.kernel.org CC: linux-kernel@vger.kernel.org Signed-off-by: Samuel Ortiz --- net/nfc/hci/llc.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/nfc/hci/llc.c b/net/nfc/hci/llc.c index 1399a03fa6e6..3d699cbc7435 100644 --- a/net/nfc/hci/llc.c +++ b/net/nfc/hci/llc.c @@ -133,36 +133,29 @@ void nfc_llc_free(struct nfc_llc *llc) kfree(llc); } -inline void nfc_llc_get_rx_head_tail_room(struct nfc_llc *llc, int *rx_headroom, - int *rx_tailroom) -{ - *rx_headroom = llc->rx_headroom; - *rx_tailroom = llc->rx_tailroom; -} - -inline int nfc_llc_start(struct nfc_llc *llc) +int nfc_llc_start(struct nfc_llc *llc) { return llc->ops->start(llc); } EXPORT_SYMBOL(nfc_llc_start); -inline int nfc_llc_stop(struct nfc_llc *llc) +int nfc_llc_stop(struct nfc_llc *llc) { return llc->ops->stop(llc); } EXPORT_SYMBOL(nfc_llc_stop); -inline void nfc_llc_rcv_from_drv(struct nfc_llc *llc, struct sk_buff *skb) +void nfc_llc_rcv_from_drv(struct nfc_llc *llc, struct sk_buff *skb) { llc->ops->rcv_from_drv(llc, skb); } -inline int nfc_llc_xmit_from_hci(struct nfc_llc *llc, struct sk_buff *skb) +int nfc_llc_xmit_from_hci(struct nfc_llc *llc, struct sk_buff *skb) { return llc->ops->xmit_from_hci(llc, skb); } -inline void *nfc_llc_get_data(struct nfc_llc *llc) +void *nfc_llc_get_data(struct nfc_llc *llc) { return llc->data; } -- cgit v1.2.3 From 7854a44526de84142e367f08288c9f3a33c4c8ee Mon Sep 17 00:00:00 2001 From: Thierry Escande Date: Tue, 7 Jun 2016 16:21:52 +0200 Subject: NFC: digital: Add a delay between poll cycles This replaces the polling work struct with a delayed work struct and add a 10 ms delay between 2 poll cycles. This avoids to flood the device with 'switch off'/'switch on' commands. Signed-off-by: Thierry Escande Signed-off-by: Samuel Ortiz --- net/nfc/digital_core.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index dd9003f38822..27769ac89d27 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -30,6 +30,9 @@ #define DIGITAL_PROTO_ISO15693_RF_TECH NFC_PROTO_ISO15693_MASK +/* Delay between each poll frame (ms) */ +#define DIGITAL_POLL_INTERVAL 10 + struct digital_cmd { struct list_head queue; @@ -419,7 +422,8 @@ void digital_poll_next_tech(struct nfc_digital_dev *ddev) mutex_unlock(&ddev->poll_lock); - schedule_work(&ddev->poll_work); + schedule_delayed_work(&ddev->poll_work, + msecs_to_jiffies(DIGITAL_POLL_INTERVAL)); } static void digital_wq_poll(struct work_struct *work) @@ -428,7 +432,7 @@ static void digital_wq_poll(struct work_struct *work) struct digital_poll_tech *poll_tech; struct nfc_digital_dev *ddev = container_of(work, struct nfc_digital_dev, - poll_work); + poll_work.work); mutex_lock(&ddev->poll_lock); if (!ddev->poll_tech_count) { @@ -543,7 +547,7 @@ static int digital_start_poll(struct nfc_dev *nfc_dev, __u32 im_protocols, return -EINVAL; } - schedule_work(&ddev->poll_work); + schedule_delayed_work(&ddev->poll_work, 0); return 0; } @@ -564,7 +568,7 @@ static void digital_stop_poll(struct nfc_dev *nfc_dev) mutex_unlock(&ddev->poll_lock); - cancel_work_sync(&ddev->poll_work); + cancel_delayed_work_sync(&ddev->poll_work); digital_abort_cmd(ddev); } @@ -770,7 +774,7 @@ struct nfc_digital_dev *nfc_digital_allocate_device(struct nfc_digital_ops *ops, INIT_WORK(&ddev->cmd_complete_work, digital_wq_cmd_complete); mutex_init(&ddev->poll_lock); - INIT_WORK(&ddev->poll_work, digital_wq_poll); + INIT_DELAYED_WORK(&ddev->poll_work, digital_wq_poll); if (supported_protocols & NFC_PROTO_JEWEL_MASK) ddev->protocols |= NFC_PROTO_JEWEL_MASK; @@ -832,7 +836,7 @@ void nfc_digital_unregister_device(struct nfc_digital_dev *ddev) ddev->poll_tech_count = 0; mutex_unlock(&ddev->poll_lock); - cancel_work_sync(&ddev->poll_work); + cancel_delayed_work_sync(&ddev->poll_work); cancel_work_sync(&ddev->cmd_work); cancel_work_sync(&ddev->cmd_complete_work); -- cgit v1.2.3 From 806bfe31c96f77e917eac476ba87164f7bbd1366 Mon Sep 17 00:00:00 2001 From: Thierry Escande Date: Tue, 7 Jun 2016 16:21:53 +0200 Subject: NFC: llcp: Use dynamic debug for hex dump LLCP skb tx and rx functions now use print_hex_dump_debug() making these verbose traces controllable using dynamic debug. Signed-off-by: Thierry Escande Signed-off-by: Samuel Ortiz --- net/nfc/llcp_core.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index 98876274a1ee..e69786c6804c 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -732,9 +732,8 @@ static void nfc_llcp_tx_work(struct work_struct *work) int ret; pr_debug("Sending pending skb\n"); - print_hex_dump(KERN_DEBUG, "LLCP Tx: ", - DUMP_PREFIX_OFFSET, 16, 1, - skb->data, skb->len, true); + print_hex_dump_debug("LLCP Tx: ", DUMP_PREFIX_OFFSET, + 16, 1, skb->data, skb->len, true); if (ptype == LLCP_PDU_DISC && sk != NULL && sk->sk_state == LLCP_DISCONNECTING) { @@ -1412,8 +1411,8 @@ static void nfc_llcp_rx_skb(struct nfc_llcp_local *local, struct sk_buff *skb) pr_debug("ptype 0x%x dsap 0x%x ssap 0x%x\n", ptype, dsap, ssap); if (ptype != LLCP_PDU_SYMM) - print_hex_dump(KERN_DEBUG, "LLCP Rx: ", DUMP_PREFIX_OFFSET, - 16, 1, skb->data, skb->len, true); + print_hex_dump_debug("LLCP Rx: ", DUMP_PREFIX_OFFSET, 16, 1, + skb->data, skb->len, true); switch (ptype) { case LLCP_PDU_SYMM: -- cgit v1.2.3 From 09748a22f4ab7b0ab5a83c432f6e18f65f18e09b Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Mon, 9 May 2016 18:41:08 +0200 Subject: batman-adv: add generic netlink family for batman-adv debugfs is currently severely broken virtually everywhere in the kernel where files are dynamically added and removed (see http://lkml.iu.edu/hypermail/linux/kernel/1506.1/02196.html for some details). In addition to that, debugfs is not namespace-aware. Instead of adding new debugfs entries, the whole infrastructure should be moved to netlink. This will fix the long standing problem of large buffers for debug tables and hard to parse text files. Signed-off-by: Matthias Schiffer Signed-off-by: Andrew Lunn [sven.eckelmann@open-mesh.com: Strip down patch to only add genl family, add missing kerneldoc] Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/Makefile | 1 + net/batman-adv/main.c | 3 +++ net/batman-adv/netlink.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++ net/batman-adv/netlink.h | 26 ++++++++++++++++++++++ 4 files changed, 87 insertions(+) create mode 100644 net/batman-adv/netlink.c create mode 100644 net/batman-adv/netlink.h (limited to 'net') diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index a55f4ec97068..7da59014e134 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -35,6 +35,7 @@ batman-adv-y += icmp_socket.o batman-adv-$(CONFIG_BATMAN_ADV_DEBUG) += log.o batman-adv-y += main.o batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o +batman-adv-y += netlink.o batman-adv-$(CONFIG_BATMAN_ADV_NC) += network-coding.o batman-adv-y += originator.o batman-adv-y += routing.o diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index eab9d1b8a6eb..275604b7c64e 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -57,6 +57,7 @@ #include "icmp_socket.h" #include "log.h" #include "multicast.h" +#include "netlink.h" #include "network-coding.h" #include "originator.h" #include "packet.h" @@ -99,6 +100,7 @@ static int __init batadv_init(void) register_netdevice_notifier(&batadv_hard_if_notifier); rtnl_link_register(&batadv_link_ops); + batadv_netlink_register(); pr_info("B.A.T.M.A.N. advanced %s (compatibility version %i) loaded\n", BATADV_SOURCE_VERSION, BATADV_COMPAT_VERSION); @@ -109,6 +111,7 @@ static int __init batadv_init(void) static void __exit batadv_exit(void) { batadv_debugfs_destroy(); + batadv_netlink_unregister(); rtnl_link_unregister(&batadv_link_ops); unregister_netdevice_notifier(&batadv_hard_if_notifier); batadv_hardif_remove_interfaces(); diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c new file mode 100644 index 000000000000..9e4c865a9cc0 --- /dev/null +++ b/net/batman-adv/netlink.c @@ -0,0 +1,57 @@ +/* Copyright (C) 2016 B.A.T.M.A.N. contributors: + * + * Matthias Schiffer + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include "netlink.h" +#include "main.h" + +#include +#include +#include +#include +#include + +static struct genl_family batadv_netlink_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = BATADV_NL_NAME, + .version = 1, + .maxattr = BATADV_ATTR_MAX, +}; + +static struct genl_ops batadv_netlink_ops[] = { +}; + +/** + * batadv_netlink_register - register batadv genl netlink family + */ +void __init batadv_netlink_register(void) +{ + int ret; + + ret = genl_register_family_with_ops(&batadv_netlink_family, + batadv_netlink_ops); + if (ret) + pr_warn("unable to register netlink family"); +} + +/** + * batadv_netlink_unregister - unregister batadv genl netlink family + */ +void batadv_netlink_unregister(void) +{ + genl_unregister_family(&batadv_netlink_family); +} diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h new file mode 100644 index 000000000000..39044ccff662 --- /dev/null +++ b/net/batman-adv/netlink.h @@ -0,0 +1,26 @@ +/* Copyright (C) 2016 B.A.T.M.A.N. contributors: + * + * Matthias Schiffer + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _NET_BATMAN_ADV_NETLINK_H_ +#define _NET_BATMAN_ADV_NETLINK_H_ + +#include "main.h" + +void batadv_netlink_register(void); +void batadv_netlink_unregister(void); + +#endif /* _NET_BATMAN_ADV_NETLINK_H_ */ -- cgit v1.2.3 From 5da0aef5e93591b373010c10f374c4161b37728c Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Mon, 9 May 2016 18:41:09 +0200 Subject: batman-adv: add netlink command to query generic mesh information files BATADV_CMD_GET_MESH_INFO is used to query basic information about a batman-adv softif (name, index and MAC address for both the softif and the primary hardif; routing algorithm; batman-adv version). Signed-off-by: Matthias Schiffer Signed-off-by: Andrew Lunn [sven.eckelmann@open-mesh.com: Reduce the number of changes to BATADV_CMD_GET_MESH_INFO, add missing kerneldoc, add policy for attributes] Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/netlink.c | 137 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) (limited to 'net') diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 9e4c865a9cc0..68152aa9bb26 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -18,12 +18,24 @@ #include "netlink.h" #include "main.h" +#include +#include #include +#include #include +#include +#include #include +#include #include +#include #include +#include "hard-interface.h" +#include "soft-interface.h" + +struct sk_buff; + static struct genl_family batadv_netlink_family = { .id = GENL_ID_GENERATE, .hdrsize = 0, @@ -32,7 +44,132 @@ static struct genl_family batadv_netlink_family = { .maxattr = BATADV_ATTR_MAX, }; +static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { + [BATADV_ATTR_VERSION] = { .type = NLA_STRING }, + [BATADV_ATTR_ALGO_NAME] = { .type = NLA_STRING }, + [BATADV_ATTR_MESH_IFINDEX] = { .type = NLA_U32 }, + [BATADV_ATTR_MESH_IFNAME] = { .type = NLA_STRING }, + [BATADV_ATTR_MESH_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_HARD_IFINDEX] = { .type = NLA_U32 }, + [BATADV_ATTR_HARD_IFNAME] = { .type = NLA_STRING }, + [BATADV_ATTR_HARD_ADDRESS] = { .len = ETH_ALEN }, +}; + +/** + * batadv_netlink_mesh_info_put - fill in generic information about mesh + * interface + * @msg: netlink message to be sent back + * @soft_iface: interface for which the data should be taken + * + * Return: 0 on success, < 0 on error + */ +static int +batadv_netlink_mesh_info_put(struct sk_buff *msg, struct net_device *soft_iface) +{ + struct batadv_priv *bat_priv = netdev_priv(soft_iface); + struct batadv_hard_iface *primary_if = NULL; + struct net_device *hard_iface; + int ret = -ENOBUFS; + + if (nla_put_string(msg, BATADV_ATTR_VERSION, BATADV_SOURCE_VERSION) || + nla_put_string(msg, BATADV_ATTR_ALGO_NAME, + bat_priv->bat_algo_ops->name) || + nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, soft_iface->ifindex) || + nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, soft_iface->name) || + nla_put(msg, BATADV_ATTR_MESH_ADDRESS, ETH_ALEN, + soft_iface->dev_addr)) + goto out; + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (primary_if && primary_if->if_status == BATADV_IF_ACTIVE) { + hard_iface = primary_if->net_dev; + + if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, + hard_iface->ifindex) || + nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, + hard_iface->name) || + nla_put(msg, BATADV_ATTR_HARD_ADDRESS, ETH_ALEN, + hard_iface->dev_addr)) + goto out; + } + + ret = 0; + + out: + if (primary_if) + batadv_hardif_put(primary_if); + + return ret; +} + +/** + * batadv_netlink_get_mesh_info - handle incoming BATADV_CMD_GET_MESH_INFO + * netlink request + * @skb: received netlink message + * @info: receiver information + * + * Return: 0 on success, < 0 on error + */ +static int +batadv_netlink_get_mesh_info(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct net_device *soft_iface; + struct sk_buff *msg = NULL; + void *msg_head; + int ifindex; + int ret; + + if (!info->attrs[BATADV_ATTR_MESH_IFINDEX]) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[BATADV_ATTR_MESH_IFINDEX]); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto out; + } + + msg_head = genlmsg_put(msg, info->snd_portid, info->snd_seq, + &batadv_netlink_family, 0, + BATADV_CMD_GET_MESH_INFO); + if (!msg_head) { + ret = -ENOBUFS; + goto out; + } + + ret = batadv_netlink_mesh_info_put(msg, soft_iface); + + out: + if (soft_iface) + dev_put(soft_iface); + + if (ret) { + if (msg) + nlmsg_free(msg); + return ret; + } + + genlmsg_end(msg, msg_head); + return genlmsg_reply(msg, info); +} + static struct genl_ops batadv_netlink_ops[] = { + { + .cmd = BATADV_CMD_GET_MESH_INFO, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .doit = batadv_netlink_get_mesh_info, + }, }; /** -- cgit v1.2.3 From f50ca95a691e9fd1fce530aade58c98d621cb1fe Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Wed, 18 May 2016 11:38:48 +0200 Subject: batman-adv: return netdev status in the TX path Return the proper netdev TX status along the TX path so that the tp_meter can understand when the queue is full and should stop sending packets. Signed-off-by: Antonio Quartulli Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/fragmentation.c | 41 +++++++++++++++++++++++++---------------- net/batman-adv/fragmentation.h | 6 +++--- net/batman-adv/routing.c | 25 ++++++++++++++----------- net/batman-adv/send.c | 36 +++++++++++++++++++++++------------- net/batman-adv/tvlv.c | 4 +++- 5 files changed, 68 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 9f41a0a0d6ab..0934730fb7ff 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -433,11 +433,12 @@ err: * @orig_node: final destination of the created fragments * @neigh_node: next-hop of the created fragments * - * Return: true on success, false otherwise. + * Return: the netdev tx status or -1 in case of error. + * When -1 is returned the skb is not consumed. */ -bool batadv_frag_send_packet(struct sk_buff *skb, - struct batadv_orig_node *orig_node, - struct batadv_neigh_node *neigh_node) +int batadv_frag_send_packet(struct sk_buff *skb, + struct batadv_orig_node *orig_node, + struct batadv_neigh_node *neigh_node) { struct batadv_priv *bat_priv; struct batadv_hard_iface *primary_if = NULL; @@ -446,7 +447,7 @@ bool batadv_frag_send_packet(struct sk_buff *skb, unsigned int mtu = neigh_node->if_incoming->net_dev->mtu; unsigned int header_size = sizeof(frag_header); unsigned int max_fragment_size, max_packet_size; - bool ret = false; + int ret = -1; /* To avoid merge and refragmentation at next-hops we never send * fragments larger than BATADV_FRAG_MAX_FRAG_SIZE @@ -457,12 +458,12 @@ bool batadv_frag_send_packet(struct sk_buff *skb, /* Don't even try to fragment, if we need more than 16 fragments */ if (skb->len > max_packet_size) - goto out_err; + goto out; bat_priv = orig_node->bat_priv; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) - goto out_err; + goto out; /* Create one header to be copied to all fragments */ frag_header.packet_type = BATADV_UNICAST_FRAG; @@ -488,23 +489,33 @@ bool batadv_frag_send_packet(struct sk_buff *skb, while (skb->len > max_fragment_size) { skb_fragment = batadv_frag_create(skb, &frag_header, mtu); if (!skb_fragment) - goto out_err; + goto out; batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_TX); batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES, skb_fragment->len + ETH_HLEN); - batadv_send_unicast_skb(skb_fragment, neigh_node); + ret = batadv_send_unicast_skb(skb_fragment, neigh_node); + if (ret != NET_XMIT_SUCCESS) { + /* return -1 so that the caller can free the original + * skb + */ + ret = -1; + goto out; + } + frag_header.no++; /* The initial check in this function should cover this case */ - if (frag_header.no == BATADV_FRAG_MAX_FRAGMENTS - 1) - goto out_err; + if (frag_header.no == BATADV_FRAG_MAX_FRAGMENTS - 1) { + ret = -1; + goto out; + } } /* Make room for the fragment header. */ if (batadv_skb_head_push(skb, header_size) < 0 || pskb_expand_head(skb, header_size + ETH_HLEN, 0, GFP_ATOMIC) < 0) - goto out_err; + goto out; memcpy(skb->data, &frag_header, header_size); @@ -512,11 +523,9 @@ bool batadv_frag_send_packet(struct sk_buff *skb, batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_TX); batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES, skb->len + ETH_HLEN); - batadv_send_unicast_skb(skb, neigh_node); + ret = batadv_send_unicast_skb(skb, neigh_node); - ret = true; - -out_err: +out: if (primary_if) batadv_hardif_put(primary_if); diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h index 9ff77c7ef7c7..3202fe329e63 100644 --- a/net/batman-adv/fragmentation.h +++ b/net/batman-adv/fragmentation.h @@ -34,9 +34,9 @@ bool batadv_frag_skb_fwd(struct sk_buff *skb, struct batadv_orig_node *orig_node_src); bool batadv_frag_skb_buffer(struct sk_buff **skb, struct batadv_orig_node *orig_node); -bool batadv_frag_send_packet(struct sk_buff *skb, - struct batadv_orig_node *orig_node, - struct batadv_neigh_node *neigh_node); +int batadv_frag_send_packet(struct sk_buff *skb, + struct batadv_orig_node *orig_node, + struct batadv_neigh_node *neigh_node); /** * batadv_frag_check_entry - check if a list of fragments has timed out diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 5833ab3008a1..76de583fe866 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -270,8 +270,10 @@ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv, icmph->ttl = BATADV_TTL; res = batadv_send_skb_to_orig(skb, orig_node, NULL); - if (res != NET_XMIT_DROP) - ret = NET_RX_SUCCESS; + if (res == -1) + goto out; + + ret = NET_RX_SUCCESS; break; default: @@ -292,7 +294,7 @@ static int batadv_recv_icmp_ttl_exceeded(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if = NULL; struct batadv_orig_node *orig_node = NULL; struct batadv_icmp_packet *icmp_packet; - int ret = NET_RX_DROP; + int res, ret = NET_RX_DROP; icmp_packet = (struct batadv_icmp_packet *)skb->data; @@ -323,7 +325,8 @@ static int batadv_recv_icmp_ttl_exceeded(struct batadv_priv *bat_priv, icmp_packet->msg_type = BATADV_TTL_EXCEEDED; icmp_packet->ttl = BATADV_TTL; - if (batadv_send_skb_to_orig(skb, orig_node, NULL) != NET_XMIT_DROP) + res = batadv_send_skb_to_orig(skb, orig_node, NULL); + if (res != -1) ret = NET_RX_SUCCESS; out: @@ -343,7 +346,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb, struct ethhdr *ethhdr; struct batadv_orig_node *orig_node = NULL; int hdr_size = sizeof(struct batadv_icmp_header); - int ret = NET_RX_DROP; + int res, ret = NET_RX_DROP; /* drop packet if it has not necessary minimum size */ if (unlikely(!pskb_may_pull(skb, hdr_size))) @@ -409,7 +412,8 @@ int batadv_recv_icmp_packet(struct sk_buff *skb, icmph->ttl--; /* route it */ - if (batadv_send_skb_to_orig(skb, orig_node, recv_if) != NET_XMIT_DROP) + res = batadv_send_skb_to_orig(skb, orig_node, recv_if); + if (res != -1) ret = NET_RX_SUCCESS; out: @@ -646,6 +650,8 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, len = skb->len; res = batadv_send_skb_to_orig(skb, orig_node, recv_if); + if (res == -1) + goto out; /* translate transmit result into receive result */ if (res == NET_XMIT_SUCCESS) { @@ -653,13 +659,10 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD); batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES, len + ETH_HLEN); - - ret = NET_RX_SUCCESS; - } else if (res == -EINPROGRESS) { - /* skb was buffered and consumed */ - ret = NET_RX_SUCCESS; } + ret = NET_RX_SUCCESS; + out: if (orig_node) batadv_orig_node_put(orig_node); diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 3a59df26ee32..3a10d87b4b76 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -72,6 +73,7 @@ int batadv_send_skb_packet(struct sk_buff *skb, { struct batadv_priv *bat_priv; struct ethhdr *ethhdr; + int ret; bat_priv = netdev_priv(hard_iface->soft_iface); @@ -109,8 +111,15 @@ int batadv_send_skb_packet(struct sk_buff *skb, /* dev_queue_xmit() returns a negative result on error. However on * congestion and traffic shaping, it drops and returns NET_XMIT_DROP * (which is > 0). This will not be treated as an error. + * + * a negative value cannot be returned because it could be interepreted + * as not consumed skb by callers of batadv_send_skb_to_orig. */ - return dev_queue_xmit(skb); + ret = dev_queue_xmit(skb); + if (ret < 0) + ret = NET_XMIT_DROP; + + return ret; send_skb_err: kfree_skb(skb); return NET_XMIT_DROP; @@ -156,8 +165,11 @@ int batadv_send_unicast_skb(struct sk_buff *skb, * host, NULL can be passed as recv_if and no interface alternating is * attempted. * - * Return: NET_XMIT_SUCCESS on success, NET_XMIT_DROP on failure, or - * -EINPROGRESS if the skb is buffered for later transmit. + * Return: -1 on failure (and the skb is not consumed), -EINPROGRESS if the + * skb is buffered for later transmit or the NET_XMIT status returned by the + * lower routine if the packet has been passed down. + * + * If the returning value is not -1 the skb has been consumed. */ int batadv_send_skb_to_orig(struct sk_buff *skb, struct batadv_orig_node *orig_node, @@ -165,7 +177,7 @@ int batadv_send_skb_to_orig(struct sk_buff *skb, { struct batadv_priv *bat_priv = orig_node->bat_priv; struct batadv_neigh_node *neigh_node; - int ret = NET_XMIT_DROP; + int ret = -1; /* batadv_find_router() increases neigh_nodes refcount if found. */ neigh_node = batadv_find_router(bat_priv, orig_node, recv_if); @@ -178,8 +190,7 @@ int batadv_send_skb_to_orig(struct sk_buff *skb, if (atomic_read(&bat_priv->fragmentation) && skb->len > neigh_node->if_incoming->net_dev->mtu) { /* Fragment and send packet. */ - if (batadv_frag_send_packet(skb, orig_node, neigh_node)) - ret = NET_XMIT_SUCCESS; + ret = batadv_frag_send_packet(skb, orig_node, neigh_node); goto out; } @@ -188,12 +199,10 @@ int batadv_send_skb_to_orig(struct sk_buff *skb, * (i.e. being forwarded). If the packet originates from this node or if * network coding fails, then send the packet as usual. */ - if (recv_if && batadv_nc_skb_forward(skb, neigh_node)) { + if (recv_if && batadv_nc_skb_forward(skb, neigh_node)) ret = -EINPROGRESS; - } else { - batadv_send_unicast_skb(skb, neigh_node); - ret = NET_XMIT_SUCCESS; - } + else + ret = batadv_send_unicast_skb(skb, neigh_node); out: if (neigh_node) @@ -319,7 +328,7 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv, { struct batadv_unicast_packet *unicast_packet; struct ethhdr *ethhdr; - int ret = NET_XMIT_DROP; + int res, ret = NET_XMIT_DROP; if (!orig_node) goto out; @@ -356,7 +365,8 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv, if (batadv_tt_global_client_is_roaming(bat_priv, ethhdr->h_dest, vid)) unicast_packet->ttvn = unicast_packet->ttvn - 1; - if (batadv_send_skb_to_orig(skb, orig_node, NULL) != NET_XMIT_DROP) + res = batadv_send_skb_to_orig(skb, orig_node, NULL); + if (res != -1) ret = NET_XMIT_SUCCESS; out: diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 2fd542e0d6a8..3d1cf0fb112d 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -591,6 +591,7 @@ void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src, unsigned char *tvlv_buff; unsigned int tvlv_len; ssize_t hdr_len = sizeof(*unicast_tvlv_packet); + int res; orig_node = batadv_orig_hash_find(bat_priv, dst); if (!orig_node) @@ -623,7 +624,8 @@ void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src, tvlv_buff += sizeof(*tvlv_hdr); memcpy(tvlv_buff, tvlv_value, tvlv_value_len); - if (batadv_send_skb_to_orig(skb, orig_node, NULL) == NET_XMIT_DROP) + res = batadv_send_skb_to_orig(skb, orig_node, NULL); + if (res == -1) kfree_skb(skb); out: batadv_orig_node_put(orig_node); -- cgit v1.2.3 From 33a3bb4a3345bb511f9c69c913da95d4693e2a4e Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Thu, 5 May 2016 13:09:43 +0200 Subject: batman-adv: throughput meter implementation The throughput meter module is a simple, kernel-space replacement for throughtput measurements tool like iperf and netperf. It is intended to approximate TCP behaviour. It is invoked through batctl: the protocol is connection oriented, with cumulative acknowledgment and a dynamic-size sliding window. The test *can* be interrupted by batctl. A receiver side timeout avoids unlimited waitings for sender packets: after one second of inactivity, the receiver abort the ongoing test. Based on a prototype from Edo Monticelli Signed-off-by: Antonio Quartulli Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/Makefile | 1 + net/batman-adv/log.h | 18 +- net/batman-adv/main.c | 4 + net/batman-adv/main.h | 8 + net/batman-adv/netlink.c | 234 +++++- net/batman-adv/netlink.h | 6 + net/batman-adv/packet.h | 54 ++ net/batman-adv/routing.c | 8 + net/batman-adv/soft-interface.c | 2 + net/batman-adv/tp_meter.c | 1507 +++++++++++++++++++++++++++++++++++++++ net/batman-adv/tp_meter.h | 34 + net/batman-adv/types.h | 112 +++ 12 files changed, 1978 insertions(+), 10 deletions(-) create mode 100644 net/batman-adv/tp_meter.c create mode 100644 net/batman-adv/tp_meter.h (limited to 'net') diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index 7da59014e134..a83fc6c58d19 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -42,5 +42,6 @@ batman-adv-y += routing.o batman-adv-y += send.o batman-adv-y += soft-interface.o batman-adv-y += sysfs.o +batman-adv-y += tp_meter.o batman-adv-y += translation-table.o batman-adv-y += tvlv.o diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index 9948e56eabaa..e0e1a88c3e58 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -51,17 +51,19 @@ static inline void batadv_debug_log_cleanup(struct batadv_priv *bat_priv) * @BATADV_DBG_DAT: ARP snooping and DAT related messages * @BATADV_DBG_NC: network coding related messages * @BATADV_DBG_MCAST: multicast related messages + * @BATADV_DBG_TP_METER: throughput meter messages * @BATADV_DBG_ALL: the union of all the above log levels */ enum batadv_dbg_level { - BATADV_DBG_BATMAN = BIT(0), - BATADV_DBG_ROUTES = BIT(1), - BATADV_DBG_TT = BIT(2), - BATADV_DBG_BLA = BIT(3), - BATADV_DBG_DAT = BIT(4), - BATADV_DBG_NC = BIT(5), - BATADV_DBG_MCAST = BIT(6), - BATADV_DBG_ALL = 127, + BATADV_DBG_BATMAN = BIT(0), + BATADV_DBG_ROUTES = BIT(1), + BATADV_DBG_TT = BIT(2), + BATADV_DBG_BLA = BIT(3), + BATADV_DBG_DAT = BIT(4), + BATADV_DBG_NC = BIT(5), + BATADV_DBG_MCAST = BIT(6), + BATADV_DBG_TP_METER = BIT(7), + BATADV_DBG_ALL = 127, }; #ifdef CONFIG_BATMAN_ADV_DEBUG diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 275604b7c64e..fe4c5e29f96b 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -64,6 +64,7 @@ #include "routing.h" #include "send.h" #include "soft-interface.h" +#include "tp_meter.h" #include "translation-table.h" /* List manipulations on hardif_list have to be rtnl_lock()'ed, @@ -89,6 +90,7 @@ static int __init batadv_init(void) batadv_v_init(); batadv_iv_init(); batadv_nc_init(); + batadv_tp_meter_init(); batadv_event_workqueue = create_singlethread_workqueue("bat_events"); @@ -142,6 +144,7 @@ int batadv_mesh_init(struct net_device *soft_iface) spin_lock_init(&bat_priv->tvlv.container_list_lock); spin_lock_init(&bat_priv->tvlv.handler_list_lock); spin_lock_init(&bat_priv->softif_vlan_list_lock); + spin_lock_init(&bat_priv->tp_list_lock); INIT_HLIST_HEAD(&bat_priv->forw_bat_list); INIT_HLIST_HEAD(&bat_priv->forw_bcast_list); @@ -160,6 +163,7 @@ int batadv_mesh_init(struct net_device *soft_iface) INIT_HLIST_HEAD(&bat_priv->tvlv.container_list); INIT_HLIST_HEAD(&bat_priv->tvlv.handler_list); INIT_HLIST_HEAD(&bat_priv->softif_vlan_list); + INIT_HLIST_HEAD(&bat_priv->tp_list); ret = batadv_v_mesh_init(bat_priv); if (ret < 0) diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 857fb5a4e37a..06a860845434 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -100,6 +100,9 @@ #define BATADV_NUM_BCASTS_WIRELESS 3 #define BATADV_NUM_BCASTS_MAX 3 +/* length of the single packet used by the TP meter */ +#define BATADV_TP_PACKET_LEN ETH_DATA_LEN + /* msecs after which an ARP_REQUEST is sent in broadcast as fallback */ #define ARP_REQ_DELAY 250 /* numbers of originator to contact for any PUT/GET DHT operation */ @@ -131,6 +134,11 @@ #define BATADV_NC_NODE_TIMEOUT 10000 /* Milliseconds */ +/** + * BATADV_TP_MAX_NUM - maximum number of simultaneously active tp sessions + */ +#define BATADV_TP_MAX_NUM 5 + enum batadv_mesh_state { BATADV_MESH_INACTIVE, BATADV_MESH_ACTIVE, diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 68152aa9bb26..c25bbb8ab06c 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -27,12 +27,14 @@ #include #include #include +#include #include #include #include #include "hard-interface.h" #include "soft-interface.h" +#include "tp_meter.h" struct sk_buff; @@ -44,6 +46,15 @@ static struct genl_family batadv_netlink_family = { .maxattr = BATADV_ATTR_MAX, }; +/* multicast groups */ +enum batadv_netlink_multicast_groups { + BATADV_NL_MCGRP_TPMETER, +}; + +static struct genl_multicast_group batadv_netlink_mcgrps[] = { + [BATADV_NL_MCGRP_TPMETER] = { .name = BATADV_NL_MCAST_GROUP_TPMETER }, +}; + static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_VERSION] = { .type = NLA_STRING }, [BATADV_ATTR_ALGO_NAME] = { .type = NLA_STRING }, @@ -53,6 +64,11 @@ static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_HARD_IFINDEX] = { .type = NLA_U32 }, [BATADV_ATTR_HARD_IFNAME] = { .type = NLA_STRING }, [BATADV_ATTR_HARD_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_ORIG_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_TPMETER_RESULT] = { .type = NLA_U8 }, + [BATADV_ATTR_TPMETER_TEST_TIME] = { .type = NLA_U32 }, + [BATADV_ATTR_TPMETER_BYTES] = { .type = NLA_U64 }, + [BATADV_ATTR_TPMETER_COOKIE] = { .type = NLA_U32 }, }; /** @@ -163,6 +179,207 @@ batadv_netlink_get_mesh_info(struct sk_buff *skb, struct genl_info *info) return genlmsg_reply(msg, info); } +/** + * batadv_netlink_tp_meter_put - Fill information of started tp_meter session + * @msg: netlink message to be sent back + * @cookie: tp meter session cookie + * + * Return: 0 on success, < 0 on error + */ +static int +batadv_netlink_tp_meter_put(struct sk_buff *msg, u32 cookie) +{ + if (nla_put_u32(msg, BATADV_ATTR_TPMETER_COOKIE, cookie)) + return -ENOBUFS; + + return 0; +} + +/** + * batadv_netlink_tpmeter_notify - send tp_meter result via netlink to client + * @bat_priv: the bat priv with all the soft interface information + * @dst: destination of tp_meter session + * @result: reason for tp meter session stop + * @test_time: total time ot the tp_meter session + * @total_bytes: bytes acked to the receiver + * @cookie: cookie of tp_meter session + * + * Return: 0 on success, < 0 on error + */ +int batadv_netlink_tpmeter_notify(struct batadv_priv *bat_priv, const u8 *dst, + u8 result, u32 test_time, u64 total_bytes, + u32 cookie) +{ + struct sk_buff *msg; + void *hdr; + int ret; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put(msg, 0, 0, &batadv_netlink_family, 0, + BATADV_CMD_TP_METER); + if (!hdr) { + ret = -ENOBUFS; + goto err_genlmsg; + } + + if (nla_put_u32(msg, BATADV_ATTR_TPMETER_COOKIE, cookie)) + goto nla_put_failure; + + if (nla_put_u32(msg, BATADV_ATTR_TPMETER_TEST_TIME, test_time)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, BATADV_ATTR_TPMETER_BYTES, total_bytes, + BATADV_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u8(msg, BATADV_ATTR_TPMETER_RESULT, result)) + goto nla_put_failure; + + if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, dst)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + genlmsg_multicast_netns(&batadv_netlink_family, + dev_net(bat_priv->soft_iface), msg, 0, + BATADV_NL_MCGRP_TPMETER, GFP_KERNEL); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + ret = -EMSGSIZE; + +err_genlmsg: + nlmsg_free(msg); + return ret; +} + +/** + * batadv_netlink_tp_meter_start - Start a new tp_meter session + * @skb: received netlink message + * @info: receiver information + * + * Return: 0 on success, < 0 on error + */ +static int +batadv_netlink_tp_meter_start(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct net_device *soft_iface; + struct batadv_priv *bat_priv; + struct sk_buff *msg = NULL; + u32 test_length; + void *msg_head; + int ifindex; + u32 cookie; + u8 *dst; + int ret; + + if (!info->attrs[BATADV_ATTR_MESH_IFINDEX]) + return -EINVAL; + + if (!info->attrs[BATADV_ATTR_ORIG_ADDRESS]) + return -EINVAL; + + if (!info->attrs[BATADV_ATTR_TPMETER_TEST_TIME]) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[BATADV_ATTR_MESH_IFINDEX]); + if (!ifindex) + return -EINVAL; + + dst = nla_data(info->attrs[BATADV_ATTR_ORIG_ADDRESS]); + + test_length = nla_get_u32(info->attrs[BATADV_ATTR_TPMETER_TEST_TIME]); + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto out; + } + + msg_head = genlmsg_put(msg, info->snd_portid, info->snd_seq, + &batadv_netlink_family, 0, + BATADV_CMD_TP_METER); + if (!msg_head) { + ret = -ENOBUFS; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + batadv_tp_start(bat_priv, dst, test_length, &cookie); + + ret = batadv_netlink_tp_meter_put(msg, cookie); + + out: + if (soft_iface) + dev_put(soft_iface); + + if (ret) { + if (msg) + nlmsg_free(msg); + return ret; + } + + genlmsg_end(msg, msg_head); + return genlmsg_reply(msg, info); +} + +/** + * batadv_netlink_tp_meter_start - Cancel a running tp_meter session + * @skb: received netlink message + * @info: receiver information + * + * Return: 0 on success, < 0 on error + */ +static int +batadv_netlink_tp_meter_cancel(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct net_device *soft_iface; + struct batadv_priv *bat_priv; + int ifindex; + u8 *dst; + int ret = 0; + + if (!info->attrs[BATADV_ATTR_MESH_IFINDEX]) + return -EINVAL; + + if (!info->attrs[BATADV_ATTR_ORIG_ADDRESS]) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[BATADV_ATTR_MESH_IFINDEX]); + if (!ifindex) + return -EINVAL; + + dst = nla_data(info->attrs[BATADV_ATTR_ORIG_ADDRESS]); + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + batadv_tp_stop(bat_priv, dst, BATADV_TP_REASON_CANCEL); + +out: + if (soft_iface) + dev_put(soft_iface); + + return ret; +} + static struct genl_ops batadv_netlink_ops[] = { { .cmd = BATADV_CMD_GET_MESH_INFO, @@ -170,6 +387,18 @@ static struct genl_ops batadv_netlink_ops[] = { .policy = batadv_netlink_policy, .doit = batadv_netlink_get_mesh_info, }, + { + .cmd = BATADV_CMD_TP_METER, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .doit = batadv_netlink_tp_meter_start, + }, + { + .cmd = BATADV_CMD_TP_METER_CANCEL, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .doit = batadv_netlink_tp_meter_cancel, + }, }; /** @@ -179,8 +408,9 @@ void __init batadv_netlink_register(void) { int ret; - ret = genl_register_family_with_ops(&batadv_netlink_family, - batadv_netlink_ops); + ret = genl_register_family_with_ops_groups(&batadv_netlink_family, + batadv_netlink_ops, + batadv_netlink_mcgrps); if (ret) pr_warn("unable to register netlink family"); } diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index 39044ccff662..945653ab58c6 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -20,7 +20,13 @@ #include "main.h" +#include + void batadv_netlink_register(void); void batadv_netlink_unregister(void); +int batadv_netlink_tpmeter_notify(struct batadv_priv *bat_priv, const u8 *dst, + u8 result, u32 test_time, u64 total_bytes, + u32 cookie); + #endif /* _NET_BATMAN_ADV_NETLINK_H_ */ diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h index 71567794df17..6b011ff64dd8 100644 --- a/net/batman-adv/packet.h +++ b/net/batman-adv/packet.h @@ -21,6 +21,8 @@ #include #include +#define batadv_tp_is_error(n) ((u8)n > 127 ? 1 : 0) + /** * enum batadv_packettype - types for batman-adv encapsulated packets * @BATADV_IV_OGM: originator messages for B.A.T.M.A.N. IV @@ -93,6 +95,7 @@ enum batadv_icmp_packettype { BATADV_ECHO_REQUEST = 8, BATADV_TTL_EXCEEDED = 11, BATADV_PARAMETER_PROBLEM = 12, + BATADV_TP = 15, }; /** @@ -284,6 +287,16 @@ struct batadv_elp_packet { #define BATADV_ELP_HLEN sizeof(struct batadv_elp_packet) +/** + * enum batadv_icmp_user_cmd_type - types for batman-adv icmp cmd modes + * @BATADV_TP_START: start a throughput meter run + * @BATADV_TP_STOP: stop a throughput meter run + */ +enum batadv_icmp_user_cmd_type { + BATADV_TP_START = 0, + BATADV_TP_STOP = 2, +}; + /** * struct batadv_icmp_header - common members among all the ICMP packets * @packet_type: batman-adv packet type, part of the general header @@ -334,6 +347,47 @@ struct batadv_icmp_packet { __be16 seqno; }; +/** + * struct batadv_icmp_tp_packet - ICMP TP Meter packet + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the genereal header + * @ttl: time to live for this packet, part of the genereal header + * @msg_type: ICMP packet type + * @dst: address of the destination node + * @orig: address of the source node + * @uid: local ICMP socket identifier + * @subtype: TP packet subtype (see batadv_icmp_tp_subtype) + * @session: TP session identifier + * @seqno: the TP sequence number + * @timestamp: time when the packet has been sent. This value is filled in a + * TP_MSG and echoed back in the next TP_ACK so that the sender can compute the + * RTT. Since it is read only by the host which wrote it, there is no need to + * store it using network order + */ +struct batadv_icmp_tp_packet { + u8 packet_type; + u8 version; + u8 ttl; + u8 msg_type; /* see ICMP message types above */ + u8 dst[ETH_ALEN]; + u8 orig[ETH_ALEN]; + u8 uid; + u8 subtype; + u8 session[2]; + __be32 seqno; + __be32 timestamp; +}; + +/** + * enum batadv_icmp_tp_subtype - ICMP TP Meter packet subtypes + * @BATADV_TP_MSG: Msg from sender to receiver + * @BATADV_TP_ACK: acknowledgment from receiver to sender + */ +enum batadv_icmp_tp_subtype { + BATADV_TP_MSG = 0, + BATADV_TP_ACK, +}; + #define BATADV_RR_LEN 16 /** diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 76de583fe866..7b5de402ee0d 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -46,6 +46,7 @@ #include "packet.h" #include "send.h" #include "soft-interface.h" +#include "tp_meter.h" #include "translation-table.h" #include "tvlv.h" @@ -276,6 +277,13 @@ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv, ret = NET_RX_SUCCESS; break; + case BATADV_TP: + if (!pskb_may_pull(skb, sizeof(struct batadv_icmp_tp_packet))) + goto out; + + batadv_tp_meter_recv(bat_priv, skb); + ret = NET_RX_SUCCESS; + goto out; default: /* drop unknown type */ goto out; diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index f75631e21e48..18b6d07c3233 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -842,6 +842,8 @@ static int batadv_softif_init_late(struct net_device *dev) #ifdef CONFIG_BATMAN_ADV_BLA atomic_set(&bat_priv->bla.num_requests, 0); #endif + atomic_set(&bat_priv->tp_num, 0); + bat_priv->tt.last_changeset = NULL; bat_priv->tt.last_changeset_len = 0; bat_priv->isolation_mark = 0; diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c new file mode 100644 index 000000000000..2333777f919d --- /dev/null +++ b/net/batman-adv/tp_meter.c @@ -0,0 +1,1507 @@ +/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors: + * + * Edo Monticelli, Antonio Quartulli + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include "tp_meter.h" +#include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hard-interface.h" +#include "log.h" +#include "netlink.h" +#include "originator.h" +#include "packet.h" +#include "send.h" + +/** + * BATADV_TP_DEF_TEST_LENGTH - Default test length if not specified by the user + * in milliseconds + */ +#define BATADV_TP_DEF_TEST_LENGTH 10000 + +/** + * BATADV_TP_AWND - Advertised window by the receiver (in bytes) + */ +#define BATADV_TP_AWND 0x20000000 + +/** + * BATADV_TP_RECV_TIMEOUT - Receiver activity timeout. If the receiver does not + * get anything for such amount of milliseconds, the connection is killed + */ +#define BATADV_TP_RECV_TIMEOUT 1000 + +/** + * BATADV_TP_MAX_RTO - Maximum sender timeout. If the sender RTO gets beyond + * such amound of milliseconds, the receiver is considered unreachable and the + * connection is killed + */ +#define BATADV_TP_MAX_RTO 30000 + +/** + * BATADV_TP_FIRST_SEQ - First seqno of each session. The number is rather high + * in order to immediately trigger a wrap around (test purposes) + */ +#define BATADV_TP_FIRST_SEQ ((u32)-1 - 2000) + +/** + * BATADV_TP_PLEN - length of the payload (data after the batadv_unicast header) + * to simulate + */ +#define BATADV_TP_PLEN (BATADV_TP_PACKET_LEN - ETH_HLEN - \ + sizeof(struct batadv_unicast_packet)) + +static u8 batadv_tp_prerandom[4096] __read_mostly; + +/** + * batadv_tp_session_cookie - generate session cookie based on session ids + * @session: TP session identifier + * @icmp_uid: icmp pseudo uid of the tp session + * + * Return: 32 bit tp_meter session cookie + */ +static u32 batadv_tp_session_cookie(const u8 session[2], u8 icmp_uid) +{ + u32 cookie; + + cookie = icmp_uid << 16; + cookie |= session[0] << 8; + cookie |= session[1]; + + return cookie; +} + +/** + * batadv_tp_cwnd - compute the new cwnd size + * @base: base cwnd size value + * @increment: the value to add to base to get the new size + * @min: minumim cwnd value (usually MSS) + * + * Return the new cwnd size and ensures it does not exceed the Advertised + * Receiver Window size. It is wrap around safe. + * For details refer to Section 3.1 of RFC5681 + * + * Return: new congestion window size in bytes + */ +static u32 batadv_tp_cwnd(u32 base, u32 increment, u32 min) +{ + u32 new_size = base + increment; + + /* check for wrap-around */ + if (new_size < base) + new_size = (u32)ULONG_MAX; + + new_size = min_t(u32, new_size, BATADV_TP_AWND); + + return max_t(u32, new_size, min); +} + +/** + * batadv_tp_updated_cwnd - update the Congestion Windows + * @tp_vars: the private data of the current TP meter session + * @mss: maximum segment size of transmission + * + * 1) if the session is in Slow Start, the CWND has to be increased by 1 + * MSS every unique received ACK + * 2) if the session is in Congestion Avoidance, the CWND has to be + * increased by MSS * MSS / CWND for every unique received ACK + */ +static void batadv_tp_update_cwnd(struct batadv_tp_vars *tp_vars, u32 mss) +{ + spin_lock_bh(&tp_vars->cwnd_lock); + + /* slow start... */ + if (tp_vars->cwnd <= tp_vars->ss_threshold) { + tp_vars->dec_cwnd = 0; + tp_vars->cwnd = batadv_tp_cwnd(tp_vars->cwnd, mss, mss); + spin_unlock_bh(&tp_vars->cwnd_lock); + return; + } + + /* increment CWND at least of 1 (section 3.1 of RFC5681) */ + tp_vars->dec_cwnd += max_t(u32, 1U << 3, + ((mss * mss) << 6) / (tp_vars->cwnd << 3)); + if (tp_vars->dec_cwnd < (mss << 3)) { + spin_unlock_bh(&tp_vars->cwnd_lock); + return; + } + + tp_vars->cwnd = batadv_tp_cwnd(tp_vars->cwnd, mss, mss); + tp_vars->dec_cwnd = 0; + + spin_unlock_bh(&tp_vars->cwnd_lock); +} + +/** + * batadv_tp_update_rto - calculate new retransmission timeout + * @tp_vars: the private data of the current TP meter session + * @new_rtt: new roundtrip time in msec + */ +static void batadv_tp_update_rto(struct batadv_tp_vars *tp_vars, + u32 new_rtt) +{ + long m = new_rtt; + + /* RTT update + * Details in Section 2.2 and 2.3 of RFC6298 + * + * It's tricky to understand. Don't lose hair please. + * Inspired by tcp_rtt_estimator() tcp_input.c + */ + if (tp_vars->srtt != 0) { + m -= (tp_vars->srtt >> 3); /* m is now error in rtt est */ + tp_vars->srtt += m; /* rtt = 7/8 srtt + 1/8 new */ + if (m < 0) + m = -m; + + m -= (tp_vars->rttvar >> 2); + tp_vars->rttvar += m; /* mdev ~= 3/4 rttvar + 1/4 new */ + } else { + /* first measure getting in */ + tp_vars->srtt = m << 3; /* take the measured time to be srtt */ + tp_vars->rttvar = m << 1; /* new_rtt / 2 */ + } + + /* rto = srtt + 4 * rttvar. + * rttvar is scaled by 4, therefore doesn't need to be multiplied + */ + tp_vars->rto = (tp_vars->srtt >> 3) + tp_vars->rttvar; +} + +/** + * batadv_tp_batctl_notify - send client status result to client + * @reason: reason for tp meter session stop + * @dst: destination of tp_meter session + * @bat_priv: the bat priv with all the soft interface information + * @start_time: start of transmission in jiffies + * @total_sent: bytes acked to the receiver + * @cookie: cookie of tp_meter session + */ +static void batadv_tp_batctl_notify(enum batadv_tp_meter_reason reason, + const u8 *dst, struct batadv_priv *bat_priv, + unsigned long start_time, u64 total_sent, + u32 cookie) +{ + u32 test_time; + u8 result; + u32 total_bytes; + + if (!batadv_tp_is_error(reason)) { + result = BATADV_TP_REASON_COMPLETE; + test_time = jiffies_to_msecs(jiffies - start_time); + total_bytes = total_sent; + } else { + result = reason; + test_time = 0; + total_bytes = 0; + } + + batadv_netlink_tpmeter_notify(bat_priv, dst, result, test_time, + total_bytes, cookie); +} + +/** + * batadv_tp_batctl_error_notify - send client error result to client + * @reason: reason for tp meter session stop + * @dst: destination of tp_meter session + * @bat_priv: the bat priv with all the soft interface information + * @cookie: cookie of tp_meter session + */ +static void batadv_tp_batctl_error_notify(enum batadv_tp_meter_reason reason, + const u8 *dst, + struct batadv_priv *bat_priv, + u32 cookie) +{ + batadv_tp_batctl_notify(reason, dst, bat_priv, 0, 0, cookie); +} + +/** + * batadv_tp_list_find - find a tp_vars object in the global list + * @bat_priv: the bat priv with all the soft interface information + * @dst: the other endpoint MAC address to look for + * + * Look for a tp_vars object matching dst as end_point and return it after + * having incremented the refcounter. Return NULL is not found + * + * Return: matching tp_vars or NULL when no tp_vars with @dst was found + */ +static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv, + const u8 *dst) +{ + struct batadv_tp_vars *pos, *tp_vars = NULL; + + rcu_read_lock(); + hlist_for_each_entry_rcu(pos, &bat_priv->tp_list, list) { + if (!batadv_compare_eth(pos->other_end, dst)) + continue; + + /* most of the time this function is invoked during the normal + * process..it makes sens to pay more when the session is + * finished and to speed the process up during the measurement + */ + if (unlikely(!kref_get_unless_zero(&pos->refcount))) + continue; + + tp_vars = pos; + break; + } + rcu_read_unlock(); + + return tp_vars; +} + +/** + * batadv_tp_list_find_session - find tp_vars session object in the global list + * @bat_priv: the bat priv with all the soft interface information + * @dst: the other endpoint MAC address to look for + * @session: session identifier + * + * Look for a tp_vars object matching dst as end_point, session as tp meter + * session and return it after having incremented the refcounter. Return NULL + * is not found + * + * Return: matching tp_vars or NULL when no tp_vars was found + */ +static struct batadv_tp_vars * +batadv_tp_list_find_session(struct batadv_priv *bat_priv, const u8 *dst, + const u8 *session) +{ + struct batadv_tp_vars *pos, *tp_vars = NULL; + + rcu_read_lock(); + hlist_for_each_entry_rcu(pos, &bat_priv->tp_list, list) { + if (!batadv_compare_eth(pos->other_end, dst)) + continue; + + if (memcmp(pos->session, session, sizeof(pos->session)) != 0) + continue; + + /* most of the time this function is invoked during the normal + * process..it makes sense to pay more when the session is + * finished and to speed the process up during the measurement + */ + if (unlikely(!kref_get_unless_zero(&pos->refcount))) + continue; + + tp_vars = pos; + break; + } + rcu_read_unlock(); + + return tp_vars; +} + +/** + * batadv_tp_vars_release - release batadv_tp_vars from lists and queue for + * free after rcu grace period + * @ref: kref pointer of the batadv_tp_vars + */ +static void batadv_tp_vars_release(struct kref *ref) +{ + struct batadv_tp_vars *tp_vars; + struct batadv_tp_unacked *un, *safe; + + tp_vars = container_of(ref, struct batadv_tp_vars, refcount); + + /* lock should not be needed because this object is now out of any + * context! + */ + spin_lock_bh(&tp_vars->unacked_lock); + list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) { + list_del(&un->list); + kfree(un); + } + spin_unlock_bh(&tp_vars->unacked_lock); + + kfree_rcu(tp_vars, rcu); +} + +/** + * batadv_tp_vars_put - decrement the batadv_tp_vars refcounter and possibly + * release it + * @tp_vars: the private data of the current TP meter session to be free'd + */ +static void batadv_tp_vars_put(struct batadv_tp_vars *tp_vars) +{ + kref_put(&tp_vars->refcount, batadv_tp_vars_release); +} + +/** + * batadv_tp_sender_cleanup - cleanup sender data and drop and timer + * @bat_priv: the bat priv with all the soft interface information + * @tp_vars: the private data of the current TP meter session to cleanup + */ +static void batadv_tp_sender_cleanup(struct batadv_priv *bat_priv, + struct batadv_tp_vars *tp_vars) +{ + cancel_delayed_work(&tp_vars->finish_work); + + spin_lock_bh(&tp_vars->bat_priv->tp_list_lock); + hlist_del_rcu(&tp_vars->list); + spin_unlock_bh(&tp_vars->bat_priv->tp_list_lock); + + /* drop list reference */ + batadv_tp_vars_put(tp_vars); + + atomic_dec(&tp_vars->bat_priv->tp_num); + + /* kill the timer and remove its reference */ + del_timer_sync(&tp_vars->timer); + /* the worker might have rearmed itself therefore we kill it again. Note + * that if the worker should run again before invoking the following + * del_timer(), it would not re-arm itself once again because the status + * is OFF now + */ + del_timer(&tp_vars->timer); + batadv_tp_vars_put(tp_vars); +} + +/** + * batadv_tp_sender_end - print info about ended session and inform client + * @bat_priv: the bat priv with all the soft interface information + * @tp_vars: the private data of the current TP meter session + */ +static void batadv_tp_sender_end(struct batadv_priv *bat_priv, + struct batadv_tp_vars *tp_vars) +{ + u32 session_cookie; + + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Test towards %pM finished..shutting down (reason=%d)\n", + tp_vars->other_end, tp_vars->reason); + + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Last timing stats: SRTT=%ums RTTVAR=%ums RTO=%ums\n", + tp_vars->srtt >> 3, tp_vars->rttvar >> 2, tp_vars->rto); + + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Final values: cwnd=%u ss_threshold=%u\n", + tp_vars->cwnd, tp_vars->ss_threshold); + + session_cookie = batadv_tp_session_cookie(tp_vars->session, + tp_vars->icmp_uid); + + batadv_tp_batctl_notify(tp_vars->reason, + tp_vars->other_end, + bat_priv, + tp_vars->start_time, + atomic64_read(&tp_vars->tot_sent), + session_cookie); +} + +/** + * batadv_tp_sender_shutdown - let sender thread/timer stop gracefully + * @tp_vars: the private data of the current TP meter session + * @reason: reason for tp meter session stop + */ +static void batadv_tp_sender_shutdown(struct batadv_tp_vars *tp_vars, + enum batadv_tp_meter_reason reason) +{ + if (!atomic_dec_and_test(&tp_vars->sending)) + return; + + tp_vars->reason = reason; +} + +/** + * batadv_tp_sender_finish - stop sender session after test_length was reached + * @work: delayed work reference of the related tp_vars + */ +static void batadv_tp_sender_finish(struct work_struct *work) +{ + struct delayed_work *delayed_work; + struct batadv_tp_vars *tp_vars; + + delayed_work = to_delayed_work(work); + tp_vars = container_of(delayed_work, struct batadv_tp_vars, + finish_work); + + batadv_tp_sender_shutdown(tp_vars, BATADV_TP_REASON_COMPLETE); +} + +/** + * batadv_tp_reset_sender_timer - reschedule the sender timer + * @tp_vars: the private TP meter data for this session + * + * Reschedule the timer using tp_vars->rto as delay + */ +static void batadv_tp_reset_sender_timer(struct batadv_tp_vars *tp_vars) +{ + /* most of the time this function is invoked while normal packet + * reception... + */ + if (unlikely(atomic_read(&tp_vars->sending) == 0)) + /* timer ref will be dropped in batadv_tp_sender_cleanup */ + return; + + mod_timer(&tp_vars->timer, jiffies + msecs_to_jiffies(tp_vars->rto)); +} + +/** + * batadv_tp_sender_timeout - timer that fires in case of packet loss + * @arg: address of the related tp_vars + * + * If fired it means that there was packet loss. + * Switch to Slow Start, set the ss_threshold to half of the current cwnd and + * reset the cwnd to 3*MSS + */ +static void batadv_tp_sender_timeout(unsigned long arg) +{ + struct batadv_tp_vars *tp_vars = (struct batadv_tp_vars *)arg; + struct batadv_priv *bat_priv = tp_vars->bat_priv; + + if (atomic_read(&tp_vars->sending) == 0) + return; + + /* if the user waited long enough...shutdown the test */ + if (unlikely(tp_vars->rto >= BATADV_TP_MAX_RTO)) { + batadv_tp_sender_shutdown(tp_vars, + BATADV_TP_REASON_DST_UNREACHABLE); + return; + } + + /* RTO exponential backoff + * Details in Section 5.5 of RFC6298 + */ + tp_vars->rto <<= 1; + + spin_lock_bh(&tp_vars->cwnd_lock); + + tp_vars->ss_threshold = tp_vars->cwnd >> 1; + if (tp_vars->ss_threshold < BATADV_TP_PLEN * 2) + tp_vars->ss_threshold = BATADV_TP_PLEN * 2; + + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Meter: RTO fired during test towards %pM! cwnd=%u new ss_thr=%u, resetting last_sent to %u\n", + tp_vars->other_end, tp_vars->cwnd, tp_vars->ss_threshold, + atomic_read(&tp_vars->last_acked)); + + tp_vars->cwnd = BATADV_TP_PLEN * 3; + + spin_unlock_bh(&tp_vars->cwnd_lock); + + /* resend the non-ACKed packets.. */ + tp_vars->last_sent = atomic_read(&tp_vars->last_acked); + wake_up(&tp_vars->more_bytes); + + batadv_tp_reset_sender_timer(tp_vars); +} + +/** + * batadv_tp_fill_prerandom - Fill buffer with prefetched random bytes + * @tp_vars: the private TP meter data for this session + * @buf: Buffer to fill with bytes + * @nbytes: amount of pseudorandom bytes + */ +static void batadv_tp_fill_prerandom(struct batadv_tp_vars *tp_vars, + u8 *buf, size_t nbytes) +{ + u32 local_offset; + size_t bytes_inbuf; + size_t to_copy; + size_t pos = 0; + + spin_lock_bh(&tp_vars->prerandom_lock); + local_offset = tp_vars->prerandom_offset; + tp_vars->prerandom_offset += nbytes; + tp_vars->prerandom_offset %= sizeof(batadv_tp_prerandom); + spin_unlock_bh(&tp_vars->prerandom_lock); + + while (nbytes) { + local_offset %= sizeof(batadv_tp_prerandom); + bytes_inbuf = sizeof(batadv_tp_prerandom) - local_offset; + to_copy = min(nbytes, bytes_inbuf); + + memcpy(&buf[pos], &batadv_tp_prerandom[local_offset], to_copy); + pos += to_copy; + nbytes -= to_copy; + local_offset = 0; + } +} + +/** + * batadv_tp_send_msg - send a single message + * @tp_vars: the private TP meter data for this session + * @src: source mac address + * @orig_node: the originator of the destination + * @seqno: sequence number of this packet + * @len: length of the entire packet + * @session: session identifier + * @uid: local ICMP "socket" index + * @timestamp: timestamp in jiffies which is replied in ack + * + * Create and send a single TP Meter message. + * + * Return: 0 on success, BATADV_TP_REASON_DST_UNREACHABLE if the destination is + * not reachable, BATADV_TP_REASON_MEMORY_ERROR if the packet couldn't be + * allocated + */ +static int batadv_tp_send_msg(struct batadv_tp_vars *tp_vars, const u8 *src, + struct batadv_orig_node *orig_node, + u32 seqno, size_t len, const u8 *session, + int uid, u32 timestamp) +{ + struct batadv_icmp_tp_packet *icmp; + struct sk_buff *skb; + int r; + u8 *data; + size_t data_len; + + skb = netdev_alloc_skb_ip_align(NULL, len + ETH_HLEN); + if (unlikely(!skb)) + return BATADV_TP_REASON_MEMORY_ERROR; + + skb_reserve(skb, ETH_HLEN); + icmp = (struct batadv_icmp_tp_packet *)skb_put(skb, sizeof(*icmp)); + + /* fill the icmp header */ + ether_addr_copy(icmp->dst, orig_node->orig); + ether_addr_copy(icmp->orig, src); + icmp->version = BATADV_COMPAT_VERSION; + icmp->packet_type = BATADV_ICMP; + icmp->ttl = BATADV_TTL; + icmp->msg_type = BATADV_TP; + icmp->uid = uid; + + icmp->subtype = BATADV_TP_MSG; + memcpy(icmp->session, session, sizeof(icmp->session)); + icmp->seqno = htonl(seqno); + icmp->timestamp = htonl(timestamp); + + data_len = len - sizeof(*icmp); + data = (u8 *)skb_put(skb, data_len); + batadv_tp_fill_prerandom(tp_vars, data, data_len); + + r = batadv_send_skb_to_orig(skb, orig_node, NULL); + if (r == -1) + kfree_skb(skb); + + if (r == NET_XMIT_SUCCESS) + return 0; + + return BATADV_TP_REASON_CANT_SEND; +} + +/** + * batadv_tp_recv_ack - ACK receiving function + * @bat_priv: the bat priv with all the soft interface information + * @skb: the buffer containing the received packet + * + * Process a received TP ACK packet + */ +static void batadv_tp_recv_ack(struct batadv_priv *bat_priv, + const struct sk_buff *skb) +{ + struct batadv_hard_iface *primary_if = NULL; + struct batadv_orig_node *orig_node = NULL; + const struct batadv_icmp_tp_packet *icmp; + struct batadv_tp_vars *tp_vars; + size_t packet_len, mss; + u32 rtt, recv_ack, cwnd; + unsigned char *dev_addr; + + packet_len = BATADV_TP_PLEN; + mss = BATADV_TP_PLEN; + packet_len += sizeof(struct batadv_unicast_packet); + + icmp = (struct batadv_icmp_tp_packet *)skb->data; + + /* find the tp_vars */ + tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig, + icmp->session); + if (unlikely(!tp_vars)) + return; + + if (unlikely(atomic_read(&tp_vars->sending) == 0)) + goto out; + + /* old ACK? silently drop it.. */ + if (batadv_seq_before(ntohl(icmp->seqno), + (u32)atomic_read(&tp_vars->last_acked))) + goto out; + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (unlikely(!primary_if)) + goto out; + + orig_node = batadv_orig_hash_find(bat_priv, icmp->orig); + if (unlikely(!orig_node)) + goto out; + + /* update RTO with the new sampled RTT, if any */ + rtt = jiffies_to_msecs(jiffies) - ntohl(icmp->timestamp); + if (icmp->timestamp && rtt) + batadv_tp_update_rto(tp_vars, rtt); + + /* ACK for new data... reset the timer */ + batadv_tp_reset_sender_timer(tp_vars); + + recv_ack = ntohl(icmp->seqno); + + /* check if this ACK is a duplicate */ + if (atomic_read(&tp_vars->last_acked) == recv_ack) { + atomic_inc(&tp_vars->dup_acks); + if (atomic_read(&tp_vars->dup_acks) != 3) + goto out; + + if (recv_ack >= tp_vars->recover) + goto out; + + /* if this is the third duplicate ACK do Fast Retransmit */ + batadv_tp_send_msg(tp_vars, primary_if->net_dev->dev_addr, + orig_node, recv_ack, packet_len, + icmp->session, icmp->uid, + jiffies_to_msecs(jiffies)); + + spin_lock_bh(&tp_vars->cwnd_lock); + + /* Fast Recovery */ + tp_vars->fast_recovery = true; + /* Set recover to the last outstanding seqno when Fast Recovery + * is entered. RFC6582, Section 3.2, step 1 + */ + tp_vars->recover = tp_vars->last_sent; + tp_vars->ss_threshold = tp_vars->cwnd >> 1; + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Meter: Fast Recovery, (cur cwnd=%u) ss_thr=%u last_sent=%u recv_ack=%u\n", + tp_vars->cwnd, tp_vars->ss_threshold, + tp_vars->last_sent, recv_ack); + tp_vars->cwnd = batadv_tp_cwnd(tp_vars->ss_threshold, 3 * mss, + mss); + tp_vars->dec_cwnd = 0; + tp_vars->last_sent = recv_ack; + + spin_unlock_bh(&tp_vars->cwnd_lock); + } else { + /* count the acked data */ + atomic64_add(recv_ack - atomic_read(&tp_vars->last_acked), + &tp_vars->tot_sent); + /* reset the duplicate ACKs counter */ + atomic_set(&tp_vars->dup_acks, 0); + + if (tp_vars->fast_recovery) { + /* partial ACK */ + if (batadv_seq_before(recv_ack, tp_vars->recover)) { + /* this is another hole in the window. React + * immediately as specified by NewReno (see + * Section 3.2 of RFC6582 for details) + */ + dev_addr = primary_if->net_dev->dev_addr; + batadv_tp_send_msg(tp_vars, dev_addr, + orig_node, recv_ack, + packet_len, icmp->session, + icmp->uid, + jiffies_to_msecs(jiffies)); + tp_vars->cwnd = batadv_tp_cwnd(tp_vars->cwnd, + mss, mss); + } else { + tp_vars->fast_recovery = false; + /* set cwnd to the value of ss_threshold at the + * moment that Fast Recovery was entered. + * RFC6582, Section 3.2, step 3 + */ + cwnd = batadv_tp_cwnd(tp_vars->ss_threshold, 0, + mss); + tp_vars->cwnd = cwnd; + } + goto move_twnd; + } + + if (recv_ack - atomic_read(&tp_vars->last_acked) >= mss) + batadv_tp_update_cwnd(tp_vars, mss); +move_twnd: + /* move the Transmit Window */ + atomic_set(&tp_vars->last_acked, recv_ack); + } + + wake_up(&tp_vars->more_bytes); +out: + if (likely(primary_if)) + batadv_hardif_put(primary_if); + if (likely(orig_node)) + batadv_orig_node_put(orig_node); + if (likely(tp_vars)) + batadv_tp_vars_put(tp_vars); +} + +/** + * batadv_tp_avail - check if congestion window is not full + * @tp_vars: the private data of the current TP meter session + * @payload_len: size of the payload of a single message + * + * Return: true when congestion window is not full, false otherwise + */ +static bool batadv_tp_avail(struct batadv_tp_vars *tp_vars, + size_t payload_len) +{ + u32 win_left, win_limit; + + win_limit = atomic_read(&tp_vars->last_acked) + tp_vars->cwnd; + win_left = win_limit - tp_vars->last_sent; + + return win_left >= payload_len; +} + +/** + * batadv_tp_wait_available - wait until congestion window becomes free or + * timeout is reached + * @tp_vars: the private data of the current TP meter session + * @plen: size of the payload of a single message + * + * Return: 0 if the condition evaluated to false after the timeout elapsed, + * 1 if the condition evaluated to true after the timeout elapsed, the + * remaining jiffies (at least 1) if the condition evaluated to true before + * the timeout elapsed, or -ERESTARTSYS if it was interrupted by a signal. + */ +static int batadv_tp_wait_available(struct batadv_tp_vars *tp_vars, size_t plen) +{ + int ret; + + ret = wait_event_interruptible_timeout(tp_vars->more_bytes, + batadv_tp_avail(tp_vars, plen), + HZ / 10); + + return ret; +} + +/** + * batadv_tp_send - main sending thread of a tp meter session + * @arg: address of the related tp_vars + * + * Return: nothing, this function never returns + */ +static int batadv_tp_send(void *arg) +{ + struct batadv_tp_vars *tp_vars = arg; + struct batadv_priv *bat_priv = tp_vars->bat_priv; + struct batadv_hard_iface *primary_if = NULL; + struct batadv_orig_node *orig_node = NULL; + size_t payload_len, packet_len; + int err = 0; + + if (unlikely(tp_vars->role != BATADV_TP_SENDER)) { + err = BATADV_TP_REASON_DST_UNREACHABLE; + tp_vars->reason = err; + goto out; + } + + orig_node = batadv_orig_hash_find(bat_priv, tp_vars->other_end); + if (unlikely(!orig_node)) { + err = BATADV_TP_REASON_DST_UNREACHABLE; + tp_vars->reason = err; + goto out; + } + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (unlikely(!primary_if)) { + err = BATADV_TP_REASON_DST_UNREACHABLE; + goto out; + } + + /* assume that all the hard_interfaces have a correctly + * configured MTU, so use the soft_iface MTU as MSS. + * This might not be true and in that case the fragmentation + * should be used. + * Now, try to send the packet as it is + */ + payload_len = BATADV_TP_PLEN; + BUILD_BUG_ON(sizeof(struct batadv_icmp_tp_packet) > BATADV_TP_PLEN); + + batadv_tp_reset_sender_timer(tp_vars); + + /* queue the worker in charge of terminating the test */ + queue_delayed_work(batadv_event_workqueue, &tp_vars->finish_work, + msecs_to_jiffies(tp_vars->test_length)); + + while (atomic_read(&tp_vars->sending) != 0) { + if (unlikely(!batadv_tp_avail(tp_vars, payload_len))) { + batadv_tp_wait_available(tp_vars, payload_len); + continue; + } + + /* to emulate normal unicast traffic, add to the payload len + * the size of the unicast header + */ + packet_len = payload_len + sizeof(struct batadv_unicast_packet); + + err = batadv_tp_send_msg(tp_vars, primary_if->net_dev->dev_addr, + orig_node, tp_vars->last_sent, + packet_len, + tp_vars->session, tp_vars->icmp_uid, + jiffies_to_msecs(jiffies)); + + /* something went wrong during the preparation/transmission */ + if (unlikely(err && err != BATADV_TP_REASON_CANT_SEND)) { + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Meter: batadv_tp_send() cannot send packets (%d)\n", + err); + /* ensure nobody else tries to stop the thread now */ + if (atomic_dec_and_test(&tp_vars->sending)) + tp_vars->reason = err; + break; + } + + /* right-shift the TWND */ + if (!err) + tp_vars->last_sent += payload_len; + + cond_resched(); + } + +out: + if (likely(primary_if)) + batadv_hardif_put(primary_if); + if (likely(orig_node)) + batadv_orig_node_put(orig_node); + + batadv_tp_sender_end(bat_priv, tp_vars); + batadv_tp_sender_cleanup(bat_priv, tp_vars); + + batadv_tp_vars_put(tp_vars); + + do_exit(0); +} + +/** + * batadv_tp_start_kthread - start new thread which manages the tp meter sender + * @tp_vars: the private data of the current TP meter session + */ +static void batadv_tp_start_kthread(struct batadv_tp_vars *tp_vars) +{ + struct task_struct *kthread; + struct batadv_priv *bat_priv = tp_vars->bat_priv; + u32 session_cookie; + + kref_get(&tp_vars->refcount); + kthread = kthread_create(batadv_tp_send, tp_vars, "kbatadv_tp_meter"); + if (IS_ERR(kthread)) { + session_cookie = batadv_tp_session_cookie(tp_vars->session, + tp_vars->icmp_uid); + pr_err("batadv: cannot create tp meter kthread\n"); + batadv_tp_batctl_error_notify(BATADV_TP_REASON_MEMORY_ERROR, + tp_vars->other_end, + bat_priv, session_cookie); + + /* drop reserved reference for kthread */ + batadv_tp_vars_put(tp_vars); + + /* cleanup of failed tp meter variables */ + batadv_tp_sender_cleanup(bat_priv, tp_vars); + return; + } + + wake_up_process(kthread); +} + +/** + * batadv_tp_start - start a new tp meter session + * @bat_priv: the bat priv with all the soft interface information + * @dst: the receiver MAC address + * @test_length: test length in milliseconds + * @cookie: session cookie + */ +void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, + u32 test_length, u32 *cookie) +{ + struct batadv_tp_vars *tp_vars; + u8 session_id[2]; + u8 icmp_uid; + u32 session_cookie; + + get_random_bytes(session_id, sizeof(session_id)); + get_random_bytes(&icmp_uid, 1); + session_cookie = batadv_tp_session_cookie(session_id, icmp_uid); + *cookie = session_cookie; + + /* look for an already existing test towards this node */ + spin_lock_bh(&bat_priv->tp_list_lock); + tp_vars = batadv_tp_list_find(bat_priv, dst); + if (tp_vars) { + spin_unlock_bh(&bat_priv->tp_list_lock); + batadv_tp_vars_put(tp_vars); + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Meter: test to or from the same node already ongoing, aborting\n"); + batadv_tp_batctl_error_notify(BATADV_TP_REASON_ALREADY_ONGOING, + dst, bat_priv, session_cookie); + return; + } + + if (!atomic_add_unless(&bat_priv->tp_num, 1, BATADV_TP_MAX_NUM)) { + spin_unlock_bh(&bat_priv->tp_list_lock); + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Meter: too many ongoing sessions, aborting (SEND)\n"); + batadv_tp_batctl_error_notify(BATADV_TP_REASON_TOO_MANY, dst, + bat_priv, session_cookie); + return; + } + + tp_vars = kmalloc(sizeof(*tp_vars), GFP_ATOMIC); + if (!tp_vars) { + spin_unlock_bh(&bat_priv->tp_list_lock); + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Meter: batadv_tp_start cannot allocate list elements\n"); + batadv_tp_batctl_error_notify(BATADV_TP_REASON_MEMORY_ERROR, + dst, bat_priv, session_cookie); + return; + } + + /* initialize tp_vars */ + ether_addr_copy(tp_vars->other_end, dst); + kref_init(&tp_vars->refcount); + tp_vars->role = BATADV_TP_SENDER; + atomic_set(&tp_vars->sending, 1); + memcpy(tp_vars->session, session_id, sizeof(session_id)); + tp_vars->icmp_uid = icmp_uid; + + tp_vars->last_sent = BATADV_TP_FIRST_SEQ; + atomic_set(&tp_vars->last_acked, BATADV_TP_FIRST_SEQ); + tp_vars->fast_recovery = false; + tp_vars->recover = BATADV_TP_FIRST_SEQ; + + /* initialise the CWND to 3*MSS (Section 3.1 in RFC5681). + * For batman-adv the MSS is the size of the payload received by the + * soft_interface, hence its MTU + */ + tp_vars->cwnd = BATADV_TP_PLEN * 3; + /* at the beginning initialise the SS threshold to the biggest possible + * window size, hence the AWND size + */ + tp_vars->ss_threshold = BATADV_TP_AWND; + + /* RTO initial value is 3 seconds. + * Details in Section 2.1 of RFC6298 + */ + tp_vars->rto = 1000; + tp_vars->srtt = 0; + tp_vars->rttvar = 0; + + atomic64_set(&tp_vars->tot_sent, 0); + + kref_get(&tp_vars->refcount); + setup_timer(&tp_vars->timer, batadv_tp_sender_timeout, + (unsigned long)tp_vars); + + tp_vars->bat_priv = bat_priv; + tp_vars->start_time = jiffies; + + init_waitqueue_head(&tp_vars->more_bytes); + + spin_lock_init(&tp_vars->unacked_lock); + INIT_LIST_HEAD(&tp_vars->unacked_list); + + spin_lock_init(&tp_vars->cwnd_lock); + + tp_vars->prerandom_offset = 0; + spin_lock_init(&tp_vars->prerandom_lock); + + kref_get(&tp_vars->refcount); + hlist_add_head_rcu(&tp_vars->list, &bat_priv->tp_list); + spin_unlock_bh(&bat_priv->tp_list_lock); + + tp_vars->test_length = test_length; + if (!tp_vars->test_length) + tp_vars->test_length = BATADV_TP_DEF_TEST_LENGTH; + + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Meter: starting throughput meter towards %pM (length=%ums)\n", + dst, test_length); + + /* init work item for finished tp tests */ + INIT_DELAYED_WORK(&tp_vars->finish_work, batadv_tp_sender_finish); + + /* start tp kthread. This way the write() call issued from userspace can + * happily return and avoid to block + */ + batadv_tp_start_kthread(tp_vars); + + /* don't return reference to new tp_vars */ + batadv_tp_vars_put(tp_vars); +} + +/** + * batadv_tp_stop - stop currently running tp meter session + * @bat_priv: the bat priv with all the soft interface information + * @dst: the receiver MAC address + * @return_value: reason for tp meter session stop + */ +void batadv_tp_stop(struct batadv_priv *bat_priv, const u8 *dst, + u8 return_value) +{ + struct batadv_orig_node *orig_node; + struct batadv_tp_vars *tp_vars; + + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Meter: stopping test towards %pM\n", dst); + + orig_node = batadv_orig_hash_find(bat_priv, dst); + if (!orig_node) + return; + + tp_vars = batadv_tp_list_find(bat_priv, orig_node->orig); + if (!tp_vars) { + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Meter: trying to interrupt an already over connection\n"); + goto out; + } + + batadv_tp_sender_shutdown(tp_vars, return_value); + batadv_tp_vars_put(tp_vars); +out: + batadv_orig_node_put(orig_node); +} + +/** + * batadv_tp_reset_receiver_timer - reset the receiver shutdown timer + * @tp_vars: the private data of the current TP meter session + * + * start the receiver shutdown timer or reset it if already started + */ +static void batadv_tp_reset_receiver_timer(struct batadv_tp_vars *tp_vars) +{ + mod_timer(&tp_vars->timer, + jiffies + msecs_to_jiffies(BATADV_TP_RECV_TIMEOUT)); +} + +/** + * batadv_tp_receiver_shutdown - stop a tp meter receiver when timeout is + * reached without received ack + * @arg: address of the related tp_vars + */ +static void batadv_tp_receiver_shutdown(unsigned long arg) +{ + struct batadv_tp_vars *tp_vars = (struct batadv_tp_vars *)arg; + struct batadv_tp_unacked *un, *safe; + struct batadv_priv *bat_priv; + + bat_priv = tp_vars->bat_priv; + + /* if there is recent activity rearm the timer */ + if (!batadv_has_timed_out(tp_vars->last_recv_time, + BATADV_TP_RECV_TIMEOUT)) { + /* reset the receiver shutdown timer */ + batadv_tp_reset_receiver_timer(tp_vars); + return; + } + + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Shutting down for inactivity (more than %dms) from %pM\n", + BATADV_TP_RECV_TIMEOUT, tp_vars->other_end); + + spin_lock_bh(&tp_vars->bat_priv->tp_list_lock); + hlist_del_rcu(&tp_vars->list); + spin_unlock_bh(&tp_vars->bat_priv->tp_list_lock); + + /* drop list reference */ + batadv_tp_vars_put(tp_vars); + + atomic_dec(&bat_priv->tp_num); + + spin_lock_bh(&tp_vars->unacked_lock); + list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) { + list_del(&un->list); + kfree(un); + } + spin_unlock_bh(&tp_vars->unacked_lock); + + /* drop reference of timer */ + batadv_tp_vars_put(tp_vars); +} + +/** + * batadv_tp_send_ack - send an ACK packet + * @bat_priv: the bat priv with all the soft interface information + * @dst: the mac address of the destination originator + * @seq: the sequence number to ACK + * @timestamp: the timestamp to echo back in the ACK + * @session: session identifier + * @socket_index: local ICMP socket identifier + * + * Return: 0 on success, a positive integer representing the reason of the + * failure otherwise + */ +static int batadv_tp_send_ack(struct batadv_priv *bat_priv, const u8 *dst, + u32 seq, __be32 timestamp, const u8 *session, + int socket_index) +{ + struct batadv_hard_iface *primary_if = NULL; + struct batadv_orig_node *orig_node; + struct batadv_icmp_tp_packet *icmp; + struct sk_buff *skb; + int r, ret; + + orig_node = batadv_orig_hash_find(bat_priv, dst); + if (unlikely(!orig_node)) { + ret = BATADV_TP_REASON_DST_UNREACHABLE; + goto out; + } + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (unlikely(!primary_if)) { + ret = BATADV_TP_REASON_DST_UNREACHABLE; + goto out; + } + + skb = netdev_alloc_skb_ip_align(NULL, sizeof(*icmp) + ETH_HLEN); + if (unlikely(!skb)) { + ret = BATADV_TP_REASON_MEMORY_ERROR; + goto out; + } + + skb_reserve(skb, ETH_HLEN); + icmp = (struct batadv_icmp_tp_packet *)skb_put(skb, sizeof(*icmp)); + icmp->packet_type = BATADV_ICMP; + icmp->version = BATADV_COMPAT_VERSION; + icmp->ttl = BATADV_TTL; + icmp->msg_type = BATADV_TP; + ether_addr_copy(icmp->dst, orig_node->orig); + ether_addr_copy(icmp->orig, primary_if->net_dev->dev_addr); + icmp->uid = socket_index; + + icmp->subtype = BATADV_TP_ACK; + memcpy(icmp->session, session, sizeof(icmp->session)); + icmp->seqno = htonl(seq); + icmp->timestamp = timestamp; + + /* send the ack */ + r = batadv_send_skb_to_orig(skb, orig_node, NULL); + if (r == -1) + kfree_skb(skb); + + if (unlikely(r < 0) || (r == NET_XMIT_DROP)) { + ret = BATADV_TP_REASON_DST_UNREACHABLE; + goto out; + } + ret = 0; + +out: + if (likely(orig_node)) + batadv_orig_node_put(orig_node); + if (likely(primary_if)) + batadv_hardif_put(primary_if); + + return ret; +} + +/** + * batadv_tp_handle_out_of_order - store an out of order packet + * @tp_vars: the private data of the current TP meter session + * @skb: the buffer containing the received packet + * + * Store the out of order packet in the unacked list for late processing. This + * packets are kept in this list so that they can be ACKed at once as soon as + * all the previous packets have been received + * + * Return: true if the packed has been successfully processed, false otherwise + */ +static bool batadv_tp_handle_out_of_order(struct batadv_tp_vars *tp_vars, + const struct sk_buff *skb) +{ + const struct batadv_icmp_tp_packet *icmp; + struct batadv_tp_unacked *un, *new; + u32 payload_len; + bool added = false; + + new = kmalloc(sizeof(*new), GFP_ATOMIC); + if (unlikely(!new)) + return false; + + icmp = (struct batadv_icmp_tp_packet *)skb->data; + + new->seqno = ntohl(icmp->seqno); + payload_len = skb->len - sizeof(struct batadv_unicast_packet); + new->len = payload_len; + + spin_lock_bh(&tp_vars->unacked_lock); + /* if the list is empty immediately attach this new object */ + if (list_empty(&tp_vars->unacked_list)) { + list_add(&new->list, &tp_vars->unacked_list); + goto out; + } + + /* otherwise loop over the list and either drop the packet because this + * is a duplicate or store it at the right position. + * + * The iteration is done in the reverse way because it is likely that + * the last received packet (the one being processed now) has a bigger + * seqno than all the others already stored. + */ + list_for_each_entry_reverse(un, &tp_vars->unacked_list, list) { + /* check for duplicates */ + if (new->seqno == un->seqno) { + if (new->len > un->len) + un->len = new->len; + kfree(new); + added = true; + break; + } + + /* look for the right position */ + if (batadv_seq_before(new->seqno, un->seqno)) + continue; + + /* as soon as an entry having a bigger seqno is found, the new + * one is attached _after_ it. In this way the list is kept in + * ascending order + */ + list_add_tail(&new->list, &un->list); + added = true; + break; + } + + /* received packet with smallest seqno out of order; add it to front */ + if (!added) + list_add(&new->list, &tp_vars->unacked_list); + +out: + spin_unlock_bh(&tp_vars->unacked_lock); + + return true; +} + +/** + * batadv_tp_ack_unordered - update number received bytes in current stream + * without gaps + * @tp_vars: the private data of the current TP meter session + */ +static void batadv_tp_ack_unordered(struct batadv_tp_vars *tp_vars) +{ + struct batadv_tp_unacked *un, *safe; + u32 to_ack; + + /* go through the unacked packet list and possibly ACK them as + * well + */ + spin_lock_bh(&tp_vars->unacked_lock); + list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) { + /* the list is ordered, therefore it is possible to stop as soon + * there is a gap between the last acked seqno and the seqno of + * the packet under inspection + */ + if (batadv_seq_before(tp_vars->last_recv, un->seqno)) + break; + + to_ack = un->seqno + un->len - tp_vars->last_recv; + + if (batadv_seq_before(tp_vars->last_recv, un->seqno + un->len)) + tp_vars->last_recv += to_ack; + + list_del(&un->list); + kfree(un); + } + spin_unlock_bh(&tp_vars->unacked_lock); +} + +/** + * batadv_tp_init_recv - return matching or create new receiver tp_vars + * @bat_priv: the bat priv with all the soft interface information + * @icmp: received icmp tp msg + * + * Return: corresponding tp_vars or NULL on errors + */ +static struct batadv_tp_vars * +batadv_tp_init_recv(struct batadv_priv *bat_priv, + const struct batadv_icmp_tp_packet *icmp) +{ + struct batadv_tp_vars *tp_vars; + + spin_lock_bh(&bat_priv->tp_list_lock); + tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig, + icmp->session); + if (tp_vars) + goto out_unlock; + + if (!atomic_add_unless(&bat_priv->tp_num, 1, BATADV_TP_MAX_NUM)) { + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Meter: too many ongoing sessions, aborting (RECV)\n"); + goto out_unlock; + } + + tp_vars = kmalloc(sizeof(*tp_vars), GFP_ATOMIC); + if (!tp_vars) + goto out_unlock; + + ether_addr_copy(tp_vars->other_end, icmp->orig); + tp_vars->role = BATADV_TP_RECEIVER; + memcpy(tp_vars->session, icmp->session, sizeof(tp_vars->session)); + tp_vars->last_recv = BATADV_TP_FIRST_SEQ; + tp_vars->bat_priv = bat_priv; + kref_init(&tp_vars->refcount); + + spin_lock_init(&tp_vars->unacked_lock); + INIT_LIST_HEAD(&tp_vars->unacked_list); + + kref_get(&tp_vars->refcount); + hlist_add_head_rcu(&tp_vars->list, &bat_priv->tp_list); + + kref_get(&tp_vars->refcount); + setup_timer(&tp_vars->timer, batadv_tp_receiver_shutdown, + (unsigned long)tp_vars); + + batadv_tp_reset_receiver_timer(tp_vars); + +out_unlock: + spin_unlock_bh(&bat_priv->tp_list_lock); + + return tp_vars; +} + +/** + * batadv_tp_recv_msg - process a single data message + * @bat_priv: the bat priv with all the soft interface information + * @skb: the buffer containing the received packet + * + * Process a received TP MSG packet + */ +static void batadv_tp_recv_msg(struct batadv_priv *bat_priv, + const struct sk_buff *skb) +{ + const struct batadv_icmp_tp_packet *icmp; + struct batadv_tp_vars *tp_vars; + size_t packet_size; + u32 seqno; + + icmp = (struct batadv_icmp_tp_packet *)skb->data; + + seqno = ntohl(icmp->seqno); + /* check if this is the first seqno. This means that if the + * first packet is lost, the tp meter does not work anymore! + */ + if (seqno == BATADV_TP_FIRST_SEQ) { + tp_vars = batadv_tp_init_recv(bat_priv, icmp); + if (!tp_vars) { + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Meter: seqno != BATADV_TP_FIRST_SEQ cannot initiate connection\n"); + goto out; + } + } else { + tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig, + icmp->session); + if (!tp_vars) { + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Unexpected packet from %pM!\n", + icmp->orig); + goto out; + } + } + + if (unlikely(tp_vars->role != BATADV_TP_RECEIVER)) { + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Meter: dropping packet: not expected (role=%u)\n", + tp_vars->role); + goto out; + } + + tp_vars->last_recv_time = jiffies; + + /* if the packet is a duplicate, it may be the case that an ACK has been + * lost. Resend the ACK + */ + if (batadv_seq_before(seqno, tp_vars->last_recv)) + goto send_ack; + + /* if the packet is out of order enqueue it */ + if (ntohl(icmp->seqno) != tp_vars->last_recv) { + /* exit immediately (and do not send any ACK) if the packet has + * not been enqueued correctly + */ + if (!batadv_tp_handle_out_of_order(tp_vars, skb)) + goto out; + + /* send a duplicate ACK */ + goto send_ack; + } + + /* if everything was fine count the ACKed bytes */ + packet_size = skb->len - sizeof(struct batadv_unicast_packet); + tp_vars->last_recv += packet_size; + + /* check if this ordered message filled a gap.... */ + batadv_tp_ack_unordered(tp_vars); + +send_ack: + /* send the ACK. If the received packet was out of order, the ACK that + * is going to be sent is a duplicate (the sender will count them and + * possibly enter Fast Retransmit as soon as it has reached 3) + */ + batadv_tp_send_ack(bat_priv, icmp->orig, tp_vars->last_recv, + icmp->timestamp, icmp->session, icmp->uid); +out: + if (likely(tp_vars)) + batadv_tp_vars_put(tp_vars); +} + +/** + * batadv_tp_meter_recv - main TP Meter receiving function + * @bat_priv: the bat priv with all the soft interface information + * @skb: the buffer containing the received packet + */ +void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb) +{ + struct batadv_icmp_tp_packet *icmp; + + icmp = (struct batadv_icmp_tp_packet *)skb->data; + + switch (icmp->subtype) { + case BATADV_TP_MSG: + batadv_tp_recv_msg(bat_priv, skb); + break; + case BATADV_TP_ACK: + batadv_tp_recv_ack(bat_priv, skb); + break; + default: + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, + "Received unknown TP Metric packet type %u\n", + icmp->subtype); + } + consume_skb(skb); +} + +/** + * batadv_tp_meter_init - initialize global tp_meter structures + */ +void batadv_tp_meter_init(void) +{ + get_random_bytes(batadv_tp_prerandom, sizeof(batadv_tp_prerandom)); +} diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h new file mode 100644 index 000000000000..ba922c425e56 --- /dev/null +++ b/net/batman-adv/tp_meter.h @@ -0,0 +1,34 @@ +/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors: + * + * Edo Monticelli, Antonio Quartulli + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _NET_BATMAN_ADV_TP_METER_H_ +#define _NET_BATMAN_ADV_TP_METER_H_ + +#include "main.h" + +#include + +struct sk_buff; + +void batadv_tp_meter_init(void); +void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, + u32 test_length, u32 *cookie); +void batadv_tp_stop(struct batadv_priv *bat_priv, const u8 *dst, + u8 return_value); +void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb); + +#endif /* _NET_BATMAN_ADV_TP_METER_H_ */ diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index ab863a5ab2b8..a331e3ab93d1 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -33,6 +33,7 @@ #include #include #include +#include #include "packet.h" @@ -832,6 +833,111 @@ struct batadv_priv_nc { struct batadv_hashtable *decoding_hash; }; +/** + * struct batadv_tp_unacked - unacked packet meta-information + * @seqno: seqno of the unacked packet + * @len: length of the packet + * @list: list node for batadv_tp_vars::unacked_list + * + * This struct is supposed to represent a buffer unacked packet. However, since + * the purpose of the TP meter is to count the traffic only, there is no need to + * store the entire sk_buff, the starting offset and the length are enough + */ +struct batadv_tp_unacked { + u32 seqno; + u16 len; + struct list_head list; +}; + +/** + * enum batadv_tp_meter_role - Modus in tp meter session + * @BATADV_TP_RECEIVER: Initialized as receiver + * @BATADV_TP_SENDER: Initialized as sender + */ +enum batadv_tp_meter_role { + BATADV_TP_RECEIVER, + BATADV_TP_SENDER +}; + +/** + * struct batadv_tp_vars - tp meter private variables per session + * @list: list node for bat_priv::tp_list + * @timer: timer for ack (receiver) and retry (sender) + * @bat_priv: pointer to the mesh object + * @start_time: start time in jiffies + * @other_end: mac address of remote + * @role: receiver/sender modi + * @sending: sending binary semaphore: 1 if sending, 0 is not + * @reason: reason for a stopped session + * @finish_work: work item for the finishing procedure + * @test_length: test length in milliseconds + * @session: TP session identifier + * @icmp_uid: local ICMP "socket" index + * @dec_cwnd: decimal part of the cwnd used during linear growth + * @cwnd: current size of the congestion window + * @cwnd_lock: lock do protect @cwnd & @dec_cwnd + * @ss_threshold: Slow Start threshold. Once cwnd exceeds this value the + * connection switches to the Congestion Avoidance state + * @last_acked: last acked byte + * @last_sent: last sent byte, not yet acked + * @tot_sent: amount of data sent/ACKed so far + * @dup_acks: duplicate ACKs counter + * @fast_recovery: true if in Fast Recovery mode + * @recover: last sent seqno when entering Fast Recovery + * @rto: sender timeout + * @srtt: smoothed RTT scaled by 2^3 + * @rttvar: RTT variation scaled by 2^2 + * @more_bytes: waiting queue anchor when waiting for more ack/retry timeout + * @prerandom_offset: offset inside the prerandom buffer + * @prerandom_lock: spinlock protecting access to prerandom_offset + * @last_recv: last in-order received packet + * @unacked_list: list of unacked packets (meta-info only) + * @unacked_lock: protect unacked_list + * @last_recv_time: time time (jiffies) a msg was received + * @refcount: number of context where the object is used + * @rcu: struct used for freeing in an RCU-safe manner + */ +struct batadv_tp_vars { + struct hlist_node list; + struct timer_list timer; + struct batadv_priv *bat_priv; + unsigned long start_time; + u8 other_end[ETH_ALEN]; + enum batadv_tp_meter_role role; + atomic_t sending; + enum batadv_tp_meter_reason reason; + struct delayed_work finish_work; + u32 test_length; + u8 session[2]; + u8 icmp_uid; + + /* sender variables */ + u16 dec_cwnd; + u32 cwnd; + spinlock_t cwnd_lock; /* Protects cwnd & dec_cwnd */ + u32 ss_threshold; + atomic_t last_acked; + u32 last_sent; + atomic64_t tot_sent; + atomic_t dup_acks; + bool fast_recovery; + u32 recover; + u32 rto; + u32 srtt; + u32 rttvar; + wait_queue_head_t more_bytes; + u32 prerandom_offset; + spinlock_t prerandom_lock; /* Protects prerandom_offset */ + + /* receiver variables */ + u32 last_recv; + struct list_head unacked_list; + spinlock_t unacked_lock; /* Protects unacked_list */ + unsigned long last_recv_time; + struct kref refcount; + struct rcu_head rcu; +}; + /** * struct batadv_softif_vlan - per VLAN attributes set * @bat_priv: pointer to the mesh object @@ -900,9 +1006,12 @@ struct batadv_priv_bat_v { * @debug_dir: dentry for debugfs batman-adv subdirectory * @forw_bat_list: list of aggregated OGMs that will be forwarded * @forw_bcast_list: list of broadcast packets that will be rebroadcasted + * @tp_list: list of tp sessions + * @tp_num: number of currently active tp sessions * @orig_hash: hash table containing mesh participants (orig nodes) * @forw_bat_list_lock: lock protecting forw_bat_list * @forw_bcast_list_lock: lock protecting forw_bcast_list + * @tp_list_lock: spinlock protecting @tp_list * @orig_work: work queue callback item for orig node purging * @cleanup_work: work queue callback item for soft-interface deinit * @primary_if: one of the hard-interfaces assigned to this mesh interface @@ -956,9 +1065,12 @@ struct batadv_priv { struct dentry *debug_dir; struct hlist_head forw_bat_list; struct hlist_head forw_bcast_list; + struct hlist_head tp_list; struct batadv_hashtable *orig_hash; spinlock_t forw_bat_list_lock; /* protects forw_bat_list */ spinlock_t forw_bcast_list_lock; /* protects forw_bcast_list */ + spinlock_t tp_list_lock; /* protects tp_list */ + atomic_t tp_num; struct delayed_work orig_work; struct work_struct cleanup_work; struct batadv_hard_iface __rcu *primary_if; /* rcu protected pointer */ -- cgit v1.2.3 From 29824a55c07cd79a530d4bc1020a529c402515b6 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Wed, 25 May 2016 23:27:31 +0800 Subject: batman-adv: split routing API data structure in subobjects The routing API data structure contains several function pointers that can easily be grouped together based on the component they work with. Split the API in subobjects in order to improve definition readability. At the same time, remove the "bat_" prefix from the API object and its fields names. These are batman-adv private structs and there is no need to always prepend such prefix, which only makes function invocations much much longer. Signed-off-by: Antonio Quartulli Reviewed-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_algo.c | 14 ++--- net/batman-adv/bat_iv_ogm.c | 33 ++++++----- net/batman-adv/bat_v.c | 26 +++++---- net/batman-adv/bat_v_elp.c | 2 +- net/batman-adv/bat_v_ogm.c | 2 +- net/batman-adv/hard-interface.c | 16 +++--- net/batman-adv/netlink.c | 2 +- net/batman-adv/originator.c | 49 ++++++++-------- net/batman-adv/routing.c | 8 +-- net/batman-adv/sysfs.c | 2 +- net/batman-adv/translation-table.c | 6 +- net/batman-adv/types.h | 114 +++++++++++++++++++++---------------- 12 files changed, 150 insertions(+), 124 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c index 610d4de0f6b0..81dbbf569bd4 100644 --- a/net/batman-adv/bat_algo.c +++ b/net/batman-adv/bat_algo.c @@ -65,12 +65,12 @@ int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops) } /* all algorithms must implement all ops (for now) */ - if (!bat_algo_ops->bat_iface_enable || - !bat_algo_ops->bat_iface_disable || - !bat_algo_ops->bat_iface_update_mac || - !bat_algo_ops->bat_primary_iface_set || - !bat_algo_ops->bat_neigh_cmp || - !bat_algo_ops->bat_neigh_is_similar_or_better) { + if (!bat_algo_ops->iface.enable || + !bat_algo_ops->iface.disable || + !bat_algo_ops->iface.update_mac || + !bat_algo_ops->iface.primary_set || + !bat_algo_ops->neigh.cmp || + !bat_algo_ops->neigh.is_similar_or_better) { pr_info("Routing algo '%s' does not implement required ops\n", bat_algo_ops->name); return -EINVAL; @@ -90,7 +90,7 @@ int batadv_algo_select(struct batadv_priv *bat_priv, char *name) if (!bat_algo_ops) return -EINVAL; - bat_priv->bat_algo_ops = bat_algo_ops; + bat_priv->algo_ops = bat_algo_ops; return 0; } diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index e2d8848c32c0..19b0abd6c640 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1853,8 +1853,7 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb, /* did we receive a B.A.T.M.A.N. IV OGM packet on an interface * that does not have B.A.T.M.A.N. IV enabled ? */ - if (bat_priv->bat_algo_ops->bat_iface_enable != - batadv_iv_ogm_iface_enable) + if (bat_priv->algo_ops->iface.enable != batadv_iv_ogm_iface_enable) return NET_RX_DROP; batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX); @@ -2120,18 +2119,24 @@ static void batadv_iv_iface_activate(struct batadv_hard_iface *hard_iface) static struct batadv_algo_ops batadv_batman_iv __read_mostly = { .name = "BATMAN_IV", - .bat_iface_activate = batadv_iv_iface_activate, - .bat_iface_enable = batadv_iv_ogm_iface_enable, - .bat_iface_disable = batadv_iv_ogm_iface_disable, - .bat_iface_update_mac = batadv_iv_ogm_iface_update_mac, - .bat_primary_iface_set = batadv_iv_ogm_primary_iface_set, - .bat_neigh_cmp = batadv_iv_ogm_neigh_cmp, - .bat_neigh_is_similar_or_better = batadv_iv_ogm_neigh_is_sob, - .bat_neigh_print = batadv_iv_neigh_print, - .bat_orig_print = batadv_iv_ogm_orig_print, - .bat_orig_free = batadv_iv_ogm_orig_free, - .bat_orig_add_if = batadv_iv_ogm_orig_add_if, - .bat_orig_del_if = batadv_iv_ogm_orig_del_if, + .iface = { + .activate = batadv_iv_iface_activate, + .enable = batadv_iv_ogm_iface_enable, + .disable = batadv_iv_ogm_iface_disable, + .update_mac = batadv_iv_ogm_iface_update_mac, + .primary_set = batadv_iv_ogm_primary_iface_set, + }, + .neigh = { + .cmp = batadv_iv_ogm_neigh_cmp, + .is_similar_or_better = batadv_iv_ogm_neigh_is_sob, + .print = batadv_iv_neigh_print, + }, + .orig = { + .print = batadv_iv_ogm_orig_print, + .free = batadv_iv_ogm_orig_free, + .add_if = batadv_iv_ogm_orig_add_if, + .del_if = batadv_iv_ogm_orig_del_if, + }, }; int __init batadv_iv_init(void) diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 7231440bed51..0366cbf5e444 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -322,16 +322,22 @@ err_ifinfo1: static struct batadv_algo_ops batadv_batman_v __read_mostly = { .name = "BATMAN_V", - .bat_iface_activate = batadv_v_iface_activate, - .bat_iface_enable = batadv_v_iface_enable, - .bat_iface_disable = batadv_v_iface_disable, - .bat_iface_update_mac = batadv_v_iface_update_mac, - .bat_primary_iface_set = batadv_v_primary_iface_set, - .bat_hardif_neigh_init = batadv_v_hardif_neigh_init, - .bat_orig_print = batadv_v_orig_print, - .bat_neigh_cmp = batadv_v_neigh_cmp, - .bat_neigh_is_similar_or_better = batadv_v_neigh_is_sob, - .bat_neigh_print = batadv_v_neigh_print, + .iface = { + .activate = batadv_v_iface_activate, + .enable = batadv_v_iface_enable, + .disable = batadv_v_iface_disable, + .update_mac = batadv_v_iface_update_mac, + .primary_set = batadv_v_primary_iface_set, + }, + .neigh = { + .hardif_init = batadv_v_hardif_neigh_init, + .cmp = batadv_v_neigh_cmp, + .is_similar_or_better = batadv_v_neigh_is_sob, + .print = batadv_v_neigh_print, + }, + .orig = { + .print = batadv_v_orig_print, + }, }; /** diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 15cf2726d6a5..7d170010beb9 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -504,7 +504,7 @@ int batadv_v_elp_packet_recv(struct sk_buff *skb, /* did we receive a B.A.T.M.A.N. V ELP packet on an interface * that does not have B.A.T.M.A.N. V ELP enabled ? */ - if (strcmp(bat_priv->bat_algo_ops->name, "BATMAN_V") != 0) + if (strcmp(bat_priv->algo_ops->name, "BATMAN_V") != 0) return NET_RX_DROP; elp_packet = (struct batadv_elp_packet *)skb->data; diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 7ac9e0b30618..6fbba4eb0617 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -754,7 +754,7 @@ int batadv_v_ogm_packet_recv(struct sk_buff *skb, /* did we receive a OGM2 packet on an interface that does not have * B.A.T.M.A.N. V enabled ? */ - if (strcmp(bat_priv->bat_algo_ops->name, "BATMAN_V") != 0) + if (strcmp(bat_priv->algo_ops->name, "BATMAN_V") != 0) return NET_RX_DROP; if (!batadv_check_management_packet(skb, if_incoming, BATADV_OGM2_HLEN)) diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 70841c1e0069..1f9080840566 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -247,7 +247,7 @@ static void batadv_primary_if_select(struct batadv_priv *bat_priv, if (!new_hard_iface) goto out; - bat_priv->bat_algo_ops->bat_primary_iface_set(new_hard_iface); + bat_priv->algo_ops->iface.primary_set(new_hard_iface); batadv_primary_if_update_addr(bat_priv, curr_hard_iface); out: @@ -394,7 +394,7 @@ batadv_hardif_activate_interface(struct batadv_hard_iface *hard_iface) bat_priv = netdev_priv(hard_iface->soft_iface); - bat_priv->bat_algo_ops->bat_iface_update_mac(hard_iface); + bat_priv->algo_ops->iface.update_mac(hard_iface); hard_iface->if_status = BATADV_IF_TO_BE_ACTIVATED; /* the first active interface becomes our primary interface or @@ -409,8 +409,8 @@ batadv_hardif_activate_interface(struct batadv_hard_iface *hard_iface) batadv_update_min_mtu(hard_iface->soft_iface); - if (bat_priv->bat_algo_ops->bat_iface_activate) - bat_priv->bat_algo_ops->bat_iface_activate(hard_iface); + if (bat_priv->algo_ops->iface.activate) + bat_priv->algo_ops->iface.activate(hard_iface); out: if (primary_if) @@ -508,7 +508,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, if (ret) goto err_dev; - ret = bat_priv->bat_algo_ops->bat_iface_enable(hard_iface); + ret = bat_priv->algo_ops->iface.enable(hard_iface); if (ret < 0) goto err_upper; @@ -517,7 +517,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, hard_iface->if_status = BATADV_IF_INACTIVE; ret = batadv_orig_hash_add_if(hard_iface, bat_priv->num_ifaces); if (ret < 0) { - bat_priv->bat_algo_ops->bat_iface_disable(hard_iface); + bat_priv->algo_ops->iface.disable(hard_iface); bat_priv->num_ifaces--; hard_iface->if_status = BATADV_IF_NOT_IN_USE; goto err_upper; @@ -598,7 +598,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, batadv_hardif_put(new_if); } - bat_priv->bat_algo_ops->bat_iface_disable(hard_iface); + bat_priv->algo_ops->iface.disable(hard_iface); hard_iface->if_status = BATADV_IF_NOT_IN_USE; /* delete all references to this hard_iface */ @@ -783,7 +783,7 @@ static int batadv_hard_if_event(struct notifier_block *this, batadv_check_known_mac_addr(hard_iface->net_dev); bat_priv = netdev_priv(hard_iface->soft_iface); - bat_priv->bat_algo_ops->bat_iface_update_mac(hard_iface); + bat_priv->algo_ops->iface.update_mac(hard_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index c25bbb8ab06c..231f8eaf075b 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -89,7 +89,7 @@ batadv_netlink_mesh_info_put(struct sk_buff *msg, struct net_device *soft_iface) if (nla_put_string(msg, BATADV_ATTR_VERSION, BATADV_SOURCE_VERSION) || nla_put_string(msg, BATADV_ATTR_ALGO_NAME, - bat_priv->bat_algo_ops->name) || + bat_priv->algo_ops->name) || nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, soft_iface->ifindex) || nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, soft_iface->name) || nla_put(msg, BATADV_ATTR_MESH_ADDRESS, ETH_ALEN, diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 8ad17ad477e4..7d1e5421f6bc 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -534,8 +534,8 @@ batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface, kref_init(&hardif_neigh->refcount); - if (bat_priv->bat_algo_ops->bat_hardif_neigh_init) - bat_priv->bat_algo_ops->bat_hardif_neigh_init(hardif_neigh); + if (bat_priv->algo_ops->neigh.hardif_init) + bat_priv->algo_ops->neigh.hardif_init(hardif_neigh); hlist_add_head(&hardif_neigh->list, &hard_iface->neigh_list); @@ -706,17 +706,17 @@ int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset) seq_printf(seq, "[B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s %s)]\n", BATADV_SOURCE_VERSION, primary_if->net_dev->name, primary_if->net_dev->dev_addr, net_dev->name, - bat_priv->bat_algo_ops->name); + bat_priv->algo_ops->name); batadv_hardif_put(primary_if); - if (!bat_priv->bat_algo_ops->bat_neigh_print) { + if (!bat_priv->algo_ops->neigh.print) { seq_puts(seq, "No printing function for this routing protocol\n"); return 0; } - bat_priv->bat_algo_ops->bat_neigh_print(bat_priv, seq); + bat_priv->algo_ops->neigh.print(bat_priv, seq); return 0; } @@ -767,8 +767,8 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu) batadv_frag_purge_orig(orig_node, NULL); - if (orig_node->bat_priv->bat_algo_ops->bat_orig_free) - orig_node->bat_priv->bat_algo_ops->bat_orig_free(orig_node); + if (orig_node->bat_priv->algo_ops->orig.free) + orig_node->bat_priv->algo_ops->orig.free(orig_node); kfree(orig_node->tt_buff); kfree(orig_node); @@ -1097,12 +1097,12 @@ batadv_find_best_neighbor(struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing) { struct batadv_neigh_node *best = NULL, *neigh; - struct batadv_algo_ops *bao = bat_priv->bat_algo_ops; + struct batadv_algo_ops *bao = bat_priv->algo_ops; rcu_read_lock(); hlist_for_each_entry_rcu(neigh, &orig_node->neigh_list, list) { - if (best && (bao->bat_neigh_cmp(neigh, if_outgoing, - best, if_outgoing) <= 0)) + if (best && (bao->neigh.cmp(neigh, if_outgoing, best, + if_outgoing) <= 0)) continue; if (!kref_get_unless_zero(&neigh->refcount)) @@ -1254,18 +1254,17 @@ int batadv_orig_seq_print_text(struct seq_file *seq, void *offset) seq_printf(seq, "[B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s %s)]\n", BATADV_SOURCE_VERSION, primary_if->net_dev->name, primary_if->net_dev->dev_addr, net_dev->name, - bat_priv->bat_algo_ops->name); + bat_priv->algo_ops->name); batadv_hardif_put(primary_if); - if (!bat_priv->bat_algo_ops->bat_orig_print) { + if (!bat_priv->algo_ops->orig.print) { seq_puts(seq, "No printing function for this routing protocol\n"); return 0; } - bat_priv->bat_algo_ops->bat_orig_print(bat_priv, seq, - BATADV_IF_DEFAULT); + bat_priv->algo_ops->orig.print(bat_priv, seq, BATADV_IF_DEFAULT); return 0; } @@ -1292,7 +1291,7 @@ int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset) } bat_priv = netdev_priv(hard_iface->soft_iface); - if (!bat_priv->bat_algo_ops->bat_orig_print) { + if (!bat_priv->algo_ops->orig.print) { seq_puts(seq, "No printing function for this routing protocol\n"); goto out; @@ -1306,9 +1305,9 @@ int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset) seq_printf(seq, "[B.A.T.M.A.N. adv %s, IF/MAC: %s/%pM (%s %s)]\n", BATADV_SOURCE_VERSION, hard_iface->net_dev->name, hard_iface->net_dev->dev_addr, - hard_iface->soft_iface->name, bat_priv->bat_algo_ops->name); + hard_iface->soft_iface->name, bat_priv->algo_ops->name); - bat_priv->bat_algo_ops->bat_orig_print(bat_priv, seq, hard_iface); + bat_priv->algo_ops->orig.print(bat_priv, seq, hard_iface); out: if (hard_iface) @@ -1320,7 +1319,7 @@ int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, int max_if_num) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); - struct batadv_algo_ops *bao = bat_priv->bat_algo_ops; + struct batadv_algo_ops *bao = bat_priv->algo_ops; struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; struct batadv_orig_node *orig_node; @@ -1336,9 +1335,8 @@ int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) { ret = 0; - if (bao->bat_orig_add_if) - ret = bao->bat_orig_add_if(orig_node, - max_if_num); + if (bao->orig.add_if) + ret = bao->orig.add_if(orig_node, max_if_num); if (ret == -ENOMEM) goto err; } @@ -1360,7 +1358,7 @@ int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface, struct hlist_head *head; struct batadv_hard_iface *hard_iface_tmp; struct batadv_orig_node *orig_node; - struct batadv_algo_ops *bao = bat_priv->bat_algo_ops; + struct batadv_algo_ops *bao = bat_priv->algo_ops; u32 i; int ret; @@ -1373,10 +1371,9 @@ int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface, rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) { ret = 0; - if (bao->bat_orig_del_if) - ret = bao->bat_orig_del_if(orig_node, - max_if_num, - hard_iface->if_num); + if (bao->orig.del_if) + ret = bao->orig.del_if(orig_node, max_if_num, + hard_iface->if_num); if (ret == -ENOMEM) goto err; } diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 7b5de402ee0d..0f8c0ddf7816 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -482,7 +482,7 @@ batadv_find_router(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_hard_iface *recv_if) { - struct batadv_algo_ops *bao = bat_priv->bat_algo_ops; + struct batadv_algo_ops *bao = bat_priv->algo_ops; struct batadv_neigh_node *first_candidate_router = NULL; struct batadv_neigh_node *next_candidate_router = NULL; struct batadv_neigh_node *router, *cand_router = NULL; @@ -536,9 +536,9 @@ batadv_find_router(struct batadv_priv *bat_priv, /* alternative candidate should be good enough to be * considered */ - if (!bao->bat_neigh_is_similar_or_better(cand_router, - cand->if_outgoing, - router, recv_if)) + if (!bao->neigh.is_similar_or_better(cand_router, + cand->if_outgoing, router, + recv_if)) goto next; /* don't use the same router twice */ diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index 1a7942ddf730..fe9ca94ddee2 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -412,7 +412,7 @@ static ssize_t batadv_show_bat_algo(struct kobject *kobj, { struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); - return sprintf(buff, "%s\n", bat_priv->bat_algo_ops->name); + return sprintf(buff, "%s\n", bat_priv->algo_ops->name); } static void batadv_post_gw_reselect(struct net_device *net_dev) diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 48ce7889a3e8..8bb82a391490 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1546,7 +1546,7 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv, struct batadv_tt_global_entry *tt_global_entry) { struct batadv_neigh_node *router, *best_router = NULL; - struct batadv_algo_ops *bao = bat_priv->bat_algo_ops; + struct batadv_algo_ops *bao = bat_priv->algo_ops; struct hlist_head *head; struct batadv_tt_orig_list_entry *orig_entry, *best_entry = NULL; @@ -1558,8 +1558,8 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv, continue; if (best_router && - bao->bat_neigh_cmp(router, BATADV_IF_DEFAULT, - best_router, BATADV_IF_DEFAULT) <= 0) { + bao->neigh.cmp(router, BATADV_IF_DEFAULT, best_router, + BATADV_IF_DEFAULT) <= 0) { batadv_neigh_node_put(router); continue; } diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index a331e3ab93d1..4d6a7ce3f83c 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1016,7 +1016,7 @@ struct batadv_priv_bat_v { * @cleanup_work: work queue callback item for soft-interface deinit * @primary_if: one of the hard-interfaces assigned to this mesh interface * becomes the primary interface - * @bat_algo_ops: routing algorithm used by this mesh interface + * @algo_ops: routing algorithm used by this mesh interface * @softif_vlan_list: a list of softif_vlan structs, one per VLAN created on top * of the mesh interface represented by this object * @softif_vlan_list_lock: lock protecting softif_vlan_list @@ -1074,7 +1074,7 @@ struct batadv_priv { struct delayed_work orig_work; struct work_struct cleanup_work; struct batadv_hard_iface __rcu *primary_if; /* rcu protected pointer */ - struct batadv_algo_ops *bat_algo_ops; + struct batadv_algo_ops *algo_ops; struct hlist_head softif_vlan_list; spinlock_t softif_vlan_list_lock; /* protects softif_vlan_list */ #ifdef CONFIG_BATMAN_ADV_BLA @@ -1387,60 +1387,78 @@ struct batadv_forw_packet { struct batadv_hard_iface *if_outgoing; }; +/** + * struct batadv_algo_iface_ops - mesh algorithm callbacks (interface specific) + * @activate: start routing mechanisms when hard-interface is brought up + * @enable: init routing info when hard-interface is enabled + * @disable: de-init routing info when hard-interface is disabled + * @update_mac: (re-)init mac addresses of the protocol information + * belonging to this hard-interface + * @primary_set: called when primary interface is selected / changed + */ +struct batadv_algo_iface_ops { + void (*activate)(struct batadv_hard_iface *hard_iface); + int (*enable)(struct batadv_hard_iface *hard_iface); + void (*disable)(struct batadv_hard_iface *hard_iface); + void (*update_mac)(struct batadv_hard_iface *hard_iface); + void (*primary_set)(struct batadv_hard_iface *hard_iface); +}; + +/** + * struct batadv_algo_neigh_ops - mesh algorithm callbacks (neighbour specific) + * @hardif_init: called on creation of single hop entry + * @cmp: compare the metrics of two neighbors for their respective outgoing + * interfaces + * @is_similar_or_better: check if neigh1 is equally similar or better than + * neigh2 for their respective outgoing interface from the metric prospective + * @print: print the single hop neighbor list (optional) + */ +struct batadv_algo_neigh_ops { + void (*hardif_init)(struct batadv_hardif_neigh_node *neigh); + int (*cmp)(struct batadv_neigh_node *neigh1, + struct batadv_hard_iface *if_outgoing1, + struct batadv_neigh_node *neigh2, + struct batadv_hard_iface *if_outgoing2); + bool (*is_similar_or_better)(struct batadv_neigh_node *neigh1, + struct batadv_hard_iface *if_outgoing1, + struct batadv_neigh_node *neigh2, + struct batadv_hard_iface *if_outgoing2); + void (*print)(struct batadv_priv *priv, struct seq_file *seq); +}; + +/** + * struct batadv_algo_orig_ops - mesh algorithm callbacks (originator specific) + * @free: free the resources allocated by the routing algorithm for an orig_node + * object + * @add_if: ask the routing algorithm to apply the needed changes to the + * orig_node due to a new hard-interface being added into the mesh + * @del_if: ask the routing algorithm to apply the needed changes to the + * orig_node due to an hard-interface being removed from the mesh + * @print: print the originator table (optional) + */ +struct batadv_algo_orig_ops { + void (*free)(struct batadv_orig_node *orig_node); + int (*add_if)(struct batadv_orig_node *orig_node, int max_if_num); + int (*del_if)(struct batadv_orig_node *orig_node, int max_if_num, + int del_if_num); + void (*print)(struct batadv_priv *priv, struct seq_file *seq, + struct batadv_hard_iface *hard_iface); +}; + /** * struct batadv_algo_ops - mesh algorithm callbacks * @list: list node for the batadv_algo_list * @name: name of the algorithm - * @bat_iface_activate: start routing mechanisms when hard-interface is brought - * up - * @bat_iface_enable: init routing info when hard-interface is enabled - * @bat_iface_disable: de-init routing info when hard-interface is disabled - * @bat_iface_update_mac: (re-)init mac addresses of the protocol information - * belonging to this hard-interface - * @bat_primary_iface_set: called when primary interface is selected / changed - * @bat_hardif_neigh_init: called on creation of single hop entry - * @bat_neigh_cmp: compare the metrics of two neighbors for their respective - * outgoing interfaces - * @bat_neigh_is_similar_or_better: check if neigh1 is equally similar or - * better than neigh2 for their respective outgoing interface from the metric - * prospective - * @bat_neigh_print: print the single hop neighbor list (optional) - * @bat_orig_print: print the originator table (optional) - * @bat_orig_free: free the resources allocated by the routing algorithm for an - * orig_node object - * @bat_orig_add_if: ask the routing algorithm to apply the needed changes to - * the orig_node due to a new hard-interface being added into the mesh - * @bat_orig_del_if: ask the routing algorithm to apply the needed changes to - * the orig_node due to an hard-interface being removed from the mesh + * @iface: callbacks related to interface handling + * @neigh: callbacks related to neighbors handling + * @orig: callbacks related to originators handling */ struct batadv_algo_ops { struct hlist_node list; char *name; - void (*bat_iface_activate)(struct batadv_hard_iface *hard_iface); - int (*bat_iface_enable)(struct batadv_hard_iface *hard_iface); - void (*bat_iface_disable)(struct batadv_hard_iface *hard_iface); - void (*bat_iface_update_mac)(struct batadv_hard_iface *hard_iface); - void (*bat_primary_iface_set)(struct batadv_hard_iface *hard_iface); - /* neigh_node handling API */ - void (*bat_hardif_neigh_init)(struct batadv_hardif_neigh_node *neigh); - int (*bat_neigh_cmp)(struct batadv_neigh_node *neigh1, - struct batadv_hard_iface *if_outgoing1, - struct batadv_neigh_node *neigh2, - struct batadv_hard_iface *if_outgoing2); - bool (*bat_neigh_is_similar_or_better) - (struct batadv_neigh_node *neigh1, - struct batadv_hard_iface *if_outgoing1, - struct batadv_neigh_node *neigh2, - struct batadv_hard_iface *if_outgoing2); - void (*bat_neigh_print)(struct batadv_priv *priv, struct seq_file *seq); - /* orig_node handling API */ - void (*bat_orig_print)(struct batadv_priv *priv, struct seq_file *seq, - struct batadv_hard_iface *hard_iface); - void (*bat_orig_free)(struct batadv_orig_node *orig_node); - int (*bat_orig_add_if)(struct batadv_orig_node *orig_node, - int max_if_num); - int (*bat_orig_del_if)(struct batadv_orig_node *orig_node, - int max_if_num, int del_if_num); + struct batadv_algo_iface_ops iface; + struct batadv_algo_neigh_ops neigh; + struct batadv_algo_orig_ops orig; }; /** -- cgit v1.2.3 From 8b10cab64c134ffbffac96edd1899d303d3afcac Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sat, 2 Jul 2016 06:43:14 -0400 Subject: net: simplify and make pkt_type_ok() available for other users Suggested-by: Daniel Borkmann Signed-off-by: Jamal Hadi Salim Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/netfilter/nft_meta.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 16c50b0dd426..03e5e33b5c39 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -199,13 +199,6 @@ err: } EXPORT_SYMBOL_GPL(nft_meta_get_eval); -/* don't change or set _LOOPBACK, _USER, etc. */ -static bool pkt_type_ok(u32 p) -{ - return p == PACKET_HOST || p == PACKET_BROADCAST || - p == PACKET_MULTICAST || p == PACKET_OTHERHOST; -} - void nft_meta_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -223,7 +216,7 @@ void nft_meta_set_eval(const struct nft_expr *expr, break; case NFT_META_PKTTYPE: if (skb->pkt_type != value && - pkt_type_ok(value) && pkt_type_ok(skb->pkt_type)) + skb_pkt_type_ok(value) && skb_pkt_type_ok(skb->pkt_type)) skb->pkt_type = value; break; case NFT_META_NFTRACE: -- cgit v1.2.3 From ff202ee1ed8f032f05b80b541664cf02e75d7080 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sat, 2 Jul 2016 06:43:15 -0400 Subject: net sched actions: skbedit add support for mod-ing skb pkt_type Extremely useful for setting packet type to host so i dont have to modify the dst mac address using pedit (which requires that i know the mac address) Example usage: tc filter add dev eth0 parent ffff: protocol ip pref 9 u32 \ match ip src 5.5.5.5/32 \ flowid 1:5 action skbedit ptype host This will tag all packets incoming from 5.5.5.5 with type PACKET_HOST Signed-off-by: Jamal Hadi Salim Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/sched/act_skbedit.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 53d1486cddf7..1c4c9240a3f8 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -47,6 +47,8 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, skb_set_queue_mapping(skb, d->queue_mapping); if (d->flags & SKBEDIT_F_MARK) skb->mark = d->mark; + if (d->flags & SKBEDIT_F_PTYPE) + skb->pkt_type = d->ptype; spin_unlock(&d->tcf_lock); return d->tcf_action; @@ -57,6 +59,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { [TCA_SKBEDIT_PRIORITY] = { .len = sizeof(u32) }, [TCA_SKBEDIT_QUEUE_MAPPING] = { .len = sizeof(u16) }, [TCA_SKBEDIT_MARK] = { .len = sizeof(u32) }, + [TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) }, }; static int tcf_skbedit_init(struct net *net, struct nlattr *nla, @@ -68,7 +71,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct tc_skbedit *parm; struct tcf_skbedit *d; u32 flags = 0, *priority = NULL, *mark = NULL; - u16 *queue_mapping = NULL; + u16 *queue_mapping = NULL, *ptype = NULL; bool exists = false; int ret = 0, err; @@ -92,6 +95,13 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]); } + if (tb[TCA_SKBEDIT_PTYPE] != NULL) { + ptype = nla_data(tb[TCA_SKBEDIT_PTYPE]); + if (!skb_pkt_type_ok(*ptype)) + return -EINVAL; + flags |= SKBEDIT_F_PTYPE; + } + if (tb[TCA_SKBEDIT_MARK] != NULL) { flags |= SKBEDIT_F_MARK; mark = nla_data(tb[TCA_SKBEDIT_MARK]); @@ -132,6 +142,8 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, d->queue_mapping = *queue_mapping; if (flags & SKBEDIT_F_MARK) d->mark = *mark; + if (flags & SKBEDIT_F_PTYPE) + d->ptype = *ptype; d->tcf_action = parm->action; @@ -169,6 +181,10 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, nla_put(skb, TCA_SKBEDIT_MARK, sizeof(d->mark), &d->mark)) goto nla_put_failure; + if ((d->flags & SKBEDIT_F_PTYPE) && + nla_put(skb, TCA_SKBEDIT_PTYPE, sizeof(d->ptype), + &d->ptype)) + goto nla_put_failure; tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD)) -- cgit v1.2.3 From 61cc535de36838bf4cfe08c8c4eeaad1ca4a89b1 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sat, 2 Jul 2016 06:43:16 -0400 Subject: net sched actions: skbedit convert to use more modern nla_put_xxx Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_skbedit.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 1c4c9240a3f8..8e573c0f8742 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -170,20 +170,16 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; if ((d->flags & SKBEDIT_F_PRIORITY) && - nla_put(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority), - &d->priority)) + nla_put_u32(skb, TCA_SKBEDIT_PRIORITY, d->priority)) goto nla_put_failure; if ((d->flags & SKBEDIT_F_QUEUE_MAPPING) && - nla_put(skb, TCA_SKBEDIT_QUEUE_MAPPING, - sizeof(d->queue_mapping), &d->queue_mapping)) + nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING, d->queue_mapping)) goto nla_put_failure; if ((d->flags & SKBEDIT_F_MARK) && - nla_put(skb, TCA_SKBEDIT_MARK, sizeof(d->mark), - &d->mark)) + nla_put_u32(skb, TCA_SKBEDIT_MARK, d->mark)) goto nla_put_failure; if ((d->flags & SKBEDIT_F_PTYPE) && - nla_put(skb, TCA_SKBEDIT_PTYPE, sizeof(d->ptype), - &d->ptype)) + nla_put_u16(skb, TCA_SKBEDIT_PTYPE, d->ptype)) goto nla_put_failure; tcf_tm_dump(&t, &d->tcf_tm); -- cgit v1.2.3 From 0967f2445963b63269d7dd2f5b6f234ea57dd10e Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 2 Jul 2016 14:12:54 -0700 Subject: net: pktgen: support injecting packets for qdisc testing Add another xmit_mode to pktgen to allow testing xmit functionality of qdiscs. The new mode "queue_xmit" injects packets at __dev_queue_xmit() so that qdisc is called. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- net/core/pktgen.c | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index f74ab9c3b38f..bbd118b19aef 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -213,6 +213,7 @@ /* Xmit modes */ #define M_START_XMIT 0 /* Default normal TX */ #define M_NETIF_RECEIVE 1 /* Inject packets into stack */ +#define M_QUEUE_XMIT 2 /* Inject packet into qdisc */ /* If lock -- protects updating of if_list */ #define if_lock(t) spin_lock(&(t->if_lock)); @@ -626,6 +627,8 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) seq_puts(seq, " xmit_mode: netif_receive\n"); + else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) + seq_puts(seq, " xmit_mode: xmit_queue\n"); seq_puts(seq, " Flags: "); @@ -1142,8 +1145,10 @@ static ssize_t pktgen_if_write(struct file *file, return len; i += len; - if ((value > 1) && (pkt_dev->xmit_mode == M_START_XMIT) && - (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) + if ((value > 1) && + ((pkt_dev->xmit_mode == M_QUEUE_XMIT) || + ((pkt_dev->xmit_mode == M_START_XMIT) && + (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))))) return -ENOTSUPP; pkt_dev->burst = value < 1 ? 1 : value; sprintf(pg_result, "OK: burst=%d", pkt_dev->burst); @@ -1198,6 +1203,9 @@ static ssize_t pktgen_if_write(struct file *file, * at module loading time */ pkt_dev->clone_skb = 0; + } else if (strcmp(f, "queue_xmit") == 0) { + pkt_dev->xmit_mode = M_QUEUE_XMIT; + pkt_dev->last_ok = 1; } else { sprintf(pg_result, "xmit_mode -:%s:- unknown\nAvailable modes: %s", @@ -3434,6 +3442,36 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) #endif } while (--burst > 0); goto out; /* Skips xmit_mode M_START_XMIT */ + } else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) { + local_bh_disable(); + atomic_inc(&pkt_dev->skb->users); + + ret = dev_queue_xmit(pkt_dev->skb); + switch (ret) { + case NET_XMIT_SUCCESS: + pkt_dev->sofar++; + pkt_dev->seq_num++; + pkt_dev->tx_bytes += pkt_dev->last_pkt_size; + break; + case NET_XMIT_DROP: + case NET_XMIT_CN: + /* These are all valid return codes for a qdisc but + * indicate packets are being dropped or will likely + * be dropped soon. + */ + case NETDEV_TX_BUSY: + /* qdisc may call dev_hard_start_xmit directly in cases + * where no queues exist e.g. loopback device, virtual + * devices, etc. In this case we need to handle + * NETDEV_TX_ codes. + */ + default: + pkt_dev->errors++; + net_info_ratelimited("%s xmit error: %d\n", + pkt_dev->odevname, ret); + break; + } + goto out; } txq = skb_get_tx_queue(odev, pkt_dev->skb); -- cgit v1.2.3 From 13c5c240f789bbd2bcacb14a23771491485ae61f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 3 Jul 2016 01:28:47 +0200 Subject: bpf: add bpf_get_hash_recalc helper If skb_clear_hash() was invoked due to mangling of relevant headers and BPF program needs skb->hash later on, we can add a helper to trigger hash recalculation via bpf_get_hash_recalc(). The helper will return the newly retrieved hash directly, but later access can also be done via skb context again through skb->hash directly (inline) without needing to call the helper once more. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 54071cf70fb5..10c4a2f9e8bb 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1729,6 +1729,23 @@ static const struct bpf_func_proto bpf_get_route_realm_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +static u64 bpf_get_hash_recalc(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + /* If skb_clear_hash() was called due to mangling, we can + * trigger SW recalculation here. Later access to hash + * can then use the inline skb->hash via context directly + * instead of calling this helper again. + */ + return skb_get_hash((struct sk_buff *) (unsigned long) r1); +} + +static const struct bpf_func_proto bpf_get_hash_recalc_proto = { + .func = bpf_get_hash_recalc, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *) (long) r1; @@ -2337,6 +2354,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_redirect_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; + case BPF_FUNC_get_hash_recalc: + return &bpf_get_hash_recalc_proto; case BPF_FUNC_perf_event_output: return bpf_get_event_output_proto(); case BPF_FUNC_get_smp_processor_id: -- cgit v1.2.3 From 7ce856aaaf13a5dc969ac5f998e5daaf1abe4cd2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 4 Jul 2016 08:23:12 +0200 Subject: mlxsw: spectrum: Add couple of lower device helper functions Add functions that iterate over lower devices and find port device. As a dependency add netdev_for_each_all_lower_dev and netdev_for_each_all_lower_dev_rcu macro with netdev_all_lower_get_next and netdev_all_lower_get_next_rcu shelpers. Also, add functions to return mlxsw struct according to lower device found and mlxsw_port struct with a reference to lower device. Signed-off-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/core/dev.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index aba10d2a8bc3..a4f3b0a9aeaf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5444,6 +5444,52 @@ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) } EXPORT_SYMBOL(netdev_lower_get_next); +/** + * netdev_all_lower_get_next - Get the next device from all lower neighbour list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent from the dev's all lower neighbour + * list, starting from iter position. The caller must hold RTNL lock or + * its own locking that guarantees that the neighbour all lower + * list will remain unchanged. + */ +struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry(*iter, struct netdev_adjacent, list); + + if (&lower->list == &dev->all_adj_list.lower) + return NULL; + + *iter = lower->list.next; + + return lower->dev; +} +EXPORT_SYMBOL(netdev_all_lower_get_next); + +/** + * netdev_all_lower_get_next_rcu - Get the next device from all + * lower neighbour list, RCU variant + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent from the dev's all lower neighbour + * list, starting from iter position. The caller must hold RCU read lock. + */ +struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_first_or_null_rcu(&dev->all_adj_list.lower, + struct netdev_adjacent, list); + + return lower ? lower->dev : NULL; +} +EXPORT_SYMBOL(netdev_all_lower_get_next_rcu); + /** * netdev_lower_get_first_private_rcu - Get the first ->private from the * lower neighbour list, RCU -- cgit v1.2.3 From c6ac37d8d8843fb1fdc34e4a2a41a4f027ab670c Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Fri, 1 Jul 2016 16:53:54 +0300 Subject: netfilter: nf_log: fix error on write NONE to logger choice sysctl It is hard to unbind nf-logger: echo NONE > /proc/sys/net/netfilter/nf_log/0 bash: echo: write error: No such file or directory sysctl -w net.netfilter.nf_log.0=NONE sysctl: setting key "net.netfilter.nf_log.0": No such file or directory net.netfilter.nf_log.0 = NONE You need explicitly send '\0', for instance like: echo -e "NONE\0" > /proc/sys/net/netfilter/nf_log/0 That seem to be strange, so fix it using proc_dostring. Now it works fine: modprobe nfnetlink_log echo nfnetlink_log > /proc/sys/net/netfilter/nf_log/0 cat /proc/sys/net/netfilter/nf_log/0 nfnetlink_log echo NONE > /proc/sys/net/netfilter/nf_log/0 cat /proc/sys/net/netfilter/nf_log/0 NONE v2: add missed error check for proc_dostring Signed-off-by: Pavel Tikhomirov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_log.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 18e325ce6542..aa5847a16713 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -418,16 +418,17 @@ static int nf_log_proc_dostring(struct ctl_table *table, int write, { const struct nf_logger *logger; char buf[NFLOGGER_NAME_LEN]; - size_t size = *lenp; int r = 0; int tindex = (unsigned long)table->extra1; struct net *net = current->nsproxy->net_ns; if (write) { - if (size > sizeof(buf)) - size = sizeof(buf); - if (copy_from_user(buf, buffer, size)) - return -EFAULT; + struct ctl_table tmp = *table; + + tmp.data = buf; + r = proc_dostring(&tmp, write, buffer, lenp, ppos); + if (r) + return r; if (!strcmp(buf, "NONE")) { nf_log_unbind_pf(net, tindex); -- cgit v1.2.3 From 503eebc265dcf5c512454fd5a6b6673ea4f1d7f2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 5 Jul 2016 11:27:37 +0200 Subject: net: add dev arg to ndo_neigh_construct/destroy As the following patch will allow upper devices to follow the call down lower devices, we need to add dev here and not rely on n->dev. Signed-off-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/atm/clip.c | 2 +- net/core/neighbour.c | 4 ++-- net/ieee802154/6lowpan/core.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/atm/clip.c b/net/atm/clip.c index e07f551a863c..53b4ac09e7b7 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -286,7 +286,7 @@ static const struct neigh_ops clip_neigh_ops = { .connected_output = neigh_direct_output, }; -static int clip_constructor(struct neighbour *neigh) +static int clip_constructor(struct net_device *dev, struct neighbour *neigh) { struct atmarp_entry *entry = neighbour_priv(neigh); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 510cd62fcb99..952aabb5aa56 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -473,7 +473,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, } if (dev->netdev_ops->ndo_neigh_construct) { - error = dev->netdev_ops->ndo_neigh_construct(n); + error = dev->netdev_ops->ndo_neigh_construct(dev, n); if (error < 0) { rc = ERR_PTR(error); goto out_neigh_release; @@ -701,7 +701,7 @@ void neigh_destroy(struct neighbour *neigh) neigh->arp_queue_len_bytes = 0; if (dev->netdev_ops->ndo_neigh_destroy) - dev->netdev_ops->ndo_neigh_destroy(neigh); + dev->netdev_ops->ndo_neigh_destroy(dev, neigh); dev_put(dev); neigh_parms_put(neigh->parms); diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 8c004a0c8d64..935ab932e841 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -81,7 +81,7 @@ static int lowpan_stop(struct net_device *dev) return 0; } -static int lowpan_neigh_construct(struct neighbour *n) +static int lowpan_neigh_construct(struct net_device *dev, struct neighbour *n) { struct lowpan_802154_neigh *neigh = lowpan_802154_neigh(neighbour_priv(n)); -- cgit v1.2.3 From 18bfb924f0005a728caadd90ba755b2a660bf441 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 5 Jul 2016 11:27:38 +0200 Subject: net: introduce default neigh_construct/destroy ndo calls for L2 upper devices L2 upper device needs to propagate neigh_construct/destroy calls down to lower devices. Do this by defining default ndo functions and use them in team, bond, bridge and vlan. Signed-off-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/8021q/vlan_dev.c | 2 ++ net/bridge/br_device.c | 2 ++ net/core/dev.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+) (limited to 'net') diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 86ae75b77390..c8f422c90856 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -790,6 +790,8 @@ static const struct net_device_ops vlan_netdev_ops = { .ndo_netpoll_cleanup = vlan_dev_netpoll_cleanup, #endif .ndo_fix_features = vlan_dev_fix_features, + .ndo_neigh_construct = netdev_default_l2upper_neigh_construct, + .ndo_neigh_destroy = netdev_default_l2upper_neigh_destroy, .ndo_fdb_add = switchdev_port_fdb_add, .ndo_fdb_del = switchdev_port_fdb_del, .ndo_fdb_dump = switchdev_port_fdb_dump, diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 0c39e0f6da09..8eecd0ec22f2 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -349,6 +349,8 @@ static const struct net_device_ops br_netdev_ops = { .ndo_add_slave = br_add_slave, .ndo_del_slave = br_del_slave, .ndo_fix_features = br_fix_features, + .ndo_neigh_construct = netdev_default_l2upper_neigh_construct, + .ndo_neigh_destroy = netdev_default_l2upper_neigh_destroy, .ndo_fdb_add = br_fdb_add, .ndo_fdb_del = br_fdb_delete, .ndo_fdb_dump = br_fdb_dump, diff --git a/net/core/dev.c b/net/core/dev.c index a4f3b0a9aeaf..b92d63bfde7a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6087,6 +6087,50 @@ void netdev_lower_state_changed(struct net_device *lower_dev, } EXPORT_SYMBOL(netdev_lower_state_changed); +int netdev_default_l2upper_neigh_construct(struct net_device *dev, + struct neighbour *n) +{ + struct net_device *lower_dev, *stop_dev; + struct list_head *iter; + int err; + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + if (!lower_dev->netdev_ops->ndo_neigh_construct) + continue; + err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n); + if (err) { + stop_dev = lower_dev; + goto rollback; + } + } + return 0; + +rollback: + netdev_for_each_lower_dev(dev, lower_dev, iter) { + if (lower_dev == stop_dev) + break; + if (!lower_dev->netdev_ops->ndo_neigh_destroy) + continue; + lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n); + } + return err; +} +EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct); + +void netdev_default_l2upper_neigh_destroy(struct net_device *dev, + struct neighbour *n) +{ + struct net_device *lower_dev; + struct list_head *iter; + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + if (!lower_dev->netdev_ops->ndo_neigh_destroy) + continue; + lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n); + } +} +EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy); + static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; -- cgit v1.2.3 From 2a4501ae18b52fcdf553404286e6cefabd1d17ec Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 5 Jul 2016 11:27:42 +0200 Subject: neigh: Send a notification when DELAY_PROBE_TIME changes When the data plane is offloaded the traffic doesn't go through the networking stack. Therefore, after first resolving a neighbour the NUD state machine will transition it from REACHABLE to STALE until it's finally deleted by the garbage collector. To prevent such situations the offloading driver should notify the NUD state machine on any neighbours that were recently used. The driver's polling interval should be set so that the NUD state machine can function as if the traffic wasn't offloaded. Currently, there are no in-tree drivers that can report confirmation for a neighbour, but only 'used' indication. Therefore, the polling interval should be set according to DELAY_FIRST_PROBE_TIME, as a neighbour will transition from REACHABLE state to DELAY (instead of STALE) if "a packet was sent within the last DELAY_FIRST_PROBE_TIME seconds" (RFC 4861). Send a netevent whenever the DELAY_FIRST_PROBE_TIME changes - either via netlink or sysctl - so that offloading drivers can correctly set their polling interval. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/neighbour.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 952aabb5aa56..5cdc62a8eb84 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2047,6 +2047,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) case NDTPA_DELAY_PROBE_TIME: NEIGH_VAR_SET(p, DELAY_PROBE_TIME, nla_get_msecs(tbp[i])); + call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); break; case NDTPA_RETRANS_TIME: NEIGH_VAR_SET(p, RETRANS_TIME, @@ -2930,6 +2931,7 @@ static void neigh_proc_update(struct ctl_table *ctl, int write) return; set_bit(index, p->data_state); + call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); if (!dev) /* NULL dev means this is default value */ neigh_copy_dflt_parms(net, p, index); } -- cgit v1.2.3 From de9e5aeb4f40e72fa3bb087d378c9bd4ecf65c7f Mon Sep 17 00:00:00 2001 From: Thierry Escande Date: Wed, 29 Jun 2016 10:48:22 +0200 Subject: NFC: llcp: Fix usage of llcp_add_tlv() In functions using llcp_add_tlv(), a skb pointer could be set to NULL and then reuse afterward. With this patch, the skb pointer returned by llcp_add_tlv() is ignored since it can only be the passed skb pointer or NULL when the passed TLV is NULL. There is also no need to check for the TLV pointer as this is done by llcp_add_tlv(). Signed-off-by: Thierry Escande Signed-off-by: Samuel Ortiz --- net/nfc/llcp_commands.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c index 3425532c39f7..4112d009b032 100644 --- a/net/nfc/llcp_commands.c +++ b/net/nfc/llcp_commands.c @@ -438,12 +438,9 @@ int nfc_llcp_send_connect(struct nfc_llcp_sock *sock) goto error_tlv; } - if (service_name_tlv != NULL) - skb = llcp_add_tlv(skb, service_name_tlv, - service_name_tlv_length); - - skb = llcp_add_tlv(skb, miux_tlv, miux_tlv_length); - skb = llcp_add_tlv(skb, rw_tlv, rw_tlv_length); + llcp_add_tlv(skb, service_name_tlv, service_name_tlv_length); + llcp_add_tlv(skb, miux_tlv, miux_tlv_length); + llcp_add_tlv(skb, rw_tlv, rw_tlv_length); skb_queue_tail(&local->tx_queue, skb); @@ -493,8 +490,8 @@ int nfc_llcp_send_cc(struct nfc_llcp_sock *sock) goto error_tlv; } - skb = llcp_add_tlv(skb, miux_tlv, miux_tlv_length); - skb = llcp_add_tlv(skb, rw_tlv, rw_tlv_length); + llcp_add_tlv(skb, miux_tlv, miux_tlv_length); + llcp_add_tlv(skb, rw_tlv, rw_tlv_length); skb_queue_tail(&local->tx_queue, skb); -- cgit v1.2.3 From 256f3ee3d1468660ca3b10ad3beab7e8f6cbd969 Mon Sep 17 00:00:00 2001 From: Thierry Escande Date: Wed, 29 Jun 2016 10:48:23 +0200 Subject: NFC: llcp: Fix 2 memory leaks Once copied into the sk_buff data area using llcp_add_tlv(), the allocated TLVs must be freed. With this patch nfc_llcp_send_connect() and nfc_llcp_send_cc() don't return immediately on success and now free the allocated TLVs. Signed-off-by: Thierry Escande Signed-off-by: Samuel Ortiz --- net/nfc/llcp_commands.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c index 4112d009b032..c5959ce503e6 100644 --- a/net/nfc/llcp_commands.c +++ b/net/nfc/llcp_commands.c @@ -444,10 +444,11 @@ int nfc_llcp_send_connect(struct nfc_llcp_sock *sock) skb_queue_tail(&local->tx_queue, skb); - return 0; + err = 0; error_tlv: - pr_err("error %d\n", err); + if (err) + pr_err("error %d\n", err); kfree(service_name_tlv); kfree(miux_tlv); @@ -495,10 +496,11 @@ int nfc_llcp_send_cc(struct nfc_llcp_sock *sock) skb_queue_tail(&local->tx_queue, skb); - return 0; + err = 0; error_tlv: - pr_err("error %d\n", err); + if (err) + pr_err("error %d\n", err); kfree(miux_tlv); kfree(rw_tlv); -- cgit v1.2.3 From b77693447db987e77a39afaa8774e8702cb110d5 Mon Sep 17 00:00:00 2001 From: Thierry Escande Date: Thu, 16 Jun 2016 20:24:41 +0200 Subject: NFC: digital: Fix a memory leak in NFC-F listening mode When configured as a target listening for a SENSF_REQ poll command, a nfcid2 array was allocated for no reason leading to a memory leak. The nfcid2 is sent by the target in the SENSF_RES reply. Signed-off-by: Thierry Escande Signed-off-by: Samuel Ortiz --- net/nfc/digital_technology.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'net') diff --git a/net/nfc/digital_technology.c b/net/nfc/digital_technology.c index fb58ed2dd41d..d9080dec5d27 100644 --- a/net/nfc/digital_technology.c +++ b/net/nfc/digital_technology.c @@ -1257,21 +1257,12 @@ static int digital_tg_config_nfcf(struct nfc_digital_dev *ddev, u8 rf_tech) int digital_tg_listen_nfcf(struct nfc_digital_dev *ddev, u8 rf_tech) { int rc; - u8 *nfcid2; rc = digital_tg_config_nfcf(ddev, rf_tech); if (rc) return rc; - nfcid2 = kzalloc(NFC_NFCID2_MAXSIZE, GFP_KERNEL); - if (!nfcid2) - return -ENOMEM; - - nfcid2[0] = DIGITAL_SENSF_NFCID2_NFC_DEP_B1; - nfcid2[1] = DIGITAL_SENSF_NFCID2_NFC_DEP_B2; - get_random_bytes(nfcid2 + 2, NFC_NFCID2_MAXSIZE - 2); - - return digital_tg_listen(ddev, 300, digital_tg_recv_sensf_req, nfcid2); + return digital_tg_listen(ddev, 300, digital_tg_recv_sensf_req, NULL); } void digital_tg_recv_md_req(struct nfc_digital_dev *ddev, void *arg, -- cgit v1.2.3 From 3f89fea35fc37b326d6b3697fcc9cba235a60811 Mon Sep 17 00:00:00 2001 From: Thierry Escande Date: Thu, 16 Jun 2016 20:24:42 +0200 Subject: NFC: digital: Rework error handling in DEP_RES response The Digital Protocol stack used to send a NACK frame whatever the error type it receives in digital_in_recv_dep_res(). It actually should only send a NACK frame on CRC or parity check errors or on any transmission error if a NACK frame was previously sent. Existing drivers used to send EIO error for this kind of issues so this patch limits sending of NACK frames on EIO errors. All other errors will be reported to the upper layers. Signed-off-by: Thierry Escande Signed-off-by: Samuel Ortiz --- net/nfc/digital_dep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c index f72be7433df3..b62c85dc12a2 100644 --- a/net/nfc/digital_dep.c +++ b/net/nfc/digital_dep.c @@ -664,7 +664,7 @@ static void digital_in_recv_dep_res(struct nfc_digital_dev *ddev, void *arg, rc = PTR_ERR(resp); resp = NULL; - if (((rc != -ETIMEDOUT) || ddev->nack_count) && + if ((rc == -EIO || (rc == -ETIMEDOUT && ddev->nack_count)) && (ddev->nack_count++ < DIGITAL_NFC_DEP_N_RETRY_NACK)) { ddev->atn_count = 0; -- cgit v1.2.3 From 82e57952869fbbdf09d8f9e7ac284df13741e93d Mon Sep 17 00:00:00 2001 From: Thierry Escande Date: Thu, 16 Jun 2016 20:24:43 +0200 Subject: NFC: digital: Call pending command callbacks at device unregister With this patch, when freeing the command queue in the module unregister function, the callbacks of the commands still queued are called with a ENODEV error. This gives a chance to the command issuer to free any memory it could have allocate. Signed-off-by: Thierry Escande Signed-off-by: Samuel Ortiz --- net/nfc/digital_core.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index 27769ac89d27..6e0b255aec66 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -842,6 +842,14 @@ void nfc_digital_unregister_device(struct nfc_digital_dev *ddev) list_for_each_entry_safe(cmd, n, &ddev->cmd_queue, queue) { list_del(&cmd->queue); + + /* Call the command callback if any and pass it a ENODEV error. + * This gives a chance to the command issuer to free any + * allocated buffer. + */ + if (cmd->cmd_cb) + cmd->cmd_cb(ddev, cmd->cb_context, ERR_PTR(-ENODEV)); + kfree(cmd->mdaa_params); kfree(cmd); } -- cgit v1.2.3 From af66df0f53b9120437556d8eb00d70a36e791258 Mon Sep 17 00:00:00 2001 From: Thierry Escande Date: Thu, 16 Jun 2016 20:24:44 +0200 Subject: NFC: digital: Set the command pending flag There is a flag in the command structure indicating that this command is pending. It was checked before sending the command to not send the same command twice but it was actually never set. This is now fixed. Signed-off-by: Thierry Escande Signed-off-by: Samuel Ortiz --- net/nfc/digital_core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index 6e0b255aec66..0146e42ee28f 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -176,6 +176,8 @@ static void digital_wq_cmd(struct work_struct *work) return; } + cmd->pending = 1; + mutex_unlock(&ddev->cmd_lock); if (cmd->req) -- cgit v1.2.3 From 3cc952dbf1a7176b9247da4cd2612c9ddc1d1b51 Mon Sep 17 00:00:00 2001 From: Thierry Escande Date: Thu, 16 Jun 2016 20:24:45 +0200 Subject: NFC: digital: Abort last command when dep link goes down With this patch, the Digital Protocol layer abort the last issued command when the dep link goes down. That way it does not have to wait for the driver to reply with a timeout error before sending a new command (i.e. a start poll command if constant polling is on). Signed-off-by: Thierry Escande Signed-off-by: Samuel Ortiz --- net/nfc/digital_core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index 0146e42ee28f..0fd5518bf252 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -612,6 +612,8 @@ static int digital_dep_link_down(struct nfc_dev *nfc_dev) { struct nfc_digital_dev *ddev = nfc_get_drvdata(nfc_dev); + digital_abort_cmd(ddev); + ddev->curr_protocol = 0; return 0; -- cgit v1.2.3 From 88b99d0b7af859bbd97e76d66527f107843340a5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 1 Jul 2016 08:27:42 +0100 Subject: rxrpc: Fix some sparse errors Fix the following sparse errors: ../net/rxrpc/conn_object.c:77:17: warning: incorrect type in assignment (different base types) ../net/rxrpc/conn_object.c:77:17: expected restricted __be32 [usertype] call_id ../net/rxrpc/conn_object.c:77:17: got unsigned int [unsigned] [usertype] call_id ../net/rxrpc/conn_object.c:84:21: warning: restricted __be32 degrades to integer ../net/rxrpc/conn_object.c:86:26: warning: restricted __be32 degrades to integer ../net/rxrpc/conn_object.c:357:15: warning: incorrect type in assignment (different base types) ../net/rxrpc/conn_object.c:357:15: expected restricted __be32 [usertype] epoch ../net/rxrpc/conn_object.c:357:15: got unsigned int [unsigned] [usertype] epoch ../net/rxrpc/conn_object.c:369:21: warning: restricted __be32 degrades to integer ../net/rxrpc/conn_object.c:371:26: warning: restricted __be32 degrades to integer ../net/rxrpc/conn_object.c:411:21: warning: restricted __be32 degrades to integer ../net/rxrpc/conn_object.c:413:26: warning: restricted __be32 degrades to integer Signed-off-by: David Howells --- net/rxrpc/conn_object.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 4bfad7cf96cb..c86a3cff7585 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -70,7 +70,7 @@ static void rxrpc_add_call_ID_to_conn(struct rxrpc_connection *conn, { struct rxrpc_call *xcall; struct rb_node *parent, **p; - __be32 call_id; + u32 call_id; write_lock_bh(&conn->lock); @@ -347,8 +347,7 @@ struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *local, struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rb_node *p, **pp; const char *new = "old"; - __be32 epoch; - u32 cid; + u32 epoch, cid; _enter(""); -- cgit v1.2.3 From 689f4c646d6a8f0730eec11e06e5909de0b5d5d2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 30 Jun 2016 11:34:30 +0100 Subject: rxrpc: Check the source of a packet to a client conn When looking up a client connection to which to route a packet, we need to check that the packet came from the correct source so that a peer can't try to muck around with another peer's connection. Signed-off-by: David Howells --- net/rxrpc/conn_object.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index c86a3cff7585..2c2456ff2853 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -508,7 +508,9 @@ struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_local *local, } } else { conn = idr_find(&rxrpc_client_conn_ids, cid >> RXRPC_CIDSHIFT); - if (conn && conn->proto.epoch == epoch) + if (conn && + conn->proto.epoch == epoch && + conn->params.peer == peer) goto found; } -- cgit v1.2.3 From a263629da519b2064588377416e067727e2cbdf9 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 26 Jun 2016 14:55:24 -0700 Subject: rxrpc: Avoid using stack memory in SG lists in rxkad rxkad uses stack memory in SG lists which would not work if stacks were allocated from vmalloc memory. In fact, in most cases this isn't even necessary as the stack memory ends up getting copied over to kmalloc memory. This patch eliminates all the unnecessary stack memory uses by supplying the final destination directly to the crypto API. In two instances where a temporary buffer is actually needed we also switch use a scratch area in the rxrpc_call struct (only one DATA packet will be being secured or verified at a time). Finally there is no need to split a split-page buffer into two SG entries so code dealing with that has been removed. Signed-off-by: Herbert Xu Signed-off-by: Andy Lutomirski Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 8 +-- net/rxrpc/conn_event.c | 5 +- net/rxrpc/conn_object.c | 6 +- net/rxrpc/insecure.c | 7 ++- net/rxrpc/rxkad.c | 150 ++++++++++++++++-------------------------------- 5 files changed, 65 insertions(+), 111 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 702db72196fb..796368d1fb25 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -141,17 +141,16 @@ struct rxrpc_security { int (*init_connection_security)(struct rxrpc_connection *); /* prime a connection's packet security */ - void (*prime_packet_security)(struct rxrpc_connection *); + int (*prime_packet_security)(struct rxrpc_connection *); /* impose security on a packet */ - int (*secure_packet)(const struct rxrpc_call *, + int (*secure_packet)(struct rxrpc_call *, struct sk_buff *, size_t, void *); /* verify the security on a received packet */ - int (*verify_packet)(const struct rxrpc_call *, struct sk_buff *, - u32 *); + int (*verify_packet)(struct rxrpc_call *, struct sk_buff *, u32 *); /* issue a challenge */ int (*issue_challenge)(struct rxrpc_connection *); @@ -399,6 +398,7 @@ struct rxrpc_call { struct sk_buff_head rx_oos_queue; /* packets received out of sequence */ struct sk_buff *tx_pending; /* Tx socket buffer being filled */ wait_queue_head_t tx_waitq; /* wait for Tx window space to become available */ + __be32 crypto_buf[2]; /* Temporary packet crypto buffer */ unsigned long user_call_ID; /* user-defined call ID */ unsigned long creation_jif; /* time of call creation */ unsigned long flags; diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index bf6971555eac..6a3c96707831 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -188,7 +188,10 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, if (ret < 0) return ret; - conn->security->prime_packet_security(conn); + ret = conn->security->prime_packet_security(conn); + if (ret < 0) + return ret; + read_lock_bh(&conn->lock); spin_lock(&conn->state_lock); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 2c2456ff2853..c2c0926af546 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -138,7 +138,9 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) if (ret < 0) goto error_1; - conn->security->prime_packet_security(conn); + ret = conn->security->prime_packet_security(conn); + if (ret < 0) + goto error_2; write_lock(&rxrpc_connection_lock); list_add_tail(&conn->link, &rxrpc_connections); @@ -152,6 +154,8 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) _leave(" = %p", conn); return conn; +error_2: + conn->security->clear(conn); error_1: rxrpc_put_client_connection_id(conn); error_0: diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index e571403613c1..c21ad213b337 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -17,11 +17,12 @@ static int none_init_connection_security(struct rxrpc_connection *conn) return 0; } -static void none_prime_packet_security(struct rxrpc_connection *conn) +static int none_prime_packet_security(struct rxrpc_connection *conn) { + return 0; } -static int none_secure_packet(const struct rxrpc_call *call, +static int none_secure_packet(struct rxrpc_call *call, struct sk_buff *skb, size_t data_size, void *sechdr) @@ -29,7 +30,7 @@ static int none_secure_packet(const struct rxrpc_call *call, return 0; } -static int none_verify_packet(const struct rxrpc_call *call, +static int none_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, u32 *_abort_code) { diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 23c05ec6fa28..3acc7c1241d4 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -103,43 +103,43 @@ error: * prime the encryption state with the invariant parts of a connection's * description */ -static void rxkad_prime_packet_security(struct rxrpc_connection *conn) +static int rxkad_prime_packet_security(struct rxrpc_connection *conn) { struct rxrpc_key_token *token; SKCIPHER_REQUEST_ON_STACK(req, conn->cipher); - struct scatterlist sg[2]; + struct scatterlist sg; struct rxrpc_crypt iv; - struct { - __be32 x[4]; - } tmpbuf __attribute__((aligned(16))); /* must all be in same page */ + __be32 *tmpbuf; + size_t tmpsize = 4 * sizeof(__be32); _enter(""); if (!conn->params.key) - return; + return 0; + + tmpbuf = kmalloc(tmpsize, GFP_KERNEL); + if (!tmpbuf) + return -ENOMEM; token = conn->params.key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); - tmpbuf.x[0] = htonl(conn->proto.epoch); - tmpbuf.x[1] = htonl(conn->proto.cid); - tmpbuf.x[2] = 0; - tmpbuf.x[3] = htonl(conn->security_ix); - - sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf)); - sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf)); + tmpbuf[0] = htonl(conn->proto.epoch); + tmpbuf[1] = htonl(conn->proto.cid); + tmpbuf[2] = 0; + tmpbuf[3] = htonl(conn->security_ix); + sg_init_one(&sg, tmpbuf, tmpsize); skcipher_request_set_tfm(req, conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(tmpbuf), iv.x); - + skcipher_request_set_crypt(req, &sg, &sg, tmpsize, iv.x); crypto_skcipher_encrypt(req); skcipher_request_zero(req); - memcpy(&conn->csum_iv, &tmpbuf.x[2], sizeof(conn->csum_iv)); - ASSERTCMP((u32 __force)conn->csum_iv.n[0], ==, (u32 __force)tmpbuf.x[2]); - - _leave(""); + memcpy(&conn->csum_iv, tmpbuf + 2, sizeof(conn->csum_iv)); + kfree(tmpbuf); + _leave(" = 0"); + return 0; } /* @@ -152,12 +152,9 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, { struct rxrpc_skb_priv *sp; SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); + struct rxkad_level1_hdr hdr; struct rxrpc_crypt iv; - struct scatterlist sg[2]; - struct { - struct rxkad_level1_hdr hdr; - __be32 first; /* first four bytes of data and padding */ - } tmpbuf __attribute__((aligned(8))); /* must all be in same page */ + struct scatterlist sg; u16 check; sp = rxrpc_skb(skb); @@ -167,24 +164,19 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, check = sp->hdr.seq ^ sp->hdr.callNumber; data_size |= (u32)check << 16; - tmpbuf.hdr.data_size = htonl(data_size); - memcpy(&tmpbuf.first, sechdr + 4, sizeof(tmpbuf.first)); + hdr.data_size = htonl(data_size); + memcpy(sechdr, &hdr, sizeof(hdr)); /* start the encryption afresh */ memset(&iv, 0, sizeof(iv)); - sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf)); - sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf)); - + sg_init_one(&sg, sechdr, 8); skcipher_request_set_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(tmpbuf), iv.x); - + skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); crypto_skcipher_encrypt(req); skcipher_request_zero(req); - memcpy(sechdr, &tmpbuf, sizeof(tmpbuf)); - _leave(" = 0"); return 0; } @@ -198,8 +190,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, void *sechdr) { const struct rxrpc_key_token *token; - struct rxkad_level2_hdr rxkhdr - __attribute__((aligned(8))); /* must be all on one page */ + struct rxkad_level2_hdr rxkhdr; struct rxrpc_skb_priv *sp; SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_crypt iv; @@ -218,18 +209,16 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, rxkhdr.data_size = htonl(data_size | (u32)check << 16); rxkhdr.checksum = 0; + memcpy(sechdr, &rxkhdr, sizeof(rxkhdr)); /* encrypt from the session key */ token = call->conn->params.key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); sg_init_one(&sg[0], sechdr, sizeof(rxkhdr)); - sg_init_one(&sg[1], &rxkhdr, sizeof(rxkhdr)); - skcipher_request_set_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(rxkhdr), iv.x); - + skcipher_request_set_crypt(req, &sg[0], &sg[0], sizeof(rxkhdr), iv.x); crypto_skcipher_encrypt(req); /* we want to encrypt the skbuff in-place */ @@ -243,9 +232,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, sg_init_table(sg, nsg); skb_to_sgvec(skb, sg, 0, len); - skcipher_request_set_crypt(req, sg, sg, len, iv.x); - crypto_skcipher_encrypt(req); _leave(" = 0"); @@ -259,7 +246,7 @@ out: /* * checksum an RxRPC packet header */ -static int rxkad_secure_packet(const struct rxrpc_call *call, +static int rxkad_secure_packet(struct rxrpc_call *call, struct sk_buff *skb, size_t data_size, void *sechdr) @@ -267,10 +254,7 @@ static int rxkad_secure_packet(const struct rxrpc_call *call, struct rxrpc_skb_priv *sp; SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_crypt iv; - struct scatterlist sg[2]; - struct { - __be32 x[2]; - } tmpbuf __attribute__((aligned(8))); /* must all be in same page */ + struct scatterlist sg; u32 x, y; int ret; @@ -293,20 +277,17 @@ static int rxkad_secure_packet(const struct rxrpc_call *call, /* calculate the security checksum */ x = call->channel << (32 - RXRPC_CIDSHIFT); x |= sp->hdr.seq & 0x3fffffff; - tmpbuf.x[0] = htonl(sp->hdr.callNumber); - tmpbuf.x[1] = htonl(x); - - sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf)); - sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf)); + call->crypto_buf[0] = htonl(sp->hdr.callNumber); + call->crypto_buf[1] = htonl(x); + sg_init_one(&sg, call->crypto_buf, 8); skcipher_request_set_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(tmpbuf), iv.x); - + skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); crypto_skcipher_encrypt(req); skcipher_request_zero(req); - y = ntohl(tmpbuf.x[1]); + y = ntohl(call->crypto_buf[1]); y = (y >> 16) & 0xffff; if (y == 0) y = 1; /* zero checksums are not permitted */ @@ -367,7 +348,6 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call, skcipher_request_set_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, 8, iv.x); - crypto_skcipher_decrypt(req); skcipher_request_zero(req); @@ -452,7 +432,6 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, skcipher_request_set_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, skb->len, iv.x); - crypto_skcipher_decrypt(req); skcipher_request_zero(req); if (sg != _sg) @@ -498,17 +477,14 @@ nomem: /* * verify the security on a received packet */ -static int rxkad_verify_packet(const struct rxrpc_call *call, +static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, u32 *_abort_code) { SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_skb_priv *sp; struct rxrpc_crypt iv; - struct scatterlist sg[2]; - struct { - __be32 x[2]; - } tmpbuf __attribute__((aligned(8))); /* must all be in same page */ + struct scatterlist sg; u16 cksum; u32 x, y; int ret; @@ -533,20 +509,17 @@ static int rxkad_verify_packet(const struct rxrpc_call *call, /* validate the security checksum */ x = call->channel << (32 - RXRPC_CIDSHIFT); x |= sp->hdr.seq & 0x3fffffff; - tmpbuf.x[0] = htonl(call->call_id); - tmpbuf.x[1] = htonl(x); - - sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf)); - sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf)); + call->crypto_buf[0] = htonl(call->call_id); + call->crypto_buf[1] = htonl(x); + sg_init_one(&sg, call->crypto_buf, 8); skcipher_request_set_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(tmpbuf), iv.x); - + skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); crypto_skcipher_encrypt(req); skcipher_request_zero(req); - y = ntohl(tmpbuf.x[1]); + y = ntohl(call->crypto_buf[1]); cksum = (y >> 16) & 0xffff; if (cksum == 0) cksum = 1; /* zero checksums are not permitted */ @@ -709,29 +682,6 @@ static void rxkad_calc_response_checksum(struct rxkad_response *response) response->encrypted.checksum = htonl(csum); } -/* - * load a scatterlist with a potentially split-page buffer - */ -static void rxkad_sg_set_buf2(struct scatterlist sg[2], - void *buf, size_t buflen) -{ - int nsg = 1; - - sg_init_table(sg, 2); - - sg_set_buf(&sg[0], buf, buflen); - if (sg[0].offset + buflen > PAGE_SIZE) { - /* the buffer was split over two pages */ - sg[0].length = PAGE_SIZE - sg[0].offset; - sg_set_buf(&sg[1], buf + sg[0].length, buflen - sg[0].length); - nsg++; - } - - sg_mark_end(&sg[nsg - 1]); - - ASSERTCMP(sg[0].length + sg[1].length, ==, buflen); -} - /* * encrypt the response packet */ @@ -741,17 +691,16 @@ static void rxkad_encrypt_response(struct rxrpc_connection *conn, { SKCIPHER_REQUEST_ON_STACK(req, conn->cipher); struct rxrpc_crypt iv; - struct scatterlist sg[2]; + struct scatterlist sg[1]; /* continue encrypting from where we left off */ memcpy(&iv, s2->session_key, sizeof(iv)); - rxkad_sg_set_buf2(sg, &resp->encrypted, sizeof(resp->encrypted)); - + sg_init_table(sg, 1); + sg_set_buf(sg, &resp->encrypted, sizeof(resp->encrypted)); skcipher_request_set_tfm(req, conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x); - crypto_skcipher_encrypt(req); skcipher_request_zero(req); } @@ -887,10 +836,8 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, } sg_init_one(&sg[0], ticket, ticket_len); - skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, ticket_len, iv.x); - crypto_skcipher_decrypt(req); skcipher_request_free(req); @@ -1001,7 +948,7 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn, const struct rxrpc_crypt *session_key) { SKCIPHER_REQUEST_ON_STACK(req, rxkad_ci); - struct scatterlist sg[2]; + struct scatterlist sg[1]; struct rxrpc_crypt iv; _enter(",,%08x%08x", @@ -1016,12 +963,11 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn, memcpy(&iv, session_key, sizeof(iv)); - rxkad_sg_set_buf2(sg, &resp->encrypted, sizeof(resp->encrypted)); - + sg_init_table(sg, 1); + sg_set_buf(sg, &resp->encrypted, sizeof(resp->encrypted)); skcipher_request_set_tfm(req, rxkad_ci); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x); - crypto_skcipher_decrypt(req); skcipher_request_zero(req); -- cgit v1.2.3 From 5acbee4648789ba1fe9e7942280fb1966c76bd6f Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 27 Jun 2016 10:32:02 +0100 Subject: rxrpc: Provide queuing helper functions Provide queueing helper functions so that the queueing of local and connection objects can be fixed later. The issue is that a ref on the object needs to be passed to the work queue, but the act of queueing the object may fail because the object is already queued. Testing the queuedness of an object before hand doesn't work because there can be a race with someone else trying to queue it. What will have to be done is to adjust the refcount depending on the result of the queue operation. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 12 +++++++++++- net/rxrpc/conn_event.c | 2 +- net/rxrpc/input.c | 2 +- 3 files changed, 13 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 796368d1fb25..45aef3ef7609 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -35,7 +35,6 @@ struct rxrpc_crypt { queue_delayed_work(rxrpc_workqueue, (WS), (D)) #define rxrpc_queue_call(CALL) rxrpc_queue_work(&(CALL)->processor) -#define rxrpc_queue_conn(CONN) rxrpc_queue_work(&(CONN)->processor) struct rxrpc_connection; @@ -566,6 +565,12 @@ static inline void rxrpc_get_connection(struct rxrpc_connection *conn) atomic_inc(&conn->usage); } + +static inline void rxrpc_queue_conn(struct rxrpc_connection *conn) +{ + rxrpc_queue_work(&conn->processor); +} + /* * input.c */ @@ -618,6 +623,11 @@ static inline void rxrpc_put_local(struct rxrpc_local *local) __rxrpc_put_local(local); } +static inline void rxrpc_queue_local(struct rxrpc_local *local) +{ + rxrpc_queue_work(&local->processor); +} + /* * misc.c */ diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 6a3c96707831..d7e183c6b5df 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -318,7 +318,7 @@ void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) CHECK_SLAB_OKAY(&local->usage); skb_queue_tail(&local->reject_queue, skb); - rxrpc_queue_work(&local->processor); + rxrpc_queue_local(local); } /* diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 5f26cae43069..fe7ff339d7e5 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -595,7 +595,7 @@ static void rxrpc_post_packet_to_local(struct rxrpc_local *local, _enter("%p,%p", local, skb); skb_queue_tail(&local->event_queue, skb); - rxrpc_queue_work(&local->processor); + rxrpc_queue_local(local); } /* -- cgit v1.2.3 From bba304db34ec3ca0d13e7f48e5a4e9896536cacc Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 27 Jun 2016 10:32:02 +0100 Subject: rxrpc: Turn connection #defines into enums and put outside struct def Turn the connection event and state #define lists into enums and move outside of the struct definition. Whilst we're at it, change _SERVER to _SERVICE in those identifiers and add EV_ into the event name to distinguish them from flags and states. Also add a symbol indicating the number of states and use that in the state text array. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 42 ++++++++++++++++++++++++++++++------------ net/rxrpc/call_accept.c | 6 +++--- net/rxrpc/conn_event.c | 6 +++--- net/rxrpc/conn_object.c | 4 ++-- net/rxrpc/proc.c | 18 +++++++++--------- 5 files changed, 47 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 45aef3ef7609..3f0d0479a4da 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -253,6 +253,35 @@ struct rxrpc_conn_parameters { u32 security_level; /* Security level selected */ }; +/* + * Bits in the connection flags. + */ +enum rxrpc_conn_flag { + RXRPC_CONN_HAS_IDR, /* Has a client conn ID assigned */ +}; + +/* + * Events that can be raised upon a connection. + */ +enum rxrpc_conn_event { + RXRPC_CONN_EV_CHALLENGE, /* Send challenge packet */ +}; + +/* + * The connection protocol state. + */ +enum rxrpc_conn_proto_state { + RXRPC_CONN_UNUSED, /* Connection not yet attempted */ + RXRPC_CONN_CLIENT, /* Client connection */ + RXRPC_CONN_SERVICE_UNSECURED, /* Service unsecured connection */ + RXRPC_CONN_SERVICE_CHALLENGING, /* Service challenging for security */ + RXRPC_CONN_SERVICE, /* Service secured connection */ + RXRPC_CONN_REMOTELY_ABORTED, /* Conn aborted by peer */ + RXRPC_CONN_LOCALLY_ABORTED, /* Conn aborted locally */ + RXRPC_CONN_NETWORK_ERROR, /* Conn terminated by network error */ + RXRPC_CONN__NR_STATES +}; + /* * RxRPC connection definition * - matched by { local, peer, epoch, conn_id, direction } @@ -279,23 +308,12 @@ struct rxrpc_connection { struct crypto_skcipher *cipher; /* encryption handle */ struct rxrpc_crypt csum_iv; /* packet checksum base */ unsigned long flags; -#define RXRPC_CONN_HAS_IDR 0 /* - Has a client conn ID assigned */ unsigned long events; -#define RXRPC_CONN_CHALLENGE 0 /* send challenge packet */ unsigned long put_time; /* Time at which last put */ rwlock_t lock; /* access lock */ spinlock_t state_lock; /* state-change lock */ atomic_t usage; - enum { /* current state of connection */ - RXRPC_CONN_UNUSED, /* - connection not yet attempted */ - RXRPC_CONN_CLIENT, /* - client connection */ - RXRPC_CONN_SERVER_UNSECURED, /* - server unsecured connection */ - RXRPC_CONN_SERVER_CHALLENGING, /* - server challenging for security */ - RXRPC_CONN_SERVER, /* - server secured connection */ - RXRPC_CONN_REMOTELY_ABORTED, /* - conn aborted by peer */ - RXRPC_CONN_LOCALLY_ABORTED, /* - conn aborted locally */ - RXRPC_CONN_NETWORK_ERROR, /* - conn terminated by network error */ - } state; + enum rxrpc_conn_proto_state state : 8; /* current state of connection */ u32 local_abort; /* local abort code */ u32 remote_abort; /* remote abort code */ int error; /* local error incurred */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 202e053a3c6d..1c0860df150e 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -128,12 +128,12 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local, spin_lock(&call->conn->state_lock); if (sp->hdr.securityIndex > 0 && - call->conn->state == RXRPC_CONN_SERVER_UNSECURED) { + call->conn->state == RXRPC_CONN_SERVICE_UNSECURED) { _debug("await conn sec"); list_add_tail(&call->accept_link, &rx->secureq); - call->conn->state = RXRPC_CONN_SERVER_CHALLENGING; + call->conn->state = RXRPC_CONN_SERVICE_CHALLENGING; rxrpc_get_connection(call->conn); - set_bit(RXRPC_CONN_CHALLENGE, &call->conn->events); + set_bit(RXRPC_CONN_EV_CHALLENGE, &call->conn->events); rxrpc_queue_conn(call->conn); } else { _debug("conn ready"); diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index d7e183c6b5df..b9c39b83eddb 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -195,8 +195,8 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, read_lock_bh(&conn->lock); spin_lock(&conn->state_lock); - if (conn->state == RXRPC_CONN_SERVER_CHALLENGING) { - conn->state = RXRPC_CONN_SERVER; + if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) { + conn->state = RXRPC_CONN_SERVICE; for (loop = 0; loop < RXRPC_MAXCALLS; loop++) rxrpc_call_is_secure(conn->channels[loop]); } @@ -268,7 +268,7 @@ void rxrpc_process_connection(struct work_struct *work) rxrpc_get_connection(conn); - if (test_and_clear_bit(RXRPC_CONN_CHALLENGE, &conn->events)) { + if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events)) { rxrpc_secure_connection(conn); rxrpc_put_connection(conn); } diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index c2c0926af546..0e022dfab034 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -399,9 +399,9 @@ struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *local, candidate->params.service_id = sp->hdr.serviceId; candidate->security_ix = sp->hdr.securityIndex; candidate->out_clientflag = 0; - candidate->state = RXRPC_CONN_SERVER; + candidate->state = RXRPC_CONN_SERVICE; if (candidate->params.service_id) - candidate->state = RXRPC_CONN_SERVER_UNSECURED; + candidate->state = RXRPC_CONN_SERVICE_UNSECURED; write_lock_bh(&peer->conn_lock); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 500cdcdc843c..2a25ab425b6f 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -14,15 +14,15 @@ #include #include "ar-internal.h" -static const char *const rxrpc_conn_states[] = { - [RXRPC_CONN_UNUSED] = "Unused ", - [RXRPC_CONN_CLIENT] = "Client ", - [RXRPC_CONN_SERVER_UNSECURED] = "SvUnsec ", - [RXRPC_CONN_SERVER_CHALLENGING] = "SvChall ", - [RXRPC_CONN_SERVER] = "SvSecure", - [RXRPC_CONN_REMOTELY_ABORTED] = "RmtAbort", - [RXRPC_CONN_LOCALLY_ABORTED] = "LocAbort", - [RXRPC_CONN_NETWORK_ERROR] = "NetError", +static const char *const rxrpc_conn_states[RXRPC_CONN__NR_STATES] = { + [RXRPC_CONN_UNUSED] = "Unused ", + [RXRPC_CONN_CLIENT] = "Client ", + [RXRPC_CONN_SERVICE_UNSECURED] = "SvUnsec ", + [RXRPC_CONN_SERVICE_CHALLENGING] = "SvChall ", + [RXRPC_CONN_SERVICE] = "SvSecure", + [RXRPC_CONN_REMOTELY_ABORTED] = "RmtAbort", + [RXRPC_CONN_LOCALLY_ABORTED] = "LocAbort", + [RXRPC_CONN_NETWORK_ERROR] = "NetError", }; /* -- cgit v1.2.3 From eb9b9d22754d1926771a22638e81384d517c6ce5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 27 Jun 2016 10:32:02 +0100 Subject: rxrpc: Check that the client conns cache is empty before module removal Check that the client conns cache is empty before module removal and bug if not, listing any offending connections that are still present. Unfortunately, if there are connections still around, then the transport socket is still unexpectedly open and active, so we can't just unallocate the connections. Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 3 +-- net/rxrpc/ar-internal.h | 1 + net/rxrpc/conn_client.c | 19 +++++++++++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 5d3e795a7c48..d5073eb02498 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -807,8 +807,7 @@ static void __exit af_rxrpc_exit(void) _debug("synchronise RCU"); rcu_barrier(); _debug("destroy locals"); - ASSERT(idr_is_empty(&rxrpc_client_conn_ids)); - idr_destroy(&rxrpc_client_conn_ids); + rxrpc_destroy_client_conn_ids(); rxrpc_destroy_all_locals(); remove_proc_entry("rxrpc_conns", init_net.proc_net); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 3f0d0479a4da..6583a8399c89 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -541,6 +541,7 @@ extern struct idr rxrpc_client_conn_ids; int rxrpc_get_client_connection_id(struct rxrpc_connection *, gfp_t); void rxrpc_put_client_connection_id(struct rxrpc_connection *); +void rxrpc_destroy_client_conn_ids(void); /* * conn_event.c diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 82488d6adb83..be437d5e90ce 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -92,3 +92,22 @@ void rxrpc_put_client_connection_id(struct rxrpc_connection *conn) spin_unlock(&rxrpc_conn_id_lock); } } + +/* + * Destroy the client connection ID tree. + */ +void rxrpc_destroy_client_conn_ids(void) +{ + struct rxrpc_connection *conn; + int id; + + if (!idr_is_empty(&rxrpc_client_conn_ids)) { + idr_for_each_entry(&rxrpc_client_conn_ids, conn, id) { + pr_err("AF_RXRPC: Leaked client conn %p {%d}\n", + conn, atomic_read(&conn->usage)); + } + BUG(); + } + + idr_destroy(&rxrpc_client_conn_ids); +} -- cgit v1.2.3 From 2c4579e4b1d5a6219522c6e970500b2fd43fe1f8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 27 Jun 2016 10:32:03 +0100 Subject: rxrpc: Move usage count getting into rxrpc_queue_conn() Rather than calling rxrpc_get_connection() manually before calling rxrpc_queue_conn(), do it inside the queue wrapper. This allows us to do some important fixes: (1) If the usage count is 0, do nothing. This prevents connections from being reanimated once they're dead. (2) If rxrpc_queue_work() fails because the work item is already queued, retract the usage count increment which would otherwise be lost. (3) Don't take a ref on the connection in the work function. By passing the ref through the work item, this is unnecessary. Doing it in the work function is too late anyway. Previously, connection-directed packets held a ref on the connection, but that's not really the best idea. And another useful changes: (*) Don't need to take a refcount on the connection in the data_ready handler unless we invoke the connection's work item. We're using RCU there so that's otherwise redundant. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 9 ++++++++- net/rxrpc/call_accept.c | 1 - net/rxrpc/conn_event.c | 8 +------- net/rxrpc/input.c | 1 - 4 files changed, 9 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 6583a8399c89..9fc89cdc6ae3 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -584,10 +584,17 @@ static inline void rxrpc_get_connection(struct rxrpc_connection *conn) atomic_inc(&conn->usage); } +static inline +struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *conn) +{ + return atomic_inc_not_zero(&conn->usage) ? conn : NULL; +} static inline void rxrpc_queue_conn(struct rxrpc_connection *conn) { - rxrpc_queue_work(&conn->processor); + if (rxrpc_get_connection_maybe(conn) && + !rxrpc_queue_work(&conn->processor)) + rxrpc_put_connection(conn); } /* diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 1c0860df150e..5367dbe9b96f 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -132,7 +132,6 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local, _debug("await conn sec"); list_add_tail(&call->accept_link, &rx->secureq); call->conn->state = RXRPC_CONN_SERVICE_CHALLENGING; - rxrpc_get_connection(call->conn); set_bit(RXRPC_CONN_EV_CHALLENGE, &call->conn->events); rxrpc_queue_conn(call->conn); } else { diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index b9c39b83eddb..9ceddd3fd5db 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -266,12 +266,8 @@ void rxrpc_process_connection(struct work_struct *work) _enter("{%d}", conn->debug_id); - rxrpc_get_connection(conn); - - if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events)) { + if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events)) rxrpc_secure_connection(conn); - rxrpc_put_connection(conn); - } /* go through the conn-level event packets, releasing the ref on this * connection that each one has when we've finished with it */ @@ -286,7 +282,6 @@ void rxrpc_process_connection(struct work_struct *work) goto requeue_and_leave; case -ECONNABORTED: default: - rxrpc_put_connection(conn); rxrpc_free_skb(skb); break; } @@ -304,7 +299,6 @@ requeue_and_leave: protocol_error: if (rxrpc_abort_connection(conn, -ret, abort_code) < 0) goto requeue_and_leave; - rxrpc_put_connection(conn); rxrpc_free_skb(skb); _leave(" [EPROTO]"); goto out; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index fe7ff339d7e5..b993f2dc5a09 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -580,7 +580,6 @@ static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, { _enter("%p,%p", conn, skb); - rxrpc_get_connection(conn); skb_queue_tail(&conn->rx_queue, skb); rxrpc_queue_conn(conn); } -- cgit v1.2.3 From d1e858c5a392a50c16ce36624203032bdeb3595b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 4 Apr 2016 14:00:39 +0100 Subject: rxrpc: Fix handling of connection failure in client call creation If rxrpc_connect_call() fails during the creation of a client connection, there are two bugs that we can hit that need fixing: (1) The call state should be moved to RXRPC_CALL_DEAD before the call cleanup phase is invoked. If not, this can cause an assertion failure later. (2) call->link should be reinitialised after being deleted in rxrpc_new_client_call() - which otherwise leads to a failure later when the call cleanup attempts to delete the link again. Signed-off-by: David Howells --- net/rxrpc/call_object.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index ad933daae13b..6223a7ed831f 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -425,9 +425,10 @@ error: rxrpc_put_call(call); write_lock_bh(&rxrpc_call_lock); - list_del(&call->link); + list_del_init(&call->link); write_unlock_bh(&rxrpc_call_lock); + call->state = RXRPC_CALL_DEAD; rxrpc_put_call(call); _leave(" = %d", ret); return ERR_PTR(ret); @@ -439,6 +440,7 @@ error: */ found_user_ID_now_present: write_unlock(&rx->call_lock); + call->state = RXRPC_CALL_DEAD; rxrpc_put_call(call); _leave(" = -EEXIST [%p]", call); return ERR_PTR(-EEXIST); -- cgit v1.2.3 From e653cfe49cec540529217933e07caf6c0f25ac93 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 4 Apr 2016 14:00:38 +0100 Subject: rxrpc: Release a call's connection ref on call disconnection When a call is disconnected, clear the call's pointer to the connection and release the associated ref on that connection. This means that the call no longer pins the connection and the connection can be discarded even before the call is. As the code currently stands, the call struct is effectively pinned by userspace until userspace has enacted a recvmsg() to retrieve the final call state as sk_buffs on the receive queue pin the call to which they're related because: (1) The rxrpc_call struct contains the userspace ID that recvmsg() has to include in the control message buffer to indicate which call is being referred to. This ID must remain valid until the terminal packet is completely read and must be invalidated immediately at that point as userspace is entitled to immediately reuse it. (2) The final ACK to the reply to a client call isn't sent until the last data packet is entirely read (it's probably worth altering this in future to be send the ACK as soon as all the data has been received). This change requires a bit of rearrangement to make sure that the call isn't going to try and access the connection again after protocol completion: (1) Delete the error link earlier when we're releasing the call. Possibly network errors should be distributed via connections at the cost of adding in an access to the rxrpc_connection struct. (2) Remove the call from the connection's call tree before disconnecting the call. The call tree needs to be removed anyway and incoming packets delivered by channel pointer instead. (3) The release call event should be considered last after all other events have been processed so that we don't need access to the connection again. (4) Move the channel_lock taking from rxrpc_release_call() to rxrpc_disconnect_call() where it will be required in future. Signed-off-by: David Howells --- net/rxrpc/call_event.c | 10 +++++----- net/rxrpc/call_object.c | 26 +++++++++----------------- net/rxrpc/conn_object.c | 8 ++++++++ 3 files changed, 22 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 0ba84295f913..638d66df284a 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -858,11 +858,6 @@ void rxrpc_process_call(struct work_struct *work) iov[0].iov_len = sizeof(whdr); /* deal with events of a final nature */ - if (test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { - rxrpc_release_call(call); - clear_bit(RXRPC_CALL_EV_RELEASE, &call->events); - } - if (test_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events)) { enum rxrpc_skb_mark mark; int error; @@ -1144,6 +1139,11 @@ void rxrpc_process_call(struct work_struct *work) goto maybe_reschedule; } + if (test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { + rxrpc_release_call(call); + clear_bit(RXRPC_CALL_EV_RELEASE, &call->events); + } + /* other events may have been raised since we started checking */ goto maybe_reschedule; diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 6223a7ed831f..b43d89c89744 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -628,6 +628,10 @@ void rxrpc_release_call(struct rxrpc_call *call) */ _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn); + spin_lock(&conn->params.peer->lock); + hlist_del_init(&call->error_link); + spin_unlock(&conn->params.peer->lock); + write_lock_bh(&rx->call_lock); if (!list_empty(&call->accept_link)) { _debug("unlinking once-pending call %p { e=%lx f=%lx }", @@ -643,25 +647,22 @@ void rxrpc_release_call(struct rxrpc_call *call) write_unlock_bh(&rx->call_lock); /* free up the channel for reuse */ - spin_lock(&conn->channel_lock); write_lock_bh(&conn->lock); write_lock(&call->state_lock); - rxrpc_disconnect_call(call); - - spin_unlock(&conn->channel_lock); - if (call->state < RXRPC_CALL_COMPLETE && call->state != RXRPC_CALL_CLIENT_FINAL_ACK) { _debug("+++ ABORTING STATE %d +++\n", call->state); call->state = RXRPC_CALL_LOCALLY_ABORTED; call->local_abort = RX_CALL_DEAD; - set_bit(RXRPC_CALL_EV_ABORT, &call->events); - rxrpc_queue_call(call); } write_unlock(&call->state_lock); + + rb_erase(&call->conn_node, &conn->calls); write_unlock_bh(&conn->lock); + rxrpc_disconnect_call(call); + /* clean up the Rx queue */ if (!skb_queue_empty(&call->rx_queue) || !skb_queue_empty(&call->rx_oos_queue)) { @@ -817,16 +818,7 @@ static void rxrpc_cleanup_call(struct rxrpc_call *call) return; } - if (call->conn) { - spin_lock(&call->conn->params.peer->lock); - hlist_del_init(&call->error_link); - spin_unlock(&call->conn->params.peer->lock); - - write_lock_bh(&call->conn->lock); - rb_erase(&call->conn_node, &call->conn->calls); - write_unlock_bh(&call->conn->lock); - rxrpc_put_connection(call->conn); - } + ASSERTCMP(call->conn, ==, NULL); /* Remove the call from the hash */ rxrpc_call_hash_del(call); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 0e022dfab034..99d18107421f 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -540,11 +540,19 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) _enter("%d,%d", conn->debug_id, call->channel); + spin_lock(&conn->channel_lock); + if (conn->channels[chan] == call) { rcu_assign_pointer(conn->channels[chan], NULL); atomic_inc(&conn->avail_chans); wake_up(&conn->channel_wq); } + + spin_unlock(&conn->channel_lock); + + call->conn = NULL; + rxrpc_put_connection(conn); + _leave(""); } /* -- cgit v1.2.3 From dee46364ce6fd0815ad9da625783eda21ccf7b06 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 27 Jun 2016 17:11:19 +0100 Subject: rxrpc: Add RCU destruction for connections and calls Add RCU destruction for connections and calls as the RCU lookup from the transport socket data_ready handler is going to come along shortly. Whilst we're at it, move the cleanup workqueue flushing and RCU barrierage into the destruction code for the objects that need it (locals and connections) and add the extra RCU barrier required for connection cleanup. Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 19 ------------------- net/rxrpc/ar-internal.h | 4 +++- net/rxrpc/call_object.c | 18 +++++++++++++++--- net/rxrpc/conn_event.c | 5 ++++- net/rxrpc/conn_object.c | 31 +++++++++++++++++++++++++++---- net/rxrpc/local_object.c | 19 +++++++++++-------- 6 files changed, 60 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index d5073eb02498..d6e4e3b69dc3 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -788,26 +788,7 @@ static void __exit af_rxrpc_exit(void) proto_unregister(&rxrpc_proto); rxrpc_destroy_all_calls(); rxrpc_destroy_all_connections(); - ASSERTCMP(atomic_read(&rxrpc_n_skbs), ==, 0); - - /* We need to flush the scheduled work twice because the local endpoint - * records involve a work item in their destruction as they can only be - * destroyed from process context. However, a connection may have a - * work item outstanding - and this will pin the local endpoint record - * until the connection goes away. - * - * Peers don't pin locals and calls pin sockets - which prevents the - * module from being unloaded - so we should only need two flushes. - */ - _debug("flush scheduled work"); - flush_workqueue(rxrpc_workqueue); - _debug("flush scheduled work 2"); - flush_workqueue(rxrpc_workqueue); - _debug("synchronise RCU"); - rcu_barrier(); - _debug("destroy locals"); - rxrpc_destroy_client_conn_ids(); rxrpc_destroy_all_locals(); remove_proc_entry("rxrpc_conns", init_net.proc_net); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 9fc89cdc6ae3..b401fa9d7963 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -292,9 +292,10 @@ struct rxrpc_connection { struct rxrpc_conn_parameters params; spinlock_t channel_lock; - struct rxrpc_call *channels[RXRPC_MAXCALLS]; /* active calls */ + struct rxrpc_call __rcu *channels[RXRPC_MAXCALLS]; /* active calls */ wait_queue_head_t channel_wq; /* queue to wait for channel to become available */ + struct rcu_head rcu; struct work_struct processor; /* connection event processor */ union { struct rb_node client_node; /* Node in local->client_conns */ @@ -398,6 +399,7 @@ enum rxrpc_call_state { * - matched by { connection, call_id } */ struct rxrpc_call { + struct rcu_head rcu; struct rxrpc_connection *conn; /* connection carrying call */ struct rxrpc_sock *socket; /* socket responsible */ struct timer_list lifetimer; /* lifetime remaining on call */ diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index b43d89c89744..2c6c57c0d52c 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -480,7 +480,8 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, write_lock_bh(&conn->lock); /* set the channel for this call */ - call = conn->channels[candidate->channel]; + call = rcu_dereference_protected(conn->channels[candidate->channel], + lockdep_is_held(&conn->lock)); _debug("channel[%u] is %p", candidate->channel, call); if (call && call->call_id == sp->hdr.callNumber) { /* already set; must've been a duplicate packet */ @@ -544,7 +545,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, candidate = NULL; rb_link_node(&call->conn_node, parent, p); rb_insert_color(&call->conn_node, &conn->calls); - conn->channels[call->channel] = call; + rcu_assign_pointer(conn->channels[call->channel], call); sock_hold(&rx->sk); rxrpc_get_connection(conn); write_unlock_bh(&conn->lock); @@ -794,6 +795,17 @@ void __rxrpc_put_call(struct rxrpc_call *call) _leave(""); } +/* + * Final call destruction under RCU. + */ +static void rxrpc_rcu_destroy_call(struct rcu_head *rcu) +{ + struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu); + + rxrpc_purge_queue(&call->rx_queue); + kmem_cache_free(rxrpc_call_jar, call); +} + /* * clean up a call */ @@ -849,7 +861,7 @@ static void rxrpc_cleanup_call(struct rxrpc_call *call) rxrpc_purge_queue(&call->rx_queue); ASSERT(skb_queue_empty(&call->rx_oos_queue)); sock_put(&call->socket->sk); - kmem_cache_free(rxrpc_call_jar, call); + call_rcu(&call->rcu, rxrpc_rcu_destroy_call); } /* diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 9ceddd3fd5db..f6ca8c5c4496 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -198,7 +198,10 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) { conn->state = RXRPC_CONN_SERVICE; for (loop = 0; loop < RXRPC_MAXCALLS; loop++) - rxrpc_call_is_secure(conn->channels[loop]); + rxrpc_call_is_secure( + rcu_dereference_protected( + conn->channels[loop], + lockdep_is_held(&conn->lock))); } spin_unlock(&conn->state_lock); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 99d18107421f..0165a629388b 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -542,7 +542,7 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) spin_lock(&conn->channel_lock); - if (conn->channels[chan] == call) { + if (rcu_access_pointer(conn->channels[chan]) == call) { rcu_assign_pointer(conn->channels[chan], NULL); atomic_inc(&conn->avail_chans); wake_up(&conn->channel_wq); @@ -580,9 +580,12 @@ void rxrpc_put_connection(struct rxrpc_connection *conn) /* * destroy a virtual connection */ -static void rxrpc_destroy_connection(struct rxrpc_connection *conn) +static void rxrpc_destroy_connection(struct rcu_head *rcu) { - _enter("%p{%d}", conn, atomic_read(&conn->usage)); + struct rxrpc_connection *conn = + container_of(rcu, struct rxrpc_connection, rcu); + + _enter("{%d,u=%d}", conn->debug_id, atomic_read(&conn->usage)); ASSERTCMP(atomic_read(&conn->usage), ==, 0); @@ -677,7 +680,8 @@ static void rxrpc_connection_reaper(struct work_struct *work) list_del_init(&conn->link); ASSERTCMP(atomic_read(&conn->usage), ==, 0); - rxrpc_destroy_connection(conn); + skb_queue_purge(&conn->rx_queue); + call_rcu(&conn->rcu, rxrpc_destroy_connection); } _leave(""); @@ -689,11 +693,30 @@ static void rxrpc_connection_reaper(struct work_struct *work) */ void __exit rxrpc_destroy_all_connections(void) { + struct rxrpc_connection *conn, *_p; + bool leak = false; + _enter(""); rxrpc_connection_expiry = 0; cancel_delayed_work(&rxrpc_connection_reap); rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); + flush_workqueue(rxrpc_workqueue); + + write_lock(&rxrpc_connection_lock); + list_for_each_entry_safe(conn, _p, &rxrpc_connections, link) { + pr_err("AF_RXRPC: Leaked conn %p {%d}\n", + conn, atomic_read(&conn->usage)); + leak = true; + } + write_unlock(&rxrpc_connection_lock); + BUG_ON(leak); + + /* Make sure the local and peer records pinned by any dying connections + * are released. + */ + rcu_barrier(); + rxrpc_destroy_client_conn_ids(); _leave(""); } diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 3ab7764f7cd8..a753796fbe8f 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -374,14 +374,17 @@ void __exit rxrpc_destroy_all_locals(void) _enter(""); - if (list_empty(&rxrpc_local_endpoints)) - return; + flush_workqueue(rxrpc_workqueue); - mutex_lock(&rxrpc_local_mutex); - list_for_each_entry(local, &rxrpc_local_endpoints, link) { - pr_err("AF_RXRPC: Leaked local %p {%d}\n", - local, atomic_read(&local->usage)); + if (!list_empty(&rxrpc_local_endpoints)) { + mutex_lock(&rxrpc_local_mutex); + list_for_each_entry(local, &rxrpc_local_endpoints, link) { + pr_err("AF_RXRPC: Leaked local %p {%d}\n", + local, atomic_read(&local->usage)); + } + mutex_unlock(&rxrpc_local_mutex); + BUG(); } - mutex_unlock(&rxrpc_local_mutex); - BUG(); + + rcu_barrier(); } -- cgit v1.2.3 From 30b515f4d1cf31f6901c1fa61d920f651ebc07d7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 28 Jun 2016 16:58:36 +0100 Subject: rxrpc: Access socket accept queue under right lock The socket's accept queue (socket->acceptq) should be accessed under socket->call_lock, not under the connection lock. Signed-off-by: David Howells --- net/rxrpc/call_event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 638d66df284a..fc32aa5764a2 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -1089,7 +1089,7 @@ void rxrpc_process_call(struct work_struct *work) if (call->state == RXRPC_CALL_SERVER_SECURING) { _debug("securing"); - write_lock(&call->conn->lock); + write_lock(&call->socket->call_lock); if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && !test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { _debug("not released"); @@ -1097,7 +1097,7 @@ void rxrpc_process_call(struct work_struct *work) list_move_tail(&call->accept_link, &call->socket->acceptq); } - write_unlock(&call->conn->lock); + write_unlock(&call->socket->call_lock); read_lock(&call->state_lock); if (call->state < RXRPC_CALL_COMPLETE) set_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events); -- cgit v1.2.3 From a1399f8bb0331a1f50c76c4cac738fe57679b9bb Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 27 Jun 2016 14:39:44 +0100 Subject: rxrpc: Call channels should have separate call number spaces Each channel on a connection has a separate, independent number space from which to allocate callNumber values. It is entirely possible, for example, to have a connection with four active calls, each with call number 1. Note that the callNumber values for any particular channel don't have to start at 1, but they are supposed to increment monotonically for that channel from a client's perspective and may not be reused once the call number is transmitted (until the epoch cycles all the way back round). Currently, however, call numbers are allocated on a per-connection basis and, further, are held in an rb-tree. The rb-tree is redundant as the four channel pointers in the rxrpc_connection struct are entirely capable of pointing to all the calls currently in progress on a connection. To this end, make the following changes: (1) Handle call number allocation independently per channel. (2) Get rid of the conn->calls rb-tree. This is overkill as a connection may have a maximum of four calls in progress at any one time. Use the pointers in the channels[] array instead, indexed by the channel number from the packet. (3) For each channel, save the result of the last call that was in progress on that channel in conn->channels[] so that the final ACK or ABORT packet can be replayed if necessary. Any call earlier than that is just ignored. If we've seen the next call number in a packet, the last one is most definitely defunct. (4) When generating a RESPONSE packet for a connection, the call number counter for each channel must be included in it. (5) When parsing a RESPONSE packet for a connection, the call number counters contained therein should be used to set the minimum expected call numbers on each channel. To do in future commits: (1) Replay terminal packets based on the last call stored in conn->channels[]. (2) Connections should be retired before the callNumber space on any channel runs out. (3) A server is expected to disregard or reject any new incoming call that has a call number less than the current call number counter. The call number counter for that channel must be advanced to the new call number. Note that the server cannot just require that the next call that it sees on a channel be exactly the call number counter + 1 because then there's a scenario that could cause a problem: The client transmits a packet to initiate a connection, the network goes out, the server sends an ACK (which gets lost), the client sends an ABORT (which also gets lost); the network then reconnects, the client then reuses the call number for the next call (it doesn't know the server already saw the call number), but the server thinks it already has the first packet of this call (it doesn't know that the client doesn't know that it saw the call number the first time). Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 14 +++++---- net/rxrpc/call_object.c | 60 +++++++++++++++----------------------- net/rxrpc/conn_event.c | 24 ++++++++------- net/rxrpc/conn_object.c | 77 +++++++++++++++++++------------------------------ net/rxrpc/proc.c | 5 ++-- net/rxrpc/rxkad.c | 41 +++++++++++++++++--------- 6 files changed, 104 insertions(+), 117 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index b401fa9d7963..b697654340a8 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -292,7 +292,14 @@ struct rxrpc_connection { struct rxrpc_conn_parameters params; spinlock_t channel_lock; - struct rxrpc_call __rcu *channels[RXRPC_MAXCALLS]; /* active calls */ + + struct rxrpc_channel { + struct rxrpc_call __rcu *call; /* Active call */ + u32 call_id; /* ID of current call */ + u32 call_counter; /* Call ID counter */ + u32 last_call; /* ID of last call */ + u32 last_result; /* Result of last call (0/abort) */ + } channels[RXRPC_MAXCALLS]; wait_queue_head_t channel_wq; /* queue to wait for channel to become available */ struct rcu_head rcu; @@ -302,7 +309,6 @@ struct rxrpc_connection { struct rb_node service_node; /* Node in peer->service_conns */ }; struct list_head link; /* link in master connection list */ - struct rb_root calls; /* calls on this connection */ struct sk_buff_head rx_queue; /* received conn-level packets */ const struct rxrpc_security *security; /* applied security module */ struct key *server_key; /* security for this service */ @@ -311,7 +317,6 @@ struct rxrpc_connection { unsigned long flags; unsigned long events; unsigned long put_time; /* Time at which last put */ - rwlock_t lock; /* access lock */ spinlock_t state_lock; /* state-change lock */ atomic_t usage; enum rxrpc_conn_proto_state state : 8; /* current state of connection */ @@ -319,7 +324,6 @@ struct rxrpc_connection { u32 remote_abort; /* remote abort code */ int error; /* local error incurred */ int debug_id; /* debug ID for printks */ - unsigned int call_counter; /* call ID counter */ atomic_t serial; /* packet serial number counter */ atomic_t hi_serial; /* highest serial number received */ atomic_t avail_chans; /* number of channels available */ @@ -412,7 +416,6 @@ struct rxrpc_call { struct hlist_node error_link; /* link in error distribution list */ struct list_head accept_link; /* calls awaiting acceptance */ struct rb_node sock_node; /* node in socket call tree */ - struct rb_node conn_node; /* node in connection call tree */ struct sk_buff_head rx_queue; /* received packets */ struct sk_buff_head rx_oos_queue; /* packets received out of sequence */ struct sk_buff *tx_pending; /* Tx socket buffer being filled */ @@ -564,6 +567,7 @@ int rxrpc_connect_call(struct rxrpc_call *, struct rxrpc_conn_parameters *, struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_local *, struct rxrpc_peer *, struct sk_buff *); +void __rxrpc_disconnect_call(struct rxrpc_call *); void rxrpc_disconnect_call(struct rxrpc_call *); void rxrpc_put_connection(struct rxrpc_connection *); void __exit rxrpc_destroy_all_connections(void); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 2c6c57c0d52c..3f278721269e 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -456,8 +456,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_call *call, *candidate; - struct rb_node **p, *parent; - u32 call_id; + u32 call_id, chan; _enter(",%d", conn->debug_id); @@ -467,21 +466,23 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, if (!candidate) return ERR_PTR(-EBUSY); + chan = sp->hdr.cid & RXRPC_CHANNELMASK; candidate->socket = rx; candidate->conn = conn; candidate->cid = sp->hdr.cid; candidate->call_id = sp->hdr.callNumber; - candidate->channel = sp->hdr.cid & RXRPC_CHANNELMASK; + candidate->channel = chan; candidate->rx_data_post = 0; candidate->state = RXRPC_CALL_SERVER_ACCEPTING; if (conn->security_ix > 0) candidate->state = RXRPC_CALL_SERVER_SECURING; - write_lock_bh(&conn->lock); + spin_lock(&conn->channel_lock); /* set the channel for this call */ - call = rcu_dereference_protected(conn->channels[candidate->channel], - lockdep_is_held(&conn->lock)); + call = rcu_dereference_protected(conn->channels[chan].call, + lockdep_is_held(&conn->channel_lock)); + _debug("channel[%u] is %p", candidate->channel, call); if (call && call->call_id == sp->hdr.callNumber) { /* already set; must've been a duplicate packet */ @@ -510,9 +511,9 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, call->debug_id, rxrpc_call_states[call->state]); if (call->state >= RXRPC_CALL_COMPLETE) { - conn->channels[call->channel] = NULL; + __rxrpc_disconnect_call(call); } else { - write_unlock_bh(&conn->lock); + spin_unlock(&conn->channel_lock); kmem_cache_free(rxrpc_call_jar, candidate); _leave(" = -EBUSY"); return ERR_PTR(-EBUSY); @@ -522,33 +523,22 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, /* check the call number isn't duplicate */ _debug("check dup"); call_id = sp->hdr.callNumber; - p = &conn->calls.rb_node; - parent = NULL; - while (*p) { - parent = *p; - call = rb_entry(parent, struct rxrpc_call, conn_node); - - /* The tree is sorted in order of the __be32 value without - * turning it into host order. - */ - if (call_id < call->call_id) - p = &(*p)->rb_left; - else if (call_id > call->call_id) - p = &(*p)->rb_right; - else - goto old_call; - } + + /* We just ignore calls prior to the current call ID. Terminated calls + * are handled via the connection. + */ + if (call_id <= conn->channels[chan].call_counter) + goto old_call; /* TODO: Just drop packet */ /* make the call available */ _debug("new call"); call = candidate; candidate = NULL; - rb_link_node(&call->conn_node, parent, p); - rb_insert_color(&call->conn_node, &conn->calls); - rcu_assign_pointer(conn->channels[call->channel], call); + conn->channels[chan].call_counter = call_id; + rcu_assign_pointer(conn->channels[chan].call, call); sock_hold(&rx->sk); rxrpc_get_connection(conn); - write_unlock_bh(&conn->lock); + spin_unlock(&conn->channel_lock); spin_lock(&conn->params.peer->lock); hlist_add_head(&call->error_link, &conn->params.peer->error_targets); @@ -588,19 +578,19 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, return call; extant_call: - write_unlock_bh(&conn->lock); + spin_unlock(&conn->channel_lock); kmem_cache_free(rxrpc_call_jar, candidate); _leave(" = %p {%d} [extant]", call, call ? call->debug_id : -1); return call; aborted_call: - write_unlock_bh(&conn->lock); + spin_unlock(&conn->channel_lock); kmem_cache_free(rxrpc_call_jar, candidate); _leave(" = -ECONNABORTED"); return ERR_PTR(-ECONNABORTED); old_call: - write_unlock_bh(&conn->lock); + spin_unlock(&conn->channel_lock); kmem_cache_free(rxrpc_call_jar, candidate); _leave(" = -ECONNRESET [old]"); return ERR_PTR(-ECONNRESET); @@ -648,8 +638,7 @@ void rxrpc_release_call(struct rxrpc_call *call) write_unlock_bh(&rx->call_lock); /* free up the channel for reuse */ - write_lock_bh(&conn->lock); - write_lock(&call->state_lock); + write_lock_bh(&call->state_lock); if (call->state < RXRPC_CALL_COMPLETE && call->state != RXRPC_CALL_CLIENT_FINAL_ACK) { @@ -657,10 +646,7 @@ void rxrpc_release_call(struct rxrpc_call *call) call->state = RXRPC_CALL_LOCALLY_ABORTED; call->local_abort = RX_CALL_DEAD; } - write_unlock(&call->state_lock); - - rb_erase(&call->conn_node, &conn->calls); - write_unlock_bh(&conn->lock); + write_unlock_bh(&call->state_lock); rxrpc_disconnect_call(call); diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index f6ca8c5c4496..cee0f35bc1cf 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -31,15 +31,17 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state, u32 abort_code) { struct rxrpc_call *call; - struct rb_node *p; + int i; _enter("{%d},%x", conn->debug_id, abort_code); - read_lock_bh(&conn->lock); + spin_lock(&conn->channel_lock); - for (p = rb_first(&conn->calls); p; p = rb_next(p)) { - call = rb_entry(p, struct rxrpc_call, conn_node); - write_lock(&call->state_lock); + for (i = 0; i < RXRPC_MAXCALLS; i++) { + call = rcu_dereference_protected( + conn->channels[i].call, + lockdep_is_held(&conn->channel_lock)); + write_lock_bh(&call->state_lock); if (call->state <= RXRPC_CALL_COMPLETE) { call->state = state; if (state == RXRPC_CALL_LOCALLY_ABORTED) { @@ -51,10 +53,10 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state, } rxrpc_queue_call(call); } - write_unlock(&call->state_lock); + write_unlock_bh(&call->state_lock); } - read_unlock_bh(&conn->lock); + spin_unlock(&conn->channel_lock); _leave(""); } @@ -192,7 +194,7 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, if (ret < 0) return ret; - read_lock_bh(&conn->lock); + spin_lock(&conn->channel_lock); spin_lock(&conn->state_lock); if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) { @@ -200,12 +202,12 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, for (loop = 0; loop < RXRPC_MAXCALLS; loop++) rxrpc_call_is_secure( rcu_dereference_protected( - conn->channels[loop], - lockdep_is_held(&conn->lock))); + conn->channels[loop].call, + lockdep_is_held(&conn->channel_lock))); } spin_unlock(&conn->state_lock); - read_unlock_bh(&conn->lock); + spin_unlock(&conn->channel_lock); return 0; default: diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 0165a629388b..ce83f3e44da2 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -46,10 +46,8 @@ static struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) init_waitqueue_head(&conn->channel_wq); INIT_WORK(&conn->processor, &rxrpc_process_connection); INIT_LIST_HEAD(&conn->link); - conn->calls = RB_ROOT; skb_queue_head_init(&conn->rx_queue); conn->security = &rxrpc_no_security; - rwlock_init(&conn->lock); spin_lock_init(&conn->state_lock); atomic_set(&conn->usage, 1); conn->debug_id = atomic_inc_return(&rxrpc_debug_id); @@ -62,39 +60,6 @@ static struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) return conn; } -/* - * add a call to a connection's call-by-ID tree - */ -static void rxrpc_add_call_ID_to_conn(struct rxrpc_connection *conn, - struct rxrpc_call *call) -{ - struct rxrpc_call *xcall; - struct rb_node *parent, **p; - u32 call_id; - - write_lock_bh(&conn->lock); - - call_id = call->call_id; - p = &conn->calls.rb_node; - parent = NULL; - while (*p) { - parent = *p; - xcall = rb_entry(parent, struct rxrpc_call, conn_node); - - if (call_id < xcall->call_id) - p = &(*p)->rb_left; - else if (call_id > xcall->call_id) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&call->conn_node, parent, p); - rb_insert_color(&call->conn_node, &conn->calls); - - write_unlock_bh(&conn->lock); -} - /* * Allocate a client connection. The caller must take care to clear any * padding bytes in *cp. @@ -277,12 +242,12 @@ found_channel: call->channel = chan; call->epoch = conn->proto.epoch; call->cid = conn->proto.cid | chan; - call->call_id = ++conn->call_counter; - rcu_assign_pointer(conn->channels[chan], call); + call->call_id = ++conn->channels[chan].call_counter; + conn->channels[chan].call_id = call->call_id; + rcu_assign_pointer(conn->channels[chan].call, call); _net("CONNECT call %d on conn %d", call->debug_id, conn->debug_id); - rxrpc_add_call_ID_to_conn(conn, call); spin_unlock(&conn->channel_lock); rxrpc_put_peer(cp->peer); cp->peer = NULL; @@ -326,7 +291,7 @@ found_extant_conn: spin_lock(&conn->channel_lock); for (chan = 0; chan < RXRPC_MAXCALLS; chan++) - if (!conn->channels[chan]) + if (!conn->channels[chan].call) goto found_channel; BUG(); @@ -531,28 +496,47 @@ found: /* * Disconnect a call and clear any channel it occupies when that call - * terminates. + * terminates. The caller must hold the channel_lock and must release the + * call's ref on the connection. */ -void rxrpc_disconnect_call(struct rxrpc_call *call) +void __rxrpc_disconnect_call(struct rxrpc_call *call) { struct rxrpc_connection *conn = call->conn; - unsigned chan = call->channel; + struct rxrpc_channel *chan = &conn->channels[call->channel]; _enter("%d,%d", conn->debug_id, call->channel); - spin_lock(&conn->channel_lock); + if (rcu_access_pointer(chan->call) == call) { + /* Save the result of the call so that we can repeat it if necessary + * through the channel, whilst disposing of the actual call record. + */ + chan->last_result = call->local_abort; + smp_wmb(); + chan->last_call = chan->call_id; + chan->call_id = chan->call_counter; - if (rcu_access_pointer(conn->channels[chan]) == call) { - rcu_assign_pointer(conn->channels[chan], NULL); + rcu_assign_pointer(chan->call, NULL); atomic_inc(&conn->avail_chans); wake_up(&conn->channel_wq); } + _leave(""); +} + +/* + * Disconnect a call and clear any channel it occupies when that call + * terminates. + */ +void rxrpc_disconnect_call(struct rxrpc_call *call) +{ + struct rxrpc_connection *conn = call->conn; + + spin_lock(&conn->channel_lock); + __rxrpc_disconnect_call(call); spin_unlock(&conn->channel_lock); call->conn = NULL; rxrpc_put_connection(conn); - _leave(""); } /* @@ -591,7 +575,6 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu) _net("DESTROY CONN %d", conn->debug_id); - ASSERT(RB_EMPTY_ROOT(&conn->calls)); rxrpc_purge_queue(&conn->rx_queue); conn->security->clear(conn); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 2a25ab425b6f..ced5f07444e5 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -137,7 +137,7 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) if (v == &rxrpc_connections) { seq_puts(seq, "Proto Local Remote " - " SvID ConnID Calls End Use State Key " + " SvID ConnID End Use State Key " " Serial ISerial\n" ); return 0; @@ -154,13 +154,12 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) ntohs(conn->params.peer->srx.transport.sin.sin_port)); seq_printf(seq, - "UDP %-22.22s %-22.22s %4x %08x %08x %s %3u" + "UDP %-22.22s %-22.22s %4x %08x %s %3u" " %s %08x %08x %08x\n", lbuff, rbuff, conn->params.service_id, conn->proto.cid, - conn->call_counter, rxrpc_conn_is_service(conn) ? "Svc" : "Clt", atomic_read(&conn->usage), rxrpc_conn_states[conn->state], diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 3acc7c1241d4..63afa9e9cc08 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -767,14 +767,10 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, resp.kvno = htonl(token->kad->kvno); resp.ticket_len = htonl(token->kad->ticket_len); - resp.encrypted.call_id[0] = - htonl(conn->channels[0] ? conn->channels[0]->call_id : 0); - resp.encrypted.call_id[1] = - htonl(conn->channels[1] ? conn->channels[1]->call_id : 0); - resp.encrypted.call_id[2] = - htonl(conn->channels[2] ? conn->channels[2]->call_id : 0); - resp.encrypted.call_id[3] = - htonl(conn->channels[3] ? conn->channels[3]->call_id : 0); + resp.encrypted.call_id[0] = htonl(conn->channels[0].call_counter); + resp.encrypted.call_id[1] = htonl(conn->channels[1].call_counter); + resp.encrypted.call_id[2] = htonl(conn->channels[2].call_counter); + resp.encrypted.call_id[3] = htonl(conn->channels[3].call_counter); /* calculate the response checksum and then do the encryption */ rxkad_calc_response_checksum(&resp); @@ -991,7 +987,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, void *ticket; u32 abort_code, version, kvno, ticket_len, level; __be32 csum; - int ret; + int ret, i; _enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key)); @@ -1054,11 +1050,26 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, if (response.encrypted.checksum != csum) goto protocol_error_free; - if (ntohl(response.encrypted.call_id[0]) > INT_MAX || - ntohl(response.encrypted.call_id[1]) > INT_MAX || - ntohl(response.encrypted.call_id[2]) > INT_MAX || - ntohl(response.encrypted.call_id[3]) > INT_MAX) - goto protocol_error_free; + spin_lock(&conn->channel_lock); + for (i = 0; i < RXRPC_MAXCALLS; i++) { + struct rxrpc_call *call; + u32 call_id = ntohl(response.encrypted.call_id[i]); + + if (call_id > INT_MAX) + goto protocol_error_unlock; + + if (call_id < conn->channels[i].call_counter) + goto protocol_error_unlock; + if (call_id > conn->channels[i].call_counter) { + call = rcu_dereference_protected( + conn->channels[i].call, + lockdep_is_held(&conn->channel_lock)); + if (call && call->state < RXRPC_CALL_COMPLETE) + goto protocol_error_unlock; + conn->channels[i].call_counter = call_id; + } + } + spin_unlock(&conn->channel_lock); abort_code = RXKADOUTOFSEQUENCE; if (ntohl(response.encrypted.inc_nonce) != conn->security_nonce + 1) @@ -1083,6 +1094,8 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, _leave(" = 0"); return 0; +protocol_error_unlock: + spin_unlock(&conn->channel_lock); protocol_error_free: kfree(ticket); protocol_error: -- cgit v1.2.3 From c6d2b8d764f5edd79f708bdc49d1176072ee77a1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 4 Apr 2016 14:00:40 +0100 Subject: rxrpc: Split client connection code out into its own file Split the client-specific connection code out into its own file. It will behave somewhat differently from the service-specific connection code, so it makes sense to separate them. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 6 +- net/rxrpc/conn_client.c | 248 +++++++++++++++++++++++++++++++++++++++++++++++- net/rxrpc/conn_object.c | 247 +---------------------------------------------- 3 files changed, 251 insertions(+), 250 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index b697654340a8..021d28b54282 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -544,9 +544,10 @@ void __exit rxrpc_destroy_all_calls(void); */ extern struct idr rxrpc_client_conn_ids; -int rxrpc_get_client_connection_id(struct rxrpc_connection *, gfp_t); void rxrpc_put_client_connection_id(struct rxrpc_connection *); void rxrpc_destroy_client_conn_ids(void); +int rxrpc_connect_call(struct rxrpc_call *, struct rxrpc_conn_parameters *, + struct sockaddr_rxrpc *, gfp_t); /* * conn_event.c @@ -562,8 +563,7 @@ extern unsigned int rxrpc_connection_expiry; extern struct list_head rxrpc_connections; extern rwlock_t rxrpc_connection_lock; -int rxrpc_connect_call(struct rxrpc_call *, struct rxrpc_conn_parameters *, - struct sockaddr_rxrpc *, gfp_t); +struct rxrpc_connection *rxrpc_alloc_connection(gfp_t); struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_local *, struct rxrpc_peer *, struct sk_buff *); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index be437d5e90ce..9180164a51aa 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -33,7 +33,8 @@ static DEFINE_SPINLOCK(rxrpc_conn_id_lock); * client conns away from the current allocation point to try and keep the IDs * concentrated. We will also need to retire connections from an old epoch. */ -int rxrpc_get_client_connection_id(struct rxrpc_connection *conn, gfp_t gfp) +static int rxrpc_get_client_connection_id(struct rxrpc_connection *conn, + gfp_t gfp) { u32 epoch; int id; @@ -111,3 +112,248 @@ void rxrpc_destroy_client_conn_ids(void) idr_destroy(&rxrpc_client_conn_ids); } + +/* + * Allocate a client connection. The caller must take care to clear any + * padding bytes in *cp. + */ +static struct rxrpc_connection * +rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) +{ + struct rxrpc_connection *conn; + int ret; + + _enter(""); + + conn = rxrpc_alloc_connection(gfp); + if (!conn) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + conn->params = *cp; + conn->proto.local = cp->local; + conn->proto.epoch = rxrpc_epoch; + conn->proto.cid = 0; + conn->proto.in_clientflag = 0; + conn->proto.family = cp->peer->srx.transport.family; + conn->out_clientflag = RXRPC_CLIENT_INITIATED; + conn->state = RXRPC_CONN_CLIENT; + + switch (conn->proto.family) { + case AF_INET: + conn->proto.addr_size = sizeof(conn->proto.ipv4_addr); + conn->proto.ipv4_addr = cp->peer->srx.transport.sin.sin_addr; + conn->proto.port = cp->peer->srx.transport.sin.sin_port; + break; + } + + ret = rxrpc_get_client_connection_id(conn, gfp); + if (ret < 0) + goto error_0; + + ret = rxrpc_init_client_conn_security(conn); + if (ret < 0) + goto error_1; + + ret = conn->security->prime_packet_security(conn); + if (ret < 0) + goto error_2; + + write_lock(&rxrpc_connection_lock); + list_add_tail(&conn->link, &rxrpc_connections); + write_unlock(&rxrpc_connection_lock); + + /* We steal the caller's peer ref. */ + cp->peer = NULL; + rxrpc_get_local(conn->params.local); + key_get(conn->params.key); + + _leave(" = %p", conn); + return conn; + +error_2: + conn->security->clear(conn); +error_1: + rxrpc_put_client_connection_id(conn); +error_0: + kfree(conn); + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * find a connection for a call + * - called in process context with IRQs enabled + */ +int rxrpc_connect_call(struct rxrpc_call *call, + struct rxrpc_conn_parameters *cp, + struct sockaddr_rxrpc *srx, + gfp_t gfp) +{ + struct rxrpc_connection *conn, *candidate = NULL; + struct rxrpc_local *local = cp->local; + struct rb_node *p, **pp, *parent; + long diff; + int chan; + + DECLARE_WAITQUEUE(myself, current); + + _enter("{%d,%lx},", call->debug_id, call->user_call_ID); + + cp->peer = rxrpc_lookup_peer(cp->local, srx, gfp); + if (!cp->peer) + return -ENOMEM; + + if (!cp->exclusive) { + /* Search for a existing client connection unless this is going + * to be a connection that's used exclusively for a single call. + */ + _debug("search 1"); + spin_lock(&local->client_conns_lock); + p = local->client_conns.rb_node; + while (p) { + conn = rb_entry(p, struct rxrpc_connection, client_node); + +#define cmp(X) ((long)conn->params.X - (long)cp->X) + diff = (cmp(peer) ?: + cmp(key) ?: + cmp(security_level)); + if (diff < 0) + p = p->rb_left; + else if (diff > 0) + p = p->rb_right; + else + goto found_extant_conn; + } + spin_unlock(&local->client_conns_lock); + } + + /* We didn't find a connection or we want an exclusive one. */ + _debug("get new conn"); + candidate = rxrpc_alloc_client_connection(cp, gfp); + if (!candidate) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + if (cp->exclusive) { + /* Assign the call on an exclusive connection to channel 0 and + * don't add the connection to the endpoint's shareable conn + * lookup tree. + */ + _debug("exclusive chan 0"); + conn = candidate; + atomic_set(&conn->avail_chans, RXRPC_MAXCALLS - 1); + spin_lock(&conn->channel_lock); + chan = 0; + goto found_channel; + } + + /* We need to redo the search before attempting to add a new connection + * lest we race with someone else adding a conflicting instance. + */ + _debug("search 2"); + spin_lock(&local->client_conns_lock); + + pp = &local->client_conns.rb_node; + parent = NULL; + while (*pp) { + parent = *pp; + conn = rb_entry(parent, struct rxrpc_connection, client_node); + + diff = (cmp(peer) ?: + cmp(key) ?: + cmp(security_level)); + if (diff < 0) + pp = &(*pp)->rb_left; + else if (diff > 0) + pp = &(*pp)->rb_right; + else + goto found_extant_conn; + } + + /* The second search also failed; simply add the new connection with + * the new call in channel 0. Note that we need to take the channel + * lock before dropping the client conn lock. + */ + _debug("new conn"); + conn = candidate; + candidate = NULL; + + rb_link_node(&conn->client_node, parent, pp); + rb_insert_color(&conn->client_node, &local->client_conns); + + atomic_set(&conn->avail_chans, RXRPC_MAXCALLS - 1); + spin_lock(&conn->channel_lock); + spin_unlock(&local->client_conns_lock); + chan = 0; + +found_channel: + _debug("found chan"); + call->conn = conn; + call->channel = chan; + call->epoch = conn->proto.epoch; + call->cid = conn->proto.cid | chan; + call->call_id = ++conn->channels[chan].call_counter; + conn->channels[chan].call_id = call->call_id; + rcu_assign_pointer(conn->channels[chan].call, call); + + _net("CONNECT call %d on conn %d", call->debug_id, conn->debug_id); + + spin_unlock(&conn->channel_lock); + rxrpc_put_peer(cp->peer); + cp->peer = NULL; + _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage)); + return 0; + + /* We found a suitable connection already in existence. Discard any + * candidate we may have allocated, and try to get a channel on this + * one. + */ +found_extant_conn: + _debug("found conn"); + rxrpc_get_connection(conn); + spin_unlock(&local->client_conns_lock); + + rxrpc_put_connection(candidate); + + if (!atomic_add_unless(&conn->avail_chans, -1, 0)) { + if (!gfpflags_allow_blocking(gfp)) { + rxrpc_put_connection(conn); + _leave(" = -EAGAIN"); + return -EAGAIN; + } + + add_wait_queue(&conn->channel_wq, &myself); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (atomic_add_unless(&conn->avail_chans, -1, 0)) + break; + if (signal_pending(current)) + goto interrupted; + schedule(); + } + remove_wait_queue(&conn->channel_wq, &myself); + __set_current_state(TASK_RUNNING); + } + + /* The connection allegedly now has a free channel and we can now + * attach the call to it. + */ + spin_lock(&conn->channel_lock); + + for (chan = 0; chan < RXRPC_MAXCALLS; chan++) + if (!conn->channels[chan].call) + goto found_channel; + BUG(); + +interrupted: + remove_wait_queue(&conn->channel_wq, &myself); + __set_current_state(TASK_RUNNING); + rxrpc_put_connection(conn); + rxrpc_put_peer(cp->peer); + cp->peer = NULL; + _leave(" = -ERESTARTSYS"); + return -ERESTARTSYS; +} diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index ce83f3e44da2..ab5c8c2960e4 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -34,7 +34,7 @@ static DECLARE_DELAYED_WORK(rxrpc_connection_reap, rxrpc_connection_reaper); /* * allocate a new connection */ -static struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) +struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) { struct rxrpc_connection *conn; @@ -60,251 +60,6 @@ static struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) return conn; } -/* - * Allocate a client connection. The caller must take care to clear any - * padding bytes in *cp. - */ -static struct rxrpc_connection * -rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) -{ - struct rxrpc_connection *conn; - int ret; - - _enter(""); - - conn = rxrpc_alloc_connection(gfp); - if (!conn) { - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); - } - - conn->params = *cp; - conn->proto.local = cp->local; - conn->proto.epoch = rxrpc_epoch; - conn->proto.cid = 0; - conn->proto.in_clientflag = 0; - conn->proto.family = cp->peer->srx.transport.family; - conn->out_clientflag = RXRPC_CLIENT_INITIATED; - conn->state = RXRPC_CONN_CLIENT; - - switch (conn->proto.family) { - case AF_INET: - conn->proto.addr_size = sizeof(conn->proto.ipv4_addr); - conn->proto.ipv4_addr = cp->peer->srx.transport.sin.sin_addr; - conn->proto.port = cp->peer->srx.transport.sin.sin_port; - break; - } - - ret = rxrpc_get_client_connection_id(conn, gfp); - if (ret < 0) - goto error_0; - - ret = rxrpc_init_client_conn_security(conn); - if (ret < 0) - goto error_1; - - ret = conn->security->prime_packet_security(conn); - if (ret < 0) - goto error_2; - - write_lock(&rxrpc_connection_lock); - list_add_tail(&conn->link, &rxrpc_connections); - write_unlock(&rxrpc_connection_lock); - - /* We steal the caller's peer ref. */ - cp->peer = NULL; - rxrpc_get_local(conn->params.local); - key_get(conn->params.key); - - _leave(" = %p", conn); - return conn; - -error_2: - conn->security->clear(conn); -error_1: - rxrpc_put_client_connection_id(conn); -error_0: - kfree(conn); - _leave(" = %d", ret); - return ERR_PTR(ret); -} - -/* - * find a connection for a call - * - called in process context with IRQs enabled - */ -int rxrpc_connect_call(struct rxrpc_call *call, - struct rxrpc_conn_parameters *cp, - struct sockaddr_rxrpc *srx, - gfp_t gfp) -{ - struct rxrpc_connection *conn, *candidate = NULL; - struct rxrpc_local *local = cp->local; - struct rb_node *p, **pp, *parent; - long diff; - int chan; - - DECLARE_WAITQUEUE(myself, current); - - _enter("{%d,%lx},", call->debug_id, call->user_call_ID); - - cp->peer = rxrpc_lookup_peer(cp->local, srx, gfp); - if (!cp->peer) - return -ENOMEM; - - if (!cp->exclusive) { - /* Search for a existing client connection unless this is going - * to be a connection that's used exclusively for a single call. - */ - _debug("search 1"); - spin_lock(&local->client_conns_lock); - p = local->client_conns.rb_node; - while (p) { - conn = rb_entry(p, struct rxrpc_connection, client_node); - -#define cmp(X) ((long)conn->params.X - (long)cp->X) - diff = (cmp(peer) ?: - cmp(key) ?: - cmp(security_level)); - if (diff < 0) - p = p->rb_left; - else if (diff > 0) - p = p->rb_right; - else - goto found_extant_conn; - } - spin_unlock(&local->client_conns_lock); - } - - /* We didn't find a connection or we want an exclusive one. */ - _debug("get new conn"); - candidate = rxrpc_alloc_client_connection(cp, gfp); - if (!candidate) { - _leave(" = -ENOMEM"); - return -ENOMEM; - } - - if (cp->exclusive) { - /* Assign the call on an exclusive connection to channel 0 and - * don't add the connection to the endpoint's shareable conn - * lookup tree. - */ - _debug("exclusive chan 0"); - conn = candidate; - atomic_set(&conn->avail_chans, RXRPC_MAXCALLS - 1); - spin_lock(&conn->channel_lock); - chan = 0; - goto found_channel; - } - - /* We need to redo the search before attempting to add a new connection - * lest we race with someone else adding a conflicting instance. - */ - _debug("search 2"); - spin_lock(&local->client_conns_lock); - - pp = &local->client_conns.rb_node; - parent = NULL; - while (*pp) { - parent = *pp; - conn = rb_entry(parent, struct rxrpc_connection, client_node); - - diff = (cmp(peer) ?: - cmp(key) ?: - cmp(security_level)); - if (diff < 0) - pp = &(*pp)->rb_left; - else if (diff > 0) - pp = &(*pp)->rb_right; - else - goto found_extant_conn; - } - - /* The second search also failed; simply add the new connection with - * the new call in channel 0. Note that we need to take the channel - * lock before dropping the client conn lock. - */ - _debug("new conn"); - conn = candidate; - candidate = NULL; - - rb_link_node(&conn->client_node, parent, pp); - rb_insert_color(&conn->client_node, &local->client_conns); - - atomic_set(&conn->avail_chans, RXRPC_MAXCALLS - 1); - spin_lock(&conn->channel_lock); - spin_unlock(&local->client_conns_lock); - chan = 0; - -found_channel: - _debug("found chan"); - call->conn = conn; - call->channel = chan; - call->epoch = conn->proto.epoch; - call->cid = conn->proto.cid | chan; - call->call_id = ++conn->channels[chan].call_counter; - conn->channels[chan].call_id = call->call_id; - rcu_assign_pointer(conn->channels[chan].call, call); - - _net("CONNECT call %d on conn %d", call->debug_id, conn->debug_id); - - spin_unlock(&conn->channel_lock); - rxrpc_put_peer(cp->peer); - cp->peer = NULL; - _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage)); - return 0; - - /* We found a suitable connection already in existence. Discard any - * candidate we may have allocated, and try to get a channel on this - * one. - */ -found_extant_conn: - _debug("found conn"); - rxrpc_get_connection(conn); - spin_unlock(&local->client_conns_lock); - - rxrpc_put_connection(candidate); - - if (!atomic_add_unless(&conn->avail_chans, -1, 0)) { - if (!gfpflags_allow_blocking(gfp)) { - rxrpc_put_connection(conn); - _leave(" = -EAGAIN"); - return -EAGAIN; - } - - add_wait_queue(&conn->channel_wq, &myself); - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (atomic_add_unless(&conn->avail_chans, -1, 0)) - break; - if (signal_pending(current)) - goto interrupted; - schedule(); - } - remove_wait_queue(&conn->channel_wq, &myself); - __set_current_state(TASK_RUNNING); - } - - /* The connection allegedly now has a free channel and we can now - * attach the call to it. - */ - spin_lock(&conn->channel_lock); - - for (chan = 0; chan < RXRPC_MAXCALLS; chan++) - if (!conn->channels[chan].call) - goto found_channel; - BUG(); - -interrupted: - remove_wait_queue(&conn->channel_wq, &myself); - __set_current_state(TASK_RUNNING); - rxrpc_put_connection(conn); - rxrpc_put_peer(cp->peer); - cp->peer = NULL; - _leave(" = -ERESTARTSYS"); - return -ERESTARTSYS; -} - /* * get a record of an incoming connection */ -- cgit v1.2.3 From 7877a4a4bdf0d782276f1cba868878aee77718ee Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 4 Apr 2016 14:00:40 +0100 Subject: rxrpc: Split service connection code out into its own file Split the service-specific connection code out into into its own file. The client-specific code has already been split out. This will leave just the common code in the original file. Signed-off-by: David Howells --- net/rxrpc/Makefile | 1 + net/rxrpc/ar-internal.h | 13 ++++- net/rxrpc/conn_object.c | 132 ------------------------------------------ net/rxrpc/conn_service.c | 145 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 156 insertions(+), 135 deletions(-) create mode 100644 net/rxrpc/conn_service.c (limited to 'net') diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index 6522e50fb750..10f3f48a16a8 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -10,6 +10,7 @@ af-rxrpc-y := \ conn_client.o \ conn_event.o \ conn_object.o \ + conn_service.o \ input.o \ insecure.o \ key.o \ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 021d28b54282..e1af258a7ac9 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -563,6 +563,9 @@ extern unsigned int rxrpc_connection_expiry; extern struct list_head rxrpc_connections; extern rwlock_t rxrpc_connection_lock; +void rxrpc_conn_hash_proto_key(struct rxrpc_conn_proto *); +void rxrpc_extract_conn_params(struct rxrpc_conn_proto *, + struct rxrpc_local *, struct sk_buff *); struct rxrpc_connection *rxrpc_alloc_connection(gfp_t); struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_local *, struct rxrpc_peer *, @@ -571,9 +574,6 @@ void __rxrpc_disconnect_call(struct rxrpc_call *); void rxrpc_disconnect_call(struct rxrpc_call *); void rxrpc_put_connection(struct rxrpc_connection *); void __exit rxrpc_destroy_all_connections(void); -struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *, - struct rxrpc_peer *, - struct sk_buff *); static inline bool rxrpc_conn_is_client(const struct rxrpc_connection *conn) { @@ -603,6 +603,13 @@ static inline void rxrpc_queue_conn(struct rxrpc_connection *conn) rxrpc_put_connection(conn); } +/* + * conn_service.c + */ +struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *, + struct rxrpc_peer *, + struct sk_buff *); + /* * input.c */ diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index ab5c8c2960e4..8379e3748d13 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -60,138 +60,6 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) return conn; } -/* - * get a record of an incoming connection - */ -struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *local, - struct rxrpc_peer *peer, - struct sk_buff *skb) -{ - struct rxrpc_connection *conn, *candidate = NULL; - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rb_node *p, **pp; - const char *new = "old"; - u32 epoch, cid; - - _enter(""); - - ASSERT(sp->hdr.flags & RXRPC_CLIENT_INITIATED); - - epoch = sp->hdr.epoch; - cid = sp->hdr.cid & RXRPC_CIDMASK; - - /* search the connection list first */ - read_lock_bh(&peer->conn_lock); - - p = peer->service_conns.rb_node; - while (p) { - conn = rb_entry(p, struct rxrpc_connection, service_node); - - _debug("maybe %x", conn->proto.cid); - - if (epoch < conn->proto.epoch) - p = p->rb_left; - else if (epoch > conn->proto.epoch) - p = p->rb_right; - else if (cid < conn->proto.cid) - p = p->rb_left; - else if (cid > conn->proto.cid) - p = p->rb_right; - else - goto found_extant_connection; - } - read_unlock_bh(&peer->conn_lock); - - /* not yet present - create a candidate for a new record and then - * redo the search */ - candidate = rxrpc_alloc_connection(GFP_NOIO); - if (!candidate) { - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); - } - - candidate->proto.local = local; - candidate->proto.epoch = sp->hdr.epoch; - candidate->proto.cid = sp->hdr.cid & RXRPC_CIDMASK; - candidate->proto.in_clientflag = RXRPC_CLIENT_INITIATED; - candidate->params.local = local; - candidate->params.peer = peer; - candidate->params.service_id = sp->hdr.serviceId; - candidate->security_ix = sp->hdr.securityIndex; - candidate->out_clientflag = 0; - candidate->state = RXRPC_CONN_SERVICE; - if (candidate->params.service_id) - candidate->state = RXRPC_CONN_SERVICE_UNSECURED; - - write_lock_bh(&peer->conn_lock); - - pp = &peer->service_conns.rb_node; - p = NULL; - while (*pp) { - p = *pp; - conn = rb_entry(p, struct rxrpc_connection, service_node); - - if (epoch < conn->proto.epoch) - pp = &(*pp)->rb_left; - else if (epoch > conn->proto.epoch) - pp = &(*pp)->rb_right; - else if (cid < conn->proto.cid) - pp = &(*pp)->rb_left; - else if (cid > conn->proto.cid) - pp = &(*pp)->rb_right; - else - goto found_extant_second; - } - - /* we can now add the new candidate to the list */ - conn = candidate; - candidate = NULL; - rb_link_node(&conn->service_node, p, pp); - rb_insert_color(&conn->service_node, &peer->service_conns); - rxrpc_get_peer(peer); - rxrpc_get_local(local); - - write_unlock_bh(&peer->conn_lock); - - write_lock(&rxrpc_connection_lock); - list_add_tail(&conn->link, &rxrpc_connections); - write_unlock(&rxrpc_connection_lock); - - new = "new"; - -success: - _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->proto.cid); - - _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage)); - return conn; - - /* we found the connection in the list immediately */ -found_extant_connection: - if (sp->hdr.securityIndex != conn->security_ix) { - read_unlock_bh(&peer->conn_lock); - goto security_mismatch; - } - rxrpc_get_connection(conn); - read_unlock_bh(&peer->conn_lock); - goto success; - - /* we found the connection on the second time through the list */ -found_extant_second: - if (sp->hdr.securityIndex != conn->security_ix) { - write_unlock_bh(&peer->conn_lock); - goto security_mismatch; - } - rxrpc_get_connection(conn); - write_unlock_bh(&peer->conn_lock); - kfree(candidate); - goto success; - -security_mismatch: - kfree(candidate); - _leave(" = -EKEYREJECTED"); - return ERR_PTR(-EKEYREJECTED); -} - /* * find a connection based on transport and RxRPC connection ID for an incoming * packet diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c new file mode 100644 index 000000000000..cdcac50cd1a8 --- /dev/null +++ b/net/rxrpc/conn_service.c @@ -0,0 +1,145 @@ +/* Service connection management + * + * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include "ar-internal.h" + +/* + * get a record of an incoming connection + */ +struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *local, + struct rxrpc_peer *peer, + struct sk_buff *skb) +{ + struct rxrpc_connection *conn, *candidate = NULL; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rb_node *p, **pp; + const char *new = "old"; + u32 epoch, cid; + + _enter(""); + + ASSERT(sp->hdr.flags & RXRPC_CLIENT_INITIATED); + + epoch = sp->hdr.epoch; + cid = sp->hdr.cid & RXRPC_CIDMASK; + + /* search the connection list first */ + read_lock_bh(&peer->conn_lock); + + p = peer->service_conns.rb_node; + while (p) { + conn = rb_entry(p, struct rxrpc_connection, service_node); + + _debug("maybe %x", conn->proto.cid); + + if (epoch < conn->proto.epoch) + p = p->rb_left; + else if (epoch > conn->proto.epoch) + p = p->rb_right; + else if (cid < conn->proto.cid) + p = p->rb_left; + else if (cid > conn->proto.cid) + p = p->rb_right; + else + goto found_extant_connection; + } + read_unlock_bh(&peer->conn_lock); + + /* not yet present - create a candidate for a new record and then + * redo the search */ + candidate = rxrpc_alloc_connection(GFP_NOIO); + if (!candidate) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + candidate->proto.local = local; + candidate->proto.epoch = sp->hdr.epoch; + candidate->proto.cid = sp->hdr.cid & RXRPC_CIDMASK; + candidate->proto.in_clientflag = RXRPC_CLIENT_INITIATED; + candidate->params.local = local; + candidate->params.peer = peer; + candidate->params.service_id = sp->hdr.serviceId; + candidate->security_ix = sp->hdr.securityIndex; + candidate->out_clientflag = 0; + candidate->state = RXRPC_CONN_SERVICE; + if (candidate->params.service_id) + candidate->state = RXRPC_CONN_SERVICE_UNSECURED; + + write_lock_bh(&peer->conn_lock); + + pp = &peer->service_conns.rb_node; + p = NULL; + while (*pp) { + p = *pp; + conn = rb_entry(p, struct rxrpc_connection, service_node); + + if (epoch < conn->proto.epoch) + pp = &(*pp)->rb_left; + else if (epoch > conn->proto.epoch) + pp = &(*pp)->rb_right; + else if (cid < conn->proto.cid) + pp = &(*pp)->rb_left; + else if (cid > conn->proto.cid) + pp = &(*pp)->rb_right; + else + goto found_extant_second; + } + + /* we can now add the new candidate to the list */ + conn = candidate; + candidate = NULL; + rb_link_node(&conn->service_node, p, pp); + rb_insert_color(&conn->service_node, &peer->service_conns); + rxrpc_get_peer(peer); + rxrpc_get_local(local); + + write_unlock_bh(&peer->conn_lock); + + write_lock(&rxrpc_connection_lock); + list_add_tail(&conn->link, &rxrpc_connections); + write_unlock(&rxrpc_connection_lock); + + new = "new"; + +success: + _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->proto.cid); + + _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage)); + return conn; + + /* we found the connection in the list immediately */ +found_extant_connection: + if (sp->hdr.securityIndex != conn->security_ix) { + read_unlock_bh(&peer->conn_lock); + goto security_mismatch; + } + rxrpc_get_connection(conn); + read_unlock_bh(&peer->conn_lock); + goto success; + + /* we found the connection on the second time through the list */ +found_extant_second: + if (sp->hdr.securityIndex != conn->security_ix) { + write_unlock_bh(&peer->conn_lock); + goto security_mismatch; + } + rxrpc_get_connection(conn); + write_unlock_bh(&peer->conn_lock); + kfree(candidate); + goto success; + +security_mismatch: + kfree(candidate); + _leave(" = -EKEYREJECTED"); + return ERR_PTR(-EKEYREJECTED); +} -- cgit v1.2.3 From d991b4a32f65076efaf78739c4a46406ca8c7e79 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 29 Jun 2016 14:40:39 +0100 Subject: rxrpc: Move peer lookup from call-accept to new-incoming-conn Move the lookup of a peer from a call that's being accepted into the function that creates a new incoming connection. This will allow us to avoid incrementing the peer's usage count in some cases in future. Note that I haven't bother to integrate rxrpc_get_addr_from_skb() with rxrpc_extract_addr_from_skb() as I'm going to delete the former in the very near future. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 3 ++- net/rxrpc/call_accept.c | 31 +++++++------------------------ net/rxrpc/conn_service.c | 11 ++++++++++- net/rxrpc/utils.c | 32 ++++++++++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e1af258a7ac9..ad48f851b40c 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -607,7 +607,7 @@ static inline void rxrpc_queue_conn(struct rxrpc_connection *conn) * conn_service.c */ struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *, - struct rxrpc_peer *, + struct sockaddr_rxrpc *, struct sk_buff *); /* @@ -773,6 +773,7 @@ static inline void rxrpc_sysctl_exit(void) {} */ void rxrpc_get_addr_from_skb(struct rxrpc_local *, const struct sk_buff *, struct sockaddr_rxrpc *); +int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *); /* * debug tracing diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 5367dbe9b96f..0b2832141bd0 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -75,7 +75,6 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local, { struct rxrpc_connection *conn; struct rxrpc_skb_priv *sp, *nsp; - struct rxrpc_peer *peer; struct rxrpc_call *call; struct sk_buff *notification; int ret; @@ -94,15 +93,7 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local, rxrpc_new_skb(notification); notification->mark = RXRPC_SKB_MARK_NEW_CALL; - peer = rxrpc_lookup_peer(local, srx, GFP_NOIO); - if (!peer) { - _debug("no peer"); - ret = -EBUSY; - goto error; - } - - conn = rxrpc_incoming_connection(local, peer, skb); - rxrpc_put_peer(peer); + conn = rxrpc_incoming_connection(local, srx, skb); if (IS_ERR(conn)) { _debug("no conn"); ret = PTR_ERR(conn); @@ -226,20 +217,8 @@ void rxrpc_accept_incoming_calls(struct rxrpc_local *local) whdr._rsvd = 0; whdr.serviceId = htons(sp->hdr.serviceId); - /* determine the remote address */ - memset(&srx, 0, sizeof(srx)); - srx.srx_family = AF_RXRPC; - srx.transport.family = local->srx.transport.family; - srx.transport_type = local->srx.transport_type; - switch (srx.transport.family) { - case AF_INET: - srx.transport_len = sizeof(struct sockaddr_in); - srx.transport.sin.sin_port = udp_hdr(skb)->source; - srx.transport.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; - break; - default: - goto busy; - } + if (rxrpc_extract_addr_from_skb(&srx, skb) < 0) + goto drop; /* get the socket providing the service */ read_lock_bh(&local->services_lock); @@ -285,6 +264,10 @@ busy: rxrpc_free_skb(skb); return; +drop: + rxrpc_free_skb(skb); + return; + invalid_service: skb->priority = RX_INVALID_OPERATION; rxrpc_reject_packet(local, skb); diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index cdcac50cd1a8..a42b210c40a5 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -16,17 +16,24 @@ * get a record of an incoming connection */ struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *local, - struct rxrpc_peer *peer, + struct sockaddr_rxrpc *srx, struct sk_buff *skb) { struct rxrpc_connection *conn, *candidate = NULL; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_peer *peer; struct rb_node *p, **pp; const char *new = "old"; u32 epoch, cid; _enter(""); + peer = rxrpc_lookup_peer(local, srx, GFP_NOIO); + if (!peer) { + _debug("no peer"); + return ERR_PTR(-EBUSY); + } + ASSERT(sp->hdr.flags & RXRPC_CLIENT_INITIATED); epoch = sp->hdr.epoch; @@ -58,6 +65,7 @@ struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *local, * redo the search */ candidate = rxrpc_alloc_connection(GFP_NOIO); if (!candidate) { + rxrpc_put_peer(peer); _leave(" = -ENOMEM"); return ERR_PTR(-ENOMEM); } @@ -114,6 +122,7 @@ struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *local, success: _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->proto.cid); + rxrpc_put_peer(peer); _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage)); return conn; diff --git a/net/rxrpc/utils.c b/net/rxrpc/utils.c index f28122a15a24..d3db02ecc37f 100644 --- a/net/rxrpc/utils.c +++ b/net/rxrpc/utils.c @@ -10,6 +10,7 @@ */ #include +#include #include #include "ar-internal.h" @@ -39,3 +40,34 @@ void rxrpc_get_addr_from_skb(struct rxrpc_local *local, BUG(); } } + +/* + * Fill out a peer address from a socket buffer containing a packet. + */ +int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *srx, struct sk_buff *skb) +{ + memset(srx, 0, sizeof(*srx)); + + switch (ntohs(skb->protocol)) { + case ETH_P_IP: + srx->transport_type = SOCK_DGRAM; + srx->transport_len = sizeof(srx->transport.sin); + srx->transport.sin.sin_family = AF_INET; + srx->transport.sin.sin_port = udp_hdr(skb)->source; + srx->transport.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; + return 0; + + case ETH_P_IPV6: + srx->transport_type = SOCK_DGRAM; + srx->transport_len = sizeof(srx->transport.sin6); + srx->transport.sin6.sin6_family = AF_INET6; + srx->transport.sin6.sin6_port = udp_hdr(skb)->source; + srx->transport.sin6.sin6_addr = ipv6_hdr(skb)->saddr; + return 0; + + default: + pr_warn_ratelimited("AF_RXRPC: Unknown eth protocol %u\n", + ntohs(skb->protocol)); + return -EAFNOSUPPORT; + } +} -- cgit v1.2.3 From 001c11224910b25e59a65ce1b49cfecdb4c631c0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 30 Jun 2016 10:45:22 +0100 Subject: rxrpc: Maintain an extra ref on a conn for the cache list Overhaul the usage count accounting for the rxrpc_connection struct to make it easier to implement RCU access from the data_ready handler. The problem is that currently we're using a lock to prevent the garbage collector from trying to clean up a connection that we're contemplating unidling. We could just stick incoming packets on the connection we find, but we've then got a problem that we may race when dispatching a work item to process it as we need to give that a ref to prevent the rxrpc_connection struct from disappearing in the meantime. Further, incoming packets may get discarded if attached to an rxrpc_connection struct that is going away. Whilst this is not a total disaster - the client will presumably resend - it would delay processing of the call. This would affect the AFS client filesystem's service manager operation. To this end: (1) We now maintain an extra count on the connection usage count whilst it is on the connection list. This mean it is not in use when its refcount is 1. (2) When trying to reuse an old connection, we only increment the refcount if it is greater than 0. If it is 0, we replace it in the tree with a new candidate connection. (3) Two connection flags are added to indicate whether or not a connection is in the local's client connection tree (used by sendmsg) or the peer's service connection tree (used by data_ready). This makes sure that we don't try and remove a connection if it got replaced. The flags are tested under lock with the removal operation to prevent the reaper from killing the rxrpc_connection struct whilst someone else is trying to effect a replacement. This could probably be alleviated by using memory barriers between the flag set/test and the rb_tree ops. The rb_tree op would still need to be under the lock, however. (4) When trying to reap an old connection, we try to flip the usage count from 1 to 0. If it's not 1 at that point, then it must've come back to life temporarily and we ignore it. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 5 +++- net/rxrpc/conn_client.c | 42 +++++++++++++++++++++------ net/rxrpc/conn_object.c | 74 +++++++++++++++++++----------------------------- net/rxrpc/conn_service.c | 34 +++++++++++++++++++--- 4 files changed, 97 insertions(+), 58 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index ad48f851b40c..d8e4d6e6a030 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -258,6 +258,8 @@ struct rxrpc_conn_parameters { */ enum rxrpc_conn_flag { RXRPC_CONN_HAS_IDR, /* Has a client conn ID assigned */ + RXRPC_CONN_IN_SERVICE_CONNS, /* Conn is in peer->service_conns */ + RXRPC_CONN_IN_CLIENT_CONNS, /* Conn is in local->client_conns */ }; /* @@ -544,10 +546,10 @@ void __exit rxrpc_destroy_all_calls(void); */ extern struct idr rxrpc_client_conn_ids; -void rxrpc_put_client_connection_id(struct rxrpc_connection *); void rxrpc_destroy_client_conn_ids(void); int rxrpc_connect_call(struct rxrpc_call *, struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *, gfp_t); +void rxrpc_unpublish_client_conn(struct rxrpc_connection *); /* * conn_event.c @@ -609,6 +611,7 @@ static inline void rxrpc_queue_conn(struct rxrpc_connection *conn) struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *, struct sockaddr_rxrpc *, struct sk_buff *); +void rxrpc_unpublish_service_conn(struct rxrpc_connection *); /* * input.c diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 9180164a51aa..aa21462f3236 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -84,7 +84,7 @@ error: /* * Release a connection ID for a client connection from the global pool. */ -void rxrpc_put_client_connection_id(struct rxrpc_connection *conn) +static void rxrpc_put_client_connection_id(struct rxrpc_connection *conn) { if (test_bit(RXRPC_CONN_HAS_IDR, &conn->flags)) { spin_lock(&rxrpc_conn_id_lock); @@ -278,12 +278,13 @@ int rxrpc_connect_call(struct rxrpc_call *call, * lock before dropping the client conn lock. */ _debug("new conn"); + set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags); + rb_link_node(&candidate->client_node, parent, pp); + rb_insert_color(&candidate->client_node, &local->client_conns); +attached: conn = candidate; candidate = NULL; - rb_link_node(&conn->client_node, parent, pp); - rb_insert_color(&conn->client_node, &local->client_conns); - atomic_set(&conn->avail_chans, RXRPC_MAXCALLS - 1); spin_lock(&conn->channel_lock); spin_unlock(&local->client_conns_lock); @@ -307,13 +308,22 @@ found_channel: _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage)); return 0; - /* We found a suitable connection already in existence. Discard any - * candidate we may have allocated, and try to get a channel on this - * one. + /* We found a potentially suitable connection already in existence. If + * we can reuse it (ie. its usage count hasn't been reduced to 0 by the + * reaper), discard any candidate we may have allocated, and try to get + * a channel on this one, otherwise we have to replace it. */ found_extant_conn: _debug("found conn"); - rxrpc_get_connection(conn); + if (!rxrpc_get_connection_maybe(conn)) { + set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags); + rb_replace_node(&conn->client_node, + &candidate->client_node, + &local->client_conns); + clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags); + goto attached; + } + spin_unlock(&local->client_conns_lock); rxrpc_put_connection(candidate); @@ -357,3 +367,19 @@ interrupted: _leave(" = -ERESTARTSYS"); return -ERESTARTSYS; } + +/* + * Remove a client connection from the local endpoint's tree, thereby removing + * it as a target for reuse for new client calls. + */ +void rxrpc_unpublish_client_conn(struct rxrpc_connection *conn) +{ + struct rxrpc_local *local = conn->params.local; + + spin_lock(&local->client_conns_lock); + if (test_and_clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags)) + rb_erase(&conn->client_node, &local->client_conns); + spin_unlock(&local->client_conns_lock); + + rxrpc_put_client_connection_id(conn); +} diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 8379e3748d13..89bc6480b4e2 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -49,7 +49,10 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) skb_queue_head_init(&conn->rx_queue); conn->security = &rxrpc_no_security; spin_lock_init(&conn->state_lock); - atomic_set(&conn->usage, 1); + /* We maintain an extra ref on the connection whilst it is + * on the rxrpc_connections list. + */ + atomic_set(&conn->usage, 2); conn->debug_id = atomic_inc_return(&rxrpc_debug_id); atomic_set(&conn->avail_chans, RXRPC_MAXCALLS); conn->size_align = 4; @@ -111,7 +114,7 @@ struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_local *local, return NULL; found: - rxrpc_get_connection(conn); + conn = rxrpc_get_connection_maybe(conn); read_unlock_bh(&peer->conn_lock); _leave(" = %p", conn); return conn; @@ -173,10 +176,10 @@ void rxrpc_put_connection(struct rxrpc_connection *conn) _enter("%p{u=%d,d=%d}", conn, atomic_read(&conn->usage), conn->debug_id); - ASSERTCMP(atomic_read(&conn->usage), >, 0); + ASSERTCMP(atomic_read(&conn->usage), >, 1); conn->put_time = ktime_get_seconds(); - if (atomic_dec_and_test(&conn->usage)) { + if (atomic_dec_return(&conn->usage) == 1) { _debug("zombie"); rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); } @@ -216,59 +219,41 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu) static void rxrpc_connection_reaper(struct work_struct *work) { struct rxrpc_connection *conn, *_p; - struct rxrpc_peer *peer; - unsigned long now, earliest, reap_time; + unsigned long reap_older_than, earliest, put_time, now; LIST_HEAD(graveyard); _enter(""); now = ktime_get_seconds(); + reap_older_than = now - rxrpc_connection_expiry; earliest = ULONG_MAX; write_lock(&rxrpc_connection_lock); list_for_each_entry_safe(conn, _p, &rxrpc_connections, link) { - _debug("reap CONN %d { u=%d,t=%ld }", - conn->debug_id, atomic_read(&conn->usage), - (long) now - (long) conn->put_time); - - if (likely(atomic_read(&conn->usage) > 0)) + ASSERTCMP(atomic_read(&conn->usage), >, 0); + if (likely(atomic_read(&conn->usage) > 1)) continue; - if (rxrpc_conn_is_client(conn)) { - struct rxrpc_local *local = conn->params.local; - spin_lock(&local->client_conns_lock); - reap_time = conn->put_time + rxrpc_connection_expiry; - - if (atomic_read(&conn->usage) > 0) { - ; - } else if (reap_time <= now) { - list_move_tail(&conn->link, &graveyard); - rxrpc_put_client_connection_id(conn); - rb_erase(&conn->client_node, - &local->client_conns); - } else if (reap_time < earliest) { - earliest = reap_time; - } - - spin_unlock(&local->client_conns_lock); - } else { - peer = conn->params.peer; - write_lock_bh(&peer->conn_lock); - reap_time = conn->put_time + rxrpc_connection_expiry; - - if (atomic_read(&conn->usage) > 0) { - ; - } else if (reap_time <= now) { - list_move_tail(&conn->link, &graveyard); - rb_erase(&conn->service_node, - &peer->service_conns); - } else if (reap_time < earliest) { - earliest = reap_time; - } - - write_unlock_bh(&peer->conn_lock); + put_time = READ_ONCE(conn->put_time); + if (time_after(put_time, reap_older_than)) { + if (time_before(put_time, earliest)) + earliest = put_time; + continue; } + + /* The usage count sits at 1 whilst the object is unused on the + * list; we reduce that to 0 to make the object unavailable. + */ + if (atomic_cmpxchg(&conn->usage, 1, 0) != 1) + continue; + + if (rxrpc_conn_is_client(conn)) + rxrpc_unpublish_client_conn(conn); + else + rxrpc_unpublish_service_conn(conn); + + list_move_tail(&conn->link, &graveyard); } write_unlock(&rxrpc_connection_lock); @@ -279,7 +264,6 @@ static void rxrpc_connection_reaper(struct work_struct *work) (earliest - now) * HZ); } - /* then destroy all those pulled out */ while (!list_empty(&graveyard)) { conn = list_entry(graveyard.next, struct rxrpc_connection, link); diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index a42b210c40a5..77a509e6003a 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -104,10 +104,12 @@ struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *local, } /* we can now add the new candidate to the list */ + set_bit(RXRPC_CONN_IN_SERVICE_CONNS, &candidate->flags); + rb_link_node(&candidate->service_node, p, pp); + rb_insert_color(&candidate->service_node, &peer->service_conns); +attached: conn = candidate; candidate = NULL; - rb_link_node(&conn->service_node, p, pp); - rb_insert_color(&conn->service_node, &peer->service_conns); rxrpc_get_peer(peer); rxrpc_get_local(local); @@ -128,11 +130,19 @@ success: /* we found the connection in the list immediately */ found_extant_connection: + if (!rxrpc_get_connection_maybe(conn)) { + set_bit(RXRPC_CONN_IN_SERVICE_CONNS, &candidate->flags); + rb_replace_node(&conn->service_node, + &candidate->service_node, + &peer->service_conns); + clear_bit(RXRPC_CONN_IN_SERVICE_CONNS, &conn->flags); + goto attached; + } + if (sp->hdr.securityIndex != conn->security_ix) { read_unlock_bh(&peer->conn_lock); - goto security_mismatch; + goto security_mismatch_put; } - rxrpc_get_connection(conn); read_unlock_bh(&peer->conn_lock); goto success; @@ -147,8 +157,24 @@ found_extant_second: kfree(candidate); goto success; +security_mismatch_put: + rxrpc_put_connection(conn); security_mismatch: kfree(candidate); _leave(" = -EKEYREJECTED"); return ERR_PTR(-EKEYREJECTED); } + +/* + * Remove the service connection from the peer's tree, thereby removing it as a + * target for incoming packets. + */ +void rxrpc_unpublish_service_conn(struct rxrpc_connection *conn) +{ + struct rxrpc_peer *peer = conn->params.peer; + + write_lock_bh(&peer->conn_lock); + if (test_and_clear_bit(RXRPC_CONN_IN_SERVICE_CONNS, &conn->flags)) + rb_erase(&conn->service_node, &peer->service_conns); + write_unlock_bh(&peer->conn_lock); +} -- cgit v1.2.3 From e8d70ce177eeb4fbd1c218c60118d2c19c2496a6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 30 Jun 2016 12:16:21 +0100 Subject: rxrpc: Prune the contents of the rxrpc_conn_proto struct Prune the contents of the rxrpc_conn_proto struct. Most of the fields aren't used anymore. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 20 +++++++------------- net/rxrpc/call_object.c | 2 +- net/rxrpc/conn_client.c | 11 ----------- net/rxrpc/conn_service.c | 2 -- 4 files changed, 8 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index d8e4d6e6a030..6fdee761dd0b 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -229,18 +229,12 @@ struct rxrpc_peer { * Keys for matching a connection. */ struct rxrpc_conn_proto { - unsigned long hash_key; - struct rxrpc_local *local; /* Representation of local endpoint */ - u32 epoch; /* epoch of this connection */ - u32 cid; /* connection ID */ - u8 in_clientflag; /* RXRPC_CLIENT_INITIATED if we are server */ - u8 addr_size; /* Size of the address */ - sa_family_t family; /* Transport protocol */ - __be16 port; /* Peer UDP/UDP6 port */ - union { /* Peer address */ - struct in_addr ipv4_addr; - struct in6_addr ipv6_addr; - u32 raw_addr[0]; + union { + struct { + u32 epoch; /* epoch of this connection */ + u32 cid; /* connection ID */ + }; + u64 index_key; }; }; @@ -584,7 +578,7 @@ static inline bool rxrpc_conn_is_client(const struct rxrpc_connection *conn) static inline bool rxrpc_conn_is_service(const struct rxrpc_connection *conn) { - return conn->proto.in_clientflag; + return !rxrpc_conn_is_client(conn); } static inline void rxrpc_get_connection(struct rxrpc_connection *conn) diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 3f278721269e..ebbd7dd5292f 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -566,7 +566,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, } call->epoch = conn->proto.epoch; call->service_id = conn->params.service_id; - call->in_clientflag = conn->proto.in_clientflag; + call->in_clientflag = RXRPC_CLIENT_INITIATED; /* Add the new call to the hashtable */ rxrpc_call_hash_add(call); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index aa21462f3236..917db48d7f59 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -132,22 +132,11 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) } conn->params = *cp; - conn->proto.local = cp->local; conn->proto.epoch = rxrpc_epoch; conn->proto.cid = 0; - conn->proto.in_clientflag = 0; - conn->proto.family = cp->peer->srx.transport.family; conn->out_clientflag = RXRPC_CLIENT_INITIATED; conn->state = RXRPC_CONN_CLIENT; - switch (conn->proto.family) { - case AF_INET: - conn->proto.addr_size = sizeof(conn->proto.ipv4_addr); - conn->proto.ipv4_addr = cp->peer->srx.transport.sin.sin_addr; - conn->proto.port = cp->peer->srx.transport.sin.sin_port; - break; - } - ret = rxrpc_get_client_connection_id(conn, gfp); if (ret < 0) goto error_0; diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index 77a509e6003a..c6db2e8400a2 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -70,10 +70,8 @@ struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *local, return ERR_PTR(-ENOMEM); } - candidate->proto.local = local; candidate->proto.epoch = sp->hdr.epoch; candidate->proto.cid = sp->hdr.cid & RXRPC_CIDMASK; - candidate->proto.in_clientflag = RXRPC_CLIENT_INITIATED; candidate->params.local = local; candidate->params.peer = peer; candidate->params.service_id = sp->hdr.serviceId; -- cgit v1.2.3 From 1291e9d1084506c5cba6313ce809d7516bb5868a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 30 Jun 2016 12:02:53 +0100 Subject: rxrpc: Move data_ready peer lookup into rxrpc_find_connection() Move the peer lookup done in input.c by data_ready into rxrpc_find_connection(). Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 3 -- net/rxrpc/conn_object.c | 73 +++++++++++++++++++++++++++++++++++++------------ net/rxrpc/input.c | 30 ++------------------ net/rxrpc/utils.c | 27 ------------------ 4 files changed, 59 insertions(+), 74 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 6fdee761dd0b..0fe63baf1286 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -564,7 +564,6 @@ void rxrpc_extract_conn_params(struct rxrpc_conn_proto *, struct rxrpc_local *, struct sk_buff *); struct rxrpc_connection *rxrpc_alloc_connection(gfp_t); struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_local *, - struct rxrpc_peer *, struct sk_buff *); void __rxrpc_disconnect_call(struct rxrpc_call *); void rxrpc_disconnect_call(struct rxrpc_call *); @@ -768,8 +767,6 @@ static inline void rxrpc_sysctl_exit(void) {} /* * utils.c */ -void rxrpc_get_addr_from_skb(struct rxrpc_local *, const struct sk_buff *, - struct sockaddr_rxrpc *); int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *); /* diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 89bc6480b4e2..130713869a16 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -68,52 +68,91 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) * packet */ struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_local *local, - struct rxrpc_peer *peer, struct sk_buff *skb) { struct rxrpc_connection *conn; + struct rxrpc_conn_proto k; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct sockaddr_rxrpc srx; + struct rxrpc_peer *peer; struct rb_node *p; - u32 epoch, cid; _enter(",{%x,%x}", sp->hdr.cid, sp->hdr.flags); - read_lock_bh(&peer->conn_lock); + if (rxrpc_extract_addr_from_skb(&srx, skb) < 0) + goto not_found; - cid = sp->hdr.cid & RXRPC_CIDMASK; - epoch = sp->hdr.epoch; + /* We may have to handle mixing IPv4 and IPv6 */ + if (srx.transport.family != local->srx.transport.family) { + pr_warn_ratelimited("AF_RXRPC: Protocol mismatch %u not %u\n", + srx.transport.family, + local->srx.transport.family); + goto not_found; + } + + k.epoch = sp->hdr.epoch; + k.cid = sp->hdr.cid & RXRPC_CIDMASK; if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) { + /* We need to look up service connections by the full protocol + * parameter set. We look up the peer first as an intermediate + * step and then the connection from the peer's tree. + */ + peer = rxrpc_lookup_peer_rcu(local, &srx); + if (!peer) + goto not_found; + + read_lock_bh(&peer->conn_lock); + p = peer->service_conns.rb_node; while (p) { conn = rb_entry(p, struct rxrpc_connection, service_node); _debug("maybe %x", conn->proto.cid); - if (epoch < conn->proto.epoch) + if (k.epoch < conn->proto.epoch) p = p->rb_left; - else if (epoch > conn->proto.epoch) + else if (k.epoch > conn->proto.epoch) p = p->rb_right; - else if (cid < conn->proto.cid) + else if (k.cid < conn->proto.cid) p = p->rb_left; - else if (cid > conn->proto.cid) + else if (k.cid > conn->proto.cid) p = p->rb_right; else - goto found; + goto found_service_conn; } + read_unlock_bh(&peer->conn_lock); } else { - conn = idr_find(&rxrpc_client_conn_ids, cid >> RXRPC_CIDSHIFT); - if (conn && - conn->proto.epoch == epoch && - conn->params.peer == peer) - goto found; + conn = idr_find(&rxrpc_client_conn_ids, + k.cid >> RXRPC_CIDSHIFT); + if (!conn || + conn->proto.epoch != k.epoch || + conn->params.local != local) + goto not_found; + + peer = conn->params.peer; + switch (srx.transport.family) { + case AF_INET: + if (peer->srx.transport.sin.sin_port != + srx.transport.sin.sin_port || + peer->srx.transport.sin.sin_addr.s_addr != + srx.transport.sin.sin_addr.s_addr) + goto not_found; + break; + default: + BUG(); + } + + conn = rxrpc_get_connection_maybe(conn); + _leave(" = %p", conn); + return conn; } - read_unlock_bh(&peer->conn_lock); +not_found: _leave(" = NULL"); return NULL; -found: +found_service_conn: conn = rxrpc_get_connection_maybe(conn); read_unlock_bh(&peer->conn_lock); _leave(" = %p", conn); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index b993f2dc5a09..c2436476f793 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -626,32 +626,6 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) return 0; } -static struct rxrpc_connection *rxrpc_conn_from_local(struct rxrpc_local *local, - struct sk_buff *skb) -{ - struct rxrpc_peer *peer; - struct rxrpc_connection *conn; - struct sockaddr_rxrpc srx; - - rxrpc_get_addr_from_skb(local, skb, &srx); - rcu_read_lock(); - peer = rxrpc_lookup_peer_rcu(local, &srx); - if (!peer) - goto cant_find_peer; - - conn = rxrpc_find_connection(local, peer, skb); - rcu_read_unlock(); - if (!conn) - goto cant_find_conn; - - return conn; - -cant_find_peer: - rcu_read_unlock(); -cant_find_conn: - return NULL; -} - /* * handle data received on the local endpoint * - may be called in interrupt context @@ -731,7 +705,9 @@ void rxrpc_data_ready(struct sock *sk) * old-fashioned way doesn't really hurt */ struct rxrpc_connection *conn; - conn = rxrpc_conn_from_local(local, skb); + rcu_read_lock(); + conn = rxrpc_find_connection(local, skb); + rcu_read_unlock(); if (!conn) goto cant_route_call; diff --git a/net/rxrpc/utils.c b/net/rxrpc/utils.c index d3db02ecc37f..b88914d53ca5 100644 --- a/net/rxrpc/utils.c +++ b/net/rxrpc/utils.c @@ -14,33 +14,6 @@ #include #include "ar-internal.h" -/* - * Set up an RxRPC address from a socket buffer. - */ -void rxrpc_get_addr_from_skb(struct rxrpc_local *local, - const struct sk_buff *skb, - struct sockaddr_rxrpc *srx) -{ - memset(srx, 0, sizeof(*srx)); - srx->transport_type = local->srx.transport_type; - srx->transport.family = local->srx.transport.family; - - /* Can we see an ipv4 UDP packet on an ipv6 UDP socket? and vice - * versa? - */ - switch (srx->transport.family) { - case AF_INET: - srx->transport.sin.sin_port = udp_hdr(skb)->source; - srx->transport_len = sizeof(struct sockaddr_in); - memcpy(&srx->transport.sin.sin_addr, &ip_hdr(skb)->saddr, - sizeof(struct in_addr)); - break; - - default: - BUG(); - } -} - /* * Fill out a peer address from a socket buffer containing a packet. */ -- cgit v1.2.3 From 8496af50eb385c1cadff9ad396fd5359e96b6c27 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 1 Jul 2016 07:51:50 +0100 Subject: rxrpc: Use RCU to access a peer's service connection tree Move to using RCU access to a peer's service connection tree when routing an incoming packet. This is done using a seqlock to trigger retrying of the tree walk if a change happened. Further, we no longer get a ref on the connection looked up in the data_ready handler unless we queue the connection's work item - and then only if the refcount > 0. Note that I'm avoiding the use of a hash table for service connections because each service connection is addressed by a 62-bit number (constructed from epoch and connection ID >> 2) that would allow the client to engage in bucket stuffing, given knowledge of the hash algorithm. Peers, however, are hashed as the network address is less controllable by the client. The total number of peers will also be limited in a future commit. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 21 ++-- net/rxrpc/conn_client.c | 2 - net/rxrpc/conn_object.c | 65 +++++------- net/rxrpc/conn_service.c | 270 ++++++++++++++++++++++++++++------------------- net/rxrpc/input.c | 44 ++++---- net/rxrpc/peer_object.c | 2 +- 6 files changed, 224 insertions(+), 180 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 0fe63baf1286..e292276c8539 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -10,6 +10,7 @@ */ #include +#include #include #include #include @@ -206,7 +207,7 @@ struct rxrpc_peer { struct hlist_head error_targets; /* targets for net error distribution */ struct work_struct error_distributor; struct rb_root service_conns; /* Service connections */ - rwlock_t conn_lock; + seqlock_t service_conn_lock; spinlock_t lock; /* access lock */ unsigned int if_mtu; /* interface MTU for this peer */ unsigned int mtu; /* network MTU for this peer */ @@ -559,12 +560,10 @@ extern unsigned int rxrpc_connection_expiry; extern struct list_head rxrpc_connections; extern rwlock_t rxrpc_connection_lock; -void rxrpc_conn_hash_proto_key(struct rxrpc_conn_proto *); -void rxrpc_extract_conn_params(struct rxrpc_conn_proto *, - struct rxrpc_local *, struct sk_buff *); +int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *); struct rxrpc_connection *rxrpc_alloc_connection(gfp_t); -struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_local *, - struct sk_buff *); +struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *, + struct sk_buff *); void __rxrpc_disconnect_call(struct rxrpc_call *); void rxrpc_disconnect_call(struct rxrpc_call *); void rxrpc_put_connection(struct rxrpc_connection *); @@ -591,16 +590,20 @@ struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *con return atomic_inc_not_zero(&conn->usage) ? conn : NULL; } -static inline void rxrpc_queue_conn(struct rxrpc_connection *conn) +static inline bool rxrpc_queue_conn(struct rxrpc_connection *conn) { - if (rxrpc_get_connection_maybe(conn) && - !rxrpc_queue_work(&conn->processor)) + if (!rxrpc_get_connection_maybe(conn)) + return false; + if (!rxrpc_queue_work(&conn->processor)) rxrpc_put_connection(conn); + return true; } /* * conn_service.c */ +struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *, + struct sk_buff *); struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *, struct sockaddr_rxrpc *, struct sk_buff *); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 917db48d7f59..9e91f27b0d0f 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -132,8 +132,6 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) } conn->params = *cp; - conn->proto.epoch = rxrpc_epoch; - conn->proto.cid = 0; conn->out_clientflag = RXRPC_CLIENT_INITIATED; conn->state = RXRPC_CONN_CLIENT; diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 130713869a16..896d84493a05 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include "ar-internal.h" @@ -64,24 +63,30 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) } /* - * find a connection based on transport and RxRPC connection ID for an incoming - * packet + * Look up a connection in the cache by protocol parameters. + * + * If successful, a pointer to the connection is returned, but no ref is taken. + * NULL is returned if there is no match. + * + * The caller must be holding the RCU read lock. */ -struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_local *local, - struct sk_buff *skb) +struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, + struct sk_buff *skb) { struct rxrpc_connection *conn; struct rxrpc_conn_proto k; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct sockaddr_rxrpc srx; struct rxrpc_peer *peer; - struct rb_node *p; - _enter(",{%x,%x}", sp->hdr.cid, sp->hdr.flags); + _enter(",%x", sp->hdr.cid & RXRPC_CIDMASK); if (rxrpc_extract_addr_from_skb(&srx, skb) < 0) goto not_found; + k.epoch = sp->hdr.epoch; + k.cid = sp->hdr.cid & RXRPC_CIDMASK; + /* We may have to handle mixing IPv4 and IPv6 */ if (srx.transport.family != local->srx.transport.family) { pr_warn_ratelimited("AF_RXRPC: Protocol mismatch %u not %u\n", @@ -101,32 +106,23 @@ struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_local *local, peer = rxrpc_lookup_peer_rcu(local, &srx); if (!peer) goto not_found; - - read_lock_bh(&peer->conn_lock); - - p = peer->service_conns.rb_node; - while (p) { - conn = rb_entry(p, struct rxrpc_connection, service_node); - - _debug("maybe %x", conn->proto.cid); - - if (k.epoch < conn->proto.epoch) - p = p->rb_left; - else if (k.epoch > conn->proto.epoch) - p = p->rb_right; - else if (k.cid < conn->proto.cid) - p = p->rb_left; - else if (k.cid > conn->proto.cid) - p = p->rb_right; - else - goto found_service_conn; - } - read_unlock_bh(&peer->conn_lock); + conn = rxrpc_find_service_conn_rcu(peer, skb); + if (!conn || atomic_read(&conn->usage) == 0) + goto not_found; + _leave(" = %p", conn); + return conn; } else { + /* Look up client connections by connection ID alone as their + * IDs are unique for this machine. + */ conn = idr_find(&rxrpc_client_conn_ids, - k.cid >> RXRPC_CIDSHIFT); - if (!conn || - conn->proto.epoch != k.epoch || + sp->hdr.cid >> RXRPC_CIDSHIFT); + if (!conn || atomic_read(&conn->usage) == 0) { + _debug("no conn"); + goto not_found; + } + + if (conn->proto.epoch != k.epoch || conn->params.local != local) goto not_found; @@ -143,7 +139,6 @@ struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_local *local, BUG(); } - conn = rxrpc_get_connection_maybe(conn); _leave(" = %p", conn); return conn; } @@ -151,12 +146,6 @@ struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_local *local, not_found: _leave(" = NULL"); return NULL; - -found_service_conn: - conn = rxrpc_get_connection_maybe(conn); - read_unlock_bh(&peer->conn_lock); - _leave(" = %p", conn); - return conn; } /* diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index c6db2e8400a2..7cbd612be0d7 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -12,6 +12,112 @@ #include #include "ar-internal.h" +/* + * Find a service connection under RCU conditions. + * + * We could use a hash table, but that is subject to bucket stuffing by an + * attacker as the client gets to pick the epoch and cid values and would know + * the hash function. So, instead, we use a hash table for the peer and from + * that an rbtree to find the service connection. Under ordinary circumstances + * it might be slower than a large hash table, but it is at least limited in + * depth. + */ +struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *peer, + struct sk_buff *skb) +{ + struct rxrpc_connection *conn = NULL; + struct rxrpc_conn_proto k; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rb_node *p; + unsigned int seq = 0; + + k.epoch = sp->hdr.epoch; + k.cid = sp->hdr.cid & RXRPC_CIDMASK; + + do { + /* Unfortunately, rbtree walking doesn't give reliable results + * under just the RCU read lock, so we have to check for + * changes. + */ + read_seqbegin_or_lock(&peer->service_conn_lock, &seq); + + p = rcu_dereference_raw(peer->service_conns.rb_node); + while (p) { + conn = rb_entry(p, struct rxrpc_connection, service_node); + + if (conn->proto.index_key < k.index_key) + p = rcu_dereference_raw(p->rb_left); + else if (conn->proto.index_key > k.index_key) + p = rcu_dereference_raw(p->rb_right); + else + goto done; + conn = NULL; + } + } while (need_seqretry(&peer->service_conn_lock, seq)); + +done: + done_seqretry(&peer->service_conn_lock, seq); + _leave(" = %d", conn ? conn->debug_id : -1); + return conn; +} + +/* + * Insert a service connection into a peer's tree, thereby making it a target + * for incoming packets. + */ +static struct rxrpc_connection * +rxrpc_publish_service_conn(struct rxrpc_peer *peer, + struct rxrpc_connection *conn) +{ + struct rxrpc_connection *cursor = NULL; + struct rxrpc_conn_proto k = conn->proto; + struct rb_node **pp, *parent; + + write_seqlock_bh(&peer->service_conn_lock); + + pp = &peer->service_conns.rb_node; + parent = NULL; + while (*pp) { + parent = *pp; + cursor = rb_entry(parent, + struct rxrpc_connection, service_node); + + if (cursor->proto.index_key < k.index_key) + pp = &(*pp)->rb_left; + else if (cursor->proto.index_key > k.index_key) + pp = &(*pp)->rb_right; + else + goto found_extant_conn; + } + + rb_link_node_rcu(&conn->service_node, parent, pp); + rb_insert_color(&conn->service_node, &peer->service_conns); +conn_published: + set_bit(RXRPC_CONN_IN_SERVICE_CONNS, &conn->flags); + write_sequnlock_bh(&peer->service_conn_lock); + _leave(" = %d [new]", conn->debug_id); + return conn; + +found_extant_conn: + if (atomic_read(&cursor->usage) == 0) + goto replace_old_connection; + write_sequnlock_bh(&peer->service_conn_lock); + /* We should not be able to get here. rxrpc_incoming_connection() is + * called in a non-reentrant context, so there can't be a race to + * insert a new connection. + */ + BUG(); + +replace_old_connection: + /* The old connection is from an outdated epoch. */ + _debug("replace conn"); + rb_replace_node_rcu(&cursor->service_node, + &conn->service_node, + &peer->service_conns); + clear_bit(RXRPC_CONN_IN_SERVICE_CONNS, &cursor->flags); + goto conn_published; +} + /* * get a record of an incoming connection */ @@ -19,12 +125,10 @@ struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, struct sk_buff *skb) { - struct rxrpc_connection *conn, *candidate = NULL; + struct rxrpc_connection *conn; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_peer *peer; - struct rb_node *p, **pp; const char *new = "old"; - u32 epoch, cid; _enter(""); @@ -36,131 +140,79 @@ struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *local, ASSERT(sp->hdr.flags & RXRPC_CLIENT_INITIATED); - epoch = sp->hdr.epoch; - cid = sp->hdr.cid & RXRPC_CIDMASK; - - /* search the connection list first */ - read_lock_bh(&peer->conn_lock); - - p = peer->service_conns.rb_node; - while (p) { - conn = rb_entry(p, struct rxrpc_connection, service_node); - - _debug("maybe %x", conn->proto.cid); - - if (epoch < conn->proto.epoch) - p = p->rb_left; - else if (epoch > conn->proto.epoch) - p = p->rb_right; - else if (cid < conn->proto.cid) - p = p->rb_left; - else if (cid > conn->proto.cid) - p = p->rb_right; - else - goto found_extant_connection; + rcu_read_lock(); + peer = rxrpc_lookup_peer_rcu(local, srx); + if (peer) { + conn = rxrpc_find_service_conn_rcu(peer, skb); + if (conn) { + if (sp->hdr.securityIndex != conn->security_ix) + goto security_mismatch_rcu; + if (rxrpc_get_connection_maybe(conn)) + goto found_extant_connection_rcu; + + /* The conn has expired but we can't remove it without + * the appropriate lock, so we attempt to replace it + * when we have a new candidate. + */ + } + + if (!rxrpc_get_peer_maybe(peer)) + peer = NULL; } - read_unlock_bh(&peer->conn_lock); - - /* not yet present - create a candidate for a new record and then - * redo the search */ - candidate = rxrpc_alloc_connection(GFP_NOIO); - if (!candidate) { - rxrpc_put_peer(peer); - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); - } - - candidate->proto.epoch = sp->hdr.epoch; - candidate->proto.cid = sp->hdr.cid & RXRPC_CIDMASK; - candidate->params.local = local; - candidate->params.peer = peer; - candidate->params.service_id = sp->hdr.serviceId; - candidate->security_ix = sp->hdr.securityIndex; - candidate->out_clientflag = 0; - candidate->state = RXRPC_CONN_SERVICE; - if (candidate->params.service_id) - candidate->state = RXRPC_CONN_SERVICE_UNSECURED; - - write_lock_bh(&peer->conn_lock); - - pp = &peer->service_conns.rb_node; - p = NULL; - while (*pp) { - p = *pp; - conn = rb_entry(p, struct rxrpc_connection, service_node); + rcu_read_unlock(); - if (epoch < conn->proto.epoch) - pp = &(*pp)->rb_left; - else if (epoch > conn->proto.epoch) - pp = &(*pp)->rb_right; - else if (cid < conn->proto.cid) - pp = &(*pp)->rb_left; - else if (cid > conn->proto.cid) - pp = &(*pp)->rb_right; - else - goto found_extant_second; + if (!peer) { + peer = rxrpc_lookup_peer(local, srx, GFP_NOIO); + if (IS_ERR(peer)) + goto enomem; } - /* we can now add the new candidate to the list */ - set_bit(RXRPC_CONN_IN_SERVICE_CONNS, &candidate->flags); - rb_link_node(&candidate->service_node, p, pp); - rb_insert_color(&candidate->service_node, &peer->service_conns); -attached: - conn = candidate; - candidate = NULL; - rxrpc_get_peer(peer); - rxrpc_get_local(local); + /* We don't have a matching record yet. */ + conn = rxrpc_alloc_connection(GFP_NOIO); + if (!conn) + goto enomem_peer; + + conn->proto.epoch = sp->hdr.epoch; + conn->proto.cid = sp->hdr.cid & RXRPC_CIDMASK; + conn->params.local = local; + conn->params.peer = peer; + conn->params.service_id = sp->hdr.serviceId; + conn->security_ix = sp->hdr.securityIndex; + conn->out_clientflag = 0; + conn->state = RXRPC_CONN_SERVICE; + if (conn->params.service_id) + conn->state = RXRPC_CONN_SERVICE_UNSECURED; - write_unlock_bh(&peer->conn_lock); + rxrpc_get_local(local); write_lock(&rxrpc_connection_lock); list_add_tail(&conn->link, &rxrpc_connections); write_unlock(&rxrpc_connection_lock); + /* Make the connection a target for incoming packets. */ + rxrpc_publish_service_conn(peer, conn); + new = "new"; success: _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->proto.cid); - - rxrpc_put_peer(peer); _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage)); return conn; - /* we found the connection in the list immediately */ -found_extant_connection: - if (!rxrpc_get_connection_maybe(conn)) { - set_bit(RXRPC_CONN_IN_SERVICE_CONNS, &candidate->flags); - rb_replace_node(&conn->service_node, - &candidate->service_node, - &peer->service_conns); - clear_bit(RXRPC_CONN_IN_SERVICE_CONNS, &conn->flags); - goto attached; - } - - if (sp->hdr.securityIndex != conn->security_ix) { - read_unlock_bh(&peer->conn_lock); - goto security_mismatch_put; - } - read_unlock_bh(&peer->conn_lock); - goto success; - - /* we found the connection on the second time through the list */ -found_extant_second: - if (sp->hdr.securityIndex != conn->security_ix) { - write_unlock_bh(&peer->conn_lock); - goto security_mismatch; - } - rxrpc_get_connection(conn); - write_unlock_bh(&peer->conn_lock); - kfree(candidate); +found_extant_connection_rcu: + rcu_read_unlock(); goto success; -security_mismatch_put: - rxrpc_put_connection(conn); -security_mismatch: - kfree(candidate); +security_mismatch_rcu: + rcu_read_unlock(); _leave(" = -EKEYREJECTED"); return ERR_PTR(-EKEYREJECTED); + +enomem_peer: + rxrpc_put_peer(peer); +enomem: + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); } /* @@ -171,8 +223,8 @@ void rxrpc_unpublish_service_conn(struct rxrpc_connection *conn) { struct rxrpc_peer *peer = conn->params.peer; - write_lock_bh(&peer->conn_lock); + write_seqlock_bh(&peer->service_conn_lock); if (test_and_clear_bit(RXRPC_CONN_IN_SERVICE_CONNS, &conn->flags)) rb_erase(&conn->service_node, &peer->service_conns); - write_unlock_bh(&peer->conn_lock); + write_sequnlock_bh(&peer->service_conn_lock); } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index c2436476f793..991a20d25093 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -575,13 +575,13 @@ done: * post connection-level events to the connection * - this includes challenges, responses and some aborts */ -static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, +static bool rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, struct sk_buff *skb) { _enter("%p,%p", conn, skb); skb_queue_tail(&conn->rx_queue, skb); - rxrpc_queue_conn(conn); + return rxrpc_queue_conn(conn); } /* @@ -636,6 +636,7 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) */ void rxrpc_data_ready(struct sock *sk) { + struct rxrpc_connection *conn; struct rxrpc_skb_priv *sp; struct rxrpc_local *local = sk->sk_user_data; struct sk_buff *skb; @@ -699,36 +700,37 @@ void rxrpc_data_ready(struct sock *sk) (sp->hdr.callNumber == 0 || sp->hdr.seq == 0)) goto bad_message; - if (sp->hdr.callNumber == 0) { - /* This is a connection-level packet. These should be - * fairly rare, so the extra overhead of looking them up the - * old-fashioned way doesn't really hurt */ - struct rxrpc_connection *conn; - - rcu_read_lock(); - conn = rxrpc_find_connection(local, skb); - rcu_read_unlock(); - if (!conn) - goto cant_route_call; + rcu_read_lock(); + +retry_find_conn: + conn = rxrpc_find_connection_rcu(local, skb); + if (!conn) + goto cant_route_call; + if (sp->hdr.callNumber == 0) { + /* Connection-level packet */ _debug("CONN %p {%d}", conn, conn->debug_id); - rxrpc_post_packet_to_conn(conn, skb); - rxrpc_put_connection(conn); + if (!rxrpc_post_packet_to_conn(conn, skb)) + goto retry_find_conn; } else { - struct rxrpc_call *call; + /* Call-bound packets are routed by connection channel. */ + unsigned int channel = sp->hdr.cid & RXRPC_CHANNELMASK; + struct rxrpc_channel *chan = &conn->channels[channel]; + struct rxrpc_call *call = rcu_dereference(chan->call); - call = rxrpc_find_call_hash(&sp->hdr, local, - AF_INET, &ip_hdr(skb)->saddr); - if (call) - rxrpc_post_packet_to_call(call, skb); - else + if (!call || atomic_read(&call->usage) == 0) goto cant_route_call; + + rxrpc_post_packet_to_call(call, skb); } + rcu_read_unlock(); out: return; cant_route_call: + rcu_read_unlock(); + _debug("can't route call"); if (sp->hdr.flags & RXRPC_CLIENT_INITIATED && sp->hdr.type == RXRPC_PACKET_TYPE_DATA) { diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 01d4930a11f7..538e9831c699 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -189,7 +189,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) INIT_WORK(&peer->error_distributor, &rxrpc_peer_error_distributor); peer->service_conns = RB_ROOT; - rwlock_init(&peer->conn_lock); + seqlock_init(&peer->service_conn_lock); spin_lock_init(&peer->lock); peer->debug_id = atomic_inc_return(&rxrpc_debug_id); } -- cgit v1.2.3 From d440a1ce5d2cf9d90390f6c0d8badc4c0a4f8b6b Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 5 Jul 2016 10:57:10 +0100 Subject: rxrpc: Kill off the call hash table The call hash table is now no longer used as calls are looked up directly by channel slot on the connection, so kill it off. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 13 +--- net/rxrpc/call_object.c | 173 ------------------------------------------------ 2 files changed, 2 insertions(+), 184 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e292276c8539..1bb9e7ac9e14 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -461,19 +461,12 @@ struct rxrpc_call { #define RXRPC_ACKR_WINDOW_ASZ DIV_ROUND_UP(RXRPC_MAXACKS, BITS_PER_LONG) unsigned long ackr_window[RXRPC_ACKR_WINDOW_ASZ + 1]; - struct hlist_node hash_node; - unsigned long hash_key; /* Full hash key */ - u8 in_clientflag; /* Copy of conn->in_clientflag for hashing */ - struct rxrpc_local *local; /* Local endpoint. Used for hashing. */ - sa_family_t family; /* Frame protocol */ + u8 in_clientflag; /* Copy of conn->in_clientflag */ + struct rxrpc_local *local; /* Local endpoint. */ u32 call_id; /* call ID on connection */ u32 cid; /* connection ID plus channel index */ u32 epoch; /* epoch of this connection */ u16 service_id; /* service ID */ - union { /* Peer IP address for hashing */ - __be32 ipv4_addr; - __u8 ipv6_addr[16]; /* Anticipates eventual IPv6 support */ - } peer_ip; }; /* @@ -521,8 +514,6 @@ extern struct kmem_cache *rxrpc_call_jar; extern struct list_head rxrpc_calls; extern rwlock_t rxrpc_call_lock; -struct rxrpc_call *rxrpc_find_call_hash(struct rxrpc_host_header *, - void *, sa_family_t, const void *); struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long); struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index ebbd7dd5292f..91287c9d01bb 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -61,142 +60,6 @@ static void rxrpc_dead_call_expired(unsigned long _call); static void rxrpc_ack_time_expired(unsigned long _call); static void rxrpc_resend_time_expired(unsigned long _call); -static DEFINE_SPINLOCK(rxrpc_call_hash_lock); -static DEFINE_HASHTABLE(rxrpc_call_hash, 10); - -/* - * Hash function for rxrpc_call_hash - */ -static unsigned long rxrpc_call_hashfunc( - u8 in_clientflag, - u32 cid, - u32 call_id, - u32 epoch, - u16 service_id, - sa_family_t family, - void *localptr, - unsigned int addr_size, - const u8 *peer_addr) -{ - const u16 *p; - unsigned int i; - unsigned long key; - - _enter(""); - - key = (unsigned long)localptr; - /* We just want to add up the __be32 values, so forcing the - * cast should be okay. - */ - key += epoch; - key += service_id; - key += call_id; - key += (cid & RXRPC_CIDMASK) >> RXRPC_CIDSHIFT; - key += cid & RXRPC_CHANNELMASK; - key += in_clientflag; - key += family; - /* Step through the peer address in 16-bit portions for speed */ - for (i = 0, p = (const u16 *)peer_addr; i < addr_size >> 1; i++, p++) - key += *p; - _leave(" key = 0x%lx", key); - return key; -} - -/* - * Add a call to the hashtable - */ -static void rxrpc_call_hash_add(struct rxrpc_call *call) -{ - unsigned long key; - unsigned int addr_size = 0; - - _enter(""); - switch (call->family) { - case AF_INET: - addr_size = sizeof(call->peer_ip.ipv4_addr); - break; - case AF_INET6: - addr_size = sizeof(call->peer_ip.ipv6_addr); - break; - default: - break; - } - key = rxrpc_call_hashfunc(call->in_clientflag, call->cid, - call->call_id, call->epoch, - call->service_id, call->family, - call->conn->params.local, addr_size, - call->peer_ip.ipv6_addr); - /* Store the full key in the call */ - call->hash_key = key; - spin_lock(&rxrpc_call_hash_lock); - hash_add_rcu(rxrpc_call_hash, &call->hash_node, key); - spin_unlock(&rxrpc_call_hash_lock); - _leave(""); -} - -/* - * Remove a call from the hashtable - */ -static void rxrpc_call_hash_del(struct rxrpc_call *call) -{ - _enter(""); - spin_lock(&rxrpc_call_hash_lock); - hash_del_rcu(&call->hash_node); - spin_unlock(&rxrpc_call_hash_lock); - _leave(""); -} - -/* - * Find a call in the hashtable and return it, or NULL if it - * isn't there. - */ -struct rxrpc_call *rxrpc_find_call_hash( - struct rxrpc_host_header *hdr, - void *localptr, - sa_family_t family, - const void *peer_addr) -{ - unsigned long key; - unsigned int addr_size = 0; - struct rxrpc_call *call = NULL; - struct rxrpc_call *ret = NULL; - u8 in_clientflag = hdr->flags & RXRPC_CLIENT_INITIATED; - - _enter(""); - switch (family) { - case AF_INET: - addr_size = sizeof(call->peer_ip.ipv4_addr); - break; - case AF_INET6: - addr_size = sizeof(call->peer_ip.ipv6_addr); - break; - default: - break; - } - - key = rxrpc_call_hashfunc(in_clientflag, hdr->cid, hdr->callNumber, - hdr->epoch, hdr->serviceId, - family, localptr, addr_size, - peer_addr); - hash_for_each_possible_rcu(rxrpc_call_hash, call, hash_node, key) { - if (call->hash_key == key && - call->call_id == hdr->callNumber && - call->cid == hdr->cid && - call->in_clientflag == in_clientflag && - call->service_id == hdr->serviceId && - call->family == family && - call->local == localptr && - memcmp(call->peer_ip.ipv6_addr, peer_addr, - addr_size) == 0 && - call->epoch == hdr->epoch) { - ret = call; - break; - } - } - _leave(" = %p", ret); - return ret; -} - /* * find an extant server call * - called in process context with IRQs enabled @@ -305,20 +168,7 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, call->socket = rx; call->rx_data_post = 1; - /* Record copies of information for hashtable lookup */ - call->family = rx->family; call->local = rx->local; - switch (call->family) { - case AF_INET: - call->peer_ip.ipv4_addr = srx->transport.sin.sin_addr.s_addr; - break; - case AF_INET6: - memcpy(call->peer_ip.ipv6_addr, - srx->transport.sin6.sin6_addr.in6_u.u6_addr8, - sizeof(call->peer_ip.ipv6_addr)); - break; - } - call->service_id = srx->srx_service; call->in_clientflag = 0; @@ -345,9 +195,6 @@ static int rxrpc_begin_client_call(struct rxrpc_call *call, call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; - /* Add the new call to the hashtable */ - rxrpc_call_hash_add(call); - spin_lock(&call->conn->params.peer->lock); hlist_add_head(&call->error_link, &call->conn->params.peer->error_targets); spin_unlock(&call->conn->params.peer->lock); @@ -548,27 +395,10 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, list_add_tail(&call->link, &rxrpc_calls); write_unlock_bh(&rxrpc_call_lock); - /* Record copies of information for hashtable lookup */ - call->family = rx->family; call->local = conn->params.local; - switch (call->family) { - case AF_INET: - call->peer_ip.ipv4_addr = - conn->params.peer->srx.transport.sin.sin_addr.s_addr; - break; - case AF_INET6: - memcpy(call->peer_ip.ipv6_addr, - conn->params.peer->srx.transport.sin6.sin6_addr.in6_u.u6_addr8, - sizeof(call->peer_ip.ipv6_addr)); - break; - default: - break; - } call->epoch = conn->proto.epoch; call->service_id = conn->params.service_id; call->in_clientflag = RXRPC_CLIENT_INITIATED; - /* Add the new call to the hashtable */ - rxrpc_call_hash_add(call); _net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id); @@ -818,9 +648,6 @@ static void rxrpc_cleanup_call(struct rxrpc_call *call) ASSERTCMP(call->conn, ==, NULL); - /* Remove the call from the hash */ - rxrpc_call_hash_del(call); - if (call->acks_window) { _debug("kill Tx window %d", CIRC_CNT(call->acks_head, call->acks_tail, -- cgit v1.2.3 From f89e07d4cf2660a2956bc350a201398dda85284e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 6 Jul 2016 14:44:14 +0200 Subject: mac80211: agg-rx: refuse ADDBA Request with timeout update The current implementation of handling ADDBA Request while a session is already active with the peer is wrong - in case the peer is using the existing session's dialog token this should be treated as update to the session, which can update the timeout value. We don't really have a good way of supporting that, so reject, but implement the required behaviour in the spec of "Even if the updated ADDBA Request frame is not accepted, the original Block ACK setup remains active." (802.11-2012 10.5.4) Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 3a8f881b22f1..a9aff6079c42 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -306,6 +306,24 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, mutex_lock(&sta->ampdu_mlme.mtx); if (test_bit(tid, sta->ampdu_mlme.agg_session_valid)) { + tid_agg_rx = rcu_dereference_protected( + sta->ampdu_mlme.tid_rx[tid], + lockdep_is_held(&sta->ampdu_mlme.mtx)); + + if (tid_agg_rx->dialog_token == dialog_token) { + ht_dbg_ratelimited(sta->sdata, + "updated AddBA Req from %pM on tid %u\n", + sta->sta.addr, tid); + /* We have no API to update the timeout value in the + * driver so reject the timeout update. + */ + status = WLAN_STATUS_REQUEST_DECLINED; + ieee80211_send_addba_resp(sta->sdata, sta->sta.addr, + tid, dialog_token, status, + 1, buf_size, timeout); + goto end; + } + ht_dbg_ratelimited(sta->sdata, "unexpected AddBA Req from %pM on tid %u\n", sta->sta.addr, tid); -- cgit v1.2.3 From c6e6a0c8be575c830a97b1942dabeab70f423fe0 Mon Sep 17 00:00:00 2001 From: Aviya Erenfeld Date: Tue, 5 Jul 2016 15:23:08 +0300 Subject: nl80211: Add API to support VHT MU-MIMO air sniffer add API to support VHT MU-MIMO air sniffer. in MU-MIMO there are parallel frames on the air while the HW has only one RX. add the capability to sniff one of the MU-MIMO parallel frames by giving the sniffer additional information so it'll know which of the parallel frames it shall follow. Add attribute - NL80211_ATTR_MU_MIMO_GROUP_DATA - for getting a MU-MIMO groupID in order to monitor packets from that group using VHT MU-MIMO. And add attribute -NL80211_ATTR_MU_MIMO_FOLLOW_ADDR - for passing MAC address to monitor mode. that option will be used by VHT MU-MIMO air sniffer to follow a station according to it's MAC address using VHT MU-MIMO. Signed-off-by: Aviya Erenfeld Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 244d552d5647..447026f8cc76 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -405,6 +405,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_PBSS] = { .type = NLA_FLAG }, [NL80211_ATTR_BSS_SELECT] = { .type = NLA_NESTED }, [NL80211_ATTR_STA_SUPPORT_P2P_PS] = { .type = NLA_U8 }, + [NL80211_ATTR_MU_MIMO_GROUP_DATA] = { + .len = VHT_MUMIMO_GROUPS_DATA_LEN + }, + [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN }, }; /* policy for the key attributes */ @@ -2695,6 +2699,38 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) change = true; } + if (info->attrs[NL80211_ATTR_MU_MIMO_GROUP_DATA]) { + const u8 *mumimo_groups; + u32 cap_flag = NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER; + + if (!wiphy_ext_feature_isset(&rdev->wiphy, cap_flag)) + return -EOPNOTSUPP; + + mumimo_groups = + nla_data(info->attrs[NL80211_ATTR_MU_MIMO_GROUP_DATA]); + + /* bits 0 and 63 are reserved and must be zero */ + if ((mumimo_groups[0] & BIT(7)) || + (mumimo_groups[VHT_MUMIMO_GROUPS_DATA_LEN - 1] & BIT(0))) + return -EINVAL; + + memcpy(params.vht_mumimo_groups, mumimo_groups, + VHT_MUMIMO_GROUPS_DATA_LEN); + change = true; + } + + if (info->attrs[NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR]) { + u32 cap_flag = NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER; + + if (!wiphy_ext_feature_isset(&rdev->wiphy, cap_flag)) + return -EOPNOTSUPP; + + nla_memcpy(params.macaddr, + info->attrs[NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR], + ETH_ALEN); + change = true; + } + if (flags && (*flags & MONITOR_FLAG_ACTIVE) && !(rdev->wiphy.features & NL80211_FEATURE_ACTIVE_MONITOR)) return -EOPNOTSUPP; -- cgit v1.2.3 From 1d76250bd34af86c6498fc51e50cab3bfbbeceaa Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Tue, 5 Jul 2016 17:10:13 +0300 Subject: nl80211: support beacon report scanning Beacon report radio measurement requires reporting observed BSSs on the channels specified in the beacon request. If the measurement mode is set to passive or active, it requires actually performing a scan (passive or active, accordingly), and reporting the time that the scan was started and the time each beacon/probe was received (both in terms of TSF of the BSS of the requesting AP). If the request mode is table, this information is optional. In addition, the radio measurement request specifies the channel dwell time for the measurement. In order to use scan for beacon report when the mode is active or passive, add a parameter to scan request that specifies the channel dwell time, and add scan start time and beacon received time to scan results information. Supporting beacon report is required for Multi Band Operation (MBO). Signed-off-by: Assaf Krauss Signed-off-by: David Spinadel Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/scan.c | 9 +++++++-- net/wireless/core.c | 4 ++-- net/wireless/core.h | 12 ++++++++++++ net/wireless/nl80211.c | 27 +++++++++++++++++++++++++++ net/wireless/scan.c | 18 ++++++++++++------ net/wireless/trace.h | 33 +++++++++++++++++++++++++-------- 6 files changed, 85 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index f9648ef9e31f..4ec1c52a1549 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -353,8 +353,13 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->mtx)); - if (scan_req != local->int_scan_req) - cfg80211_scan_done(scan_req, aborted); + if (scan_req != local->int_scan_req) { + struct cfg80211_scan_info info = { + .aborted = aborted, + }; + + cfg80211_scan_done(scan_req, &info); + } RCU_INIT_POINTER(local->scan_req, NULL); scan_sdata = rcu_dereference_protected(local->scan_sdata, diff --git a/net/wireless/core.c b/net/wireless/core.c index 39d9abd309ea..7645e97362c0 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -220,7 +220,7 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, if (rdev->scan_req && rdev->scan_req->wdev == wdev) { if (WARN_ON(!rdev->scan_req->notified)) - rdev->scan_req->aborted = true; + rdev->scan_req->info.aborted = true; ___cfg80211_scan_done(rdev, false); } } @@ -1087,7 +1087,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, cfg80211_update_iface_num(rdev, wdev->iftype, -1); if (rdev->scan_req && rdev->scan_req->wdev == wdev) { if (WARN_ON(!rdev->scan_req->notified)) - rdev->scan_req->aborted = true; + rdev->scan_req->info.aborted = true; ___cfg80211_scan_done(rdev, false); } diff --git a/net/wireless/core.h b/net/wireless/core.h index a4d547f99f8d..eee91443924d 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -141,6 +141,18 @@ struct cfg80211_internal_bss { unsigned long refcount; atomic_t hold; + /* time at the start of the reception of the first octet of the + * timestamp field of the last beacon/probe received for this BSS. + * The time is the TSF of the BSS specified by %parent_bssid. + */ + u64 parent_tsf; + + /* the BSS according to which %parent_tsf is set. This is set to + * the BSS that the interface that requested the scan was connected to + * when the beacon/probe was received. + */ + u8 parent_bssid[ETH_ALEN] __aligned(2); + /* must be last because of priv member */ struct cfg80211_bss pub; }; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 447026f8cc76..c53b5462ed00 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6223,6 +6223,19 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) } } + if (info->attrs[NL80211_ATTR_MEASUREMENT_DURATION]) { + if (!wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_SET_SCAN_DWELL)) { + err = -EOPNOTSUPP; + goto out_free; + } + + request->duration = + nla_get_u16(info->attrs[NL80211_ATTR_MEASUREMENT_DURATION]); + request->duration_mandatory = + nla_get_flag(info->attrs[NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY]); + } + if (info->attrs[NL80211_ATTR_SCAN_FLAGS]) { request->flags = nla_get_u32( info->attrs[NL80211_ATTR_SCAN_FLAGS]); @@ -7056,6 +7069,13 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, jiffies_to_msecs(jiffies - intbss->ts))) goto nla_put_failure; + if (intbss->parent_tsf && + (nla_put_u64_64bit(msg, NL80211_BSS_PARENT_TSF, + intbss->parent_tsf, NL80211_BSS_PAD) || + nla_put(msg, NL80211_BSS_PARENT_BSSID, ETH_ALEN, + intbss->parent_bssid))) + goto nla_put_failure; + if (intbss->ts_boottime && nla_put_u64_64bit(msg, NL80211_BSS_LAST_SEEN_BOOTTIME, intbss->ts_boottime, NL80211_BSS_PAD)) @@ -11829,6 +11849,13 @@ static int nl80211_add_scan_req(struct sk_buff *msg, nla_put_u32(msg, NL80211_ATTR_SCAN_FLAGS, req->flags)) goto nla_put_failure; + if (req->info.scan_start_tsf && + (nla_put_u64_64bit(msg, NL80211_ATTR_SCAN_START_TIME_TSF, + req->info.scan_start_tsf, NL80211_BSS_PAD) || + nla_put(msg, NL80211_ATTR_SCAN_START_TIME_TSF_BSSID, ETH_ALEN, + req->info.tsf_bssid))) + goto nla_put_failure; + return 0; nla_put_failure: return -ENOBUFS; diff --git a/net/wireless/scan.c b/net/wireless/scan.c index ef2955c89a00..0358e12be54b 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -3,6 +3,7 @@ * * Copyright 2008 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright 2016 Intel Deutschland GmbH */ #include #include @@ -194,7 +195,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, if (wdev->netdev) cfg80211_sme_scan_done(wdev->netdev); - if (!request->aborted && + if (!request->info.aborted && request->flags & NL80211_SCAN_FLAG_FLUSH) { /* flush entries from previous scans */ spin_lock_bh(&rdev->bss_lock); @@ -202,10 +203,10 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, spin_unlock_bh(&rdev->bss_lock); } - msg = nl80211_build_scan_msg(rdev, wdev, request->aborted); + msg = nl80211_build_scan_msg(rdev, wdev, request->info.aborted); #ifdef CONFIG_CFG80211_WEXT - if (wdev->netdev && !request->aborted) { + if (wdev->netdev && !request->info.aborted) { memset(&wrqu, 0, sizeof(wrqu)); wireless_send_event(wdev->netdev, SIOCGIWSCAN, &wrqu, NULL); @@ -236,12 +237,13 @@ void __cfg80211_scan_done(struct work_struct *wk) rtnl_unlock(); } -void cfg80211_scan_done(struct cfg80211_scan_request *request, bool aborted) +void cfg80211_scan_done(struct cfg80211_scan_request *request, + struct cfg80211_scan_info *info) { - trace_cfg80211_scan_done(request, aborted); + trace_cfg80211_scan_done(request, info); WARN_ON(request != wiphy_to_rdev(request->wiphy)->scan_req); - request->aborted = aborted; + request->info = *info; request->notified = true; queue_work(cfg80211_wq, &wiphy_to_rdev(request->wiphy)->scan_done_wk); } @@ -843,6 +845,8 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, found->pub.capability = tmp->pub.capability; found->ts = tmp->ts; found->ts_boottime = tmp->ts_boottime; + found->parent_tsf = tmp->parent_tsf; + ether_addr_copy(found->parent_bssid, tmp->parent_bssid); } else { struct cfg80211_internal_bss *new; struct cfg80211_internal_bss *hidden; @@ -1086,6 +1090,8 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, tmp.pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int); tmp.pub.capability = le16_to_cpu(mgmt->u.probe_resp.capab_info); tmp.ts_boottime = data->boottime_ns; + tmp.parent_tsf = data->parent_tsf; + ether_addr_copy(tmp.parent_bssid, data->parent_bssid); signal_valid = abs(data->chan->center_freq - channel->center_freq) <= wiphy->max_adj_channel_rssi_comp; diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 3c1091ae6c36..72b5255cefe2 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2642,8 +2642,9 @@ TRACE_EVENT(cfg80211_tdls_oper_request, ); TRACE_EVENT(cfg80211_scan_done, - TP_PROTO(struct cfg80211_scan_request *request, bool aborted), - TP_ARGS(request, aborted), + TP_PROTO(struct cfg80211_scan_request *request, + struct cfg80211_scan_info *info), + TP_ARGS(request, info), TP_STRUCT__entry( __field(u32, n_channels) __dynamic_array(u8, ie, request ? request->ie_len : 0) @@ -2652,6 +2653,8 @@ TRACE_EVENT(cfg80211_scan_done, MAC_ENTRY(wiphy_mac) __field(bool, no_cck) __field(bool, aborted) + __field(u64, scan_start_tsf) + MAC_ENTRY(tsf_bssid) ), TP_fast_assign( if (request) { @@ -2666,9 +2669,16 @@ TRACE_EVENT(cfg80211_scan_done, request->wiphy->perm_addr); __entry->no_cck = request->no_cck; } - __entry->aborted = aborted; + if (info) { + __entry->aborted = info->aborted; + __entry->scan_start_tsf = info->scan_start_tsf; + MAC_ASSIGN(tsf_bssid, info->tsf_bssid); + } ), - TP_printk("aborted: %s", BOOL_TO_STR(__entry->aborted)) + TP_printk("aborted: %s, scan start (TSF): %llu, tsf_bssid: " MAC_PR_FMT, + BOOL_TO_STR(__entry->aborted), + (unsigned long long)__entry->scan_start_tsf, + MAC_PR_ARG(tsf_bssid)) ); DEFINE_EVENT(wiphy_only_evt, cfg80211_sched_scan_results, @@ -2721,6 +2731,8 @@ TRACE_EVENT(cfg80211_inform_bss_frame, __dynamic_array(u8, mgmt, len) __field(s32, signal) __field(u64, ts_boottime) + __field(u64, parent_tsf) + MAC_ENTRY(parent_bssid) ), TP_fast_assign( WIPHY_ASSIGN; @@ -2730,10 +2742,15 @@ TRACE_EVENT(cfg80211_inform_bss_frame, memcpy(__get_dynamic_array(mgmt), mgmt, len); __entry->signal = data->signal; __entry->ts_boottime = data->boottime_ns; - ), - TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT "(scan_width: %d) signal: %d, tsb:%llu", - WIPHY_PR_ARG, CHAN_PR_ARG, __entry->scan_width, - __entry->signal, (unsigned long long)__entry->ts_boottime) + __entry->parent_tsf = data->parent_tsf; + MAC_ASSIGN(parent_bssid, data->parent_bssid); + ), + TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT + "(scan_width: %d) signal: %d, tsb:%llu, detect_tsf:%llu, tsf_bssid: " + MAC_PR_FMT, WIPHY_PR_ARG, CHAN_PR_ARG, __entry->scan_width, + __entry->signal, (unsigned long long)__entry->ts_boottime, + (unsigned long long)__entry->parent_tsf, + MAC_PR_ARG(parent_bssid)) ); DECLARE_EVENT_CLASS(cfg80211_bss_evt, -- cgit v1.2.3 From 7947d3e075cde1a18e538f2dafbc850aa356ff79 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Tue, 5 Jul 2016 15:23:12 +0300 Subject: mac80211: Add support for beacon report radio measurement Add the following to support beacon report radio measurement with the measurement mode field set to passive or active: 1. Propagate the required scan duration to the device 2. Report the scan start time (in terms of TSF) 3. Report each BSS's detection time (also in terms of TSF) TSF times refer to the BSS that the interface that requested the scan is connected to. Signed-off-by: Assaf Krauss Signed-off-by: Avraham Stern [changed ath9k/10k, at76c59x-usb, iwlegacy, wl1251 and wlcore to match the new API] Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 1 + net/mac80211/scan.c | 42 ++++++++++++++++++++++++++++++++++-------- 2 files changed, 35 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 54edfb6fc1d1..f56d342c31b8 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1250,6 +1250,7 @@ struct ieee80211_local { int scan_channel_idx; int scan_ies_len; int hw_scan_ies_bufsize; + struct cfg80211_scan_info scan_info; struct work_struct sched_scan_stopped_work; struct ieee80211_sub_if_data __rcu *sched_scan_sdata; diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 4ec1c52a1549..8d4a9cd8a39a 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -7,6 +7,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu * Copyright 2013-2015 Intel Mobile Communications GmbH + * Copyright 2016 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -70,6 +71,7 @@ ieee80211_bss_info_update(struct ieee80211_local *local, .boottime_ns = rx_status->boottime_ns, }; bool signal_valid; + struct ieee80211_sub_if_data *scan_sdata; if (ieee80211_hw_check(&local->hw, SIGNAL_DBM)) bss_meta.signal = rx_status->signal * 100; @@ -83,6 +85,20 @@ ieee80211_bss_info_update(struct ieee80211_local *local, bss_meta.scan_width = NL80211_BSS_CHAN_WIDTH_10; bss_meta.chan = channel; + + rcu_read_lock(); + scan_sdata = rcu_dereference(local->scan_sdata); + if (scan_sdata && scan_sdata->vif.type == NL80211_IFTYPE_STATION && + scan_sdata->vif.bss_conf.assoc && + ieee80211_have_rx_timestamp(rx_status)) { + bss_meta.parent_tsf = + ieee80211_calculate_rx_timestamp(local, rx_status, + len + FCS_LEN, 24); + ether_addr_copy(bss_meta.parent_bssid, + scan_sdata->vif.bss_conf.bssid); + } + rcu_read_unlock(); + cbss = cfg80211_inform_bss_frame_data(local->hw.wiphy, &bss_meta, mgmt, len, GFP_ATOMIC); if (!cbss) @@ -345,6 +361,11 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) if (rc == 0) return; + + /* HW scan failed and is going to be reported as done, so clear + * old scan info. + */ + memset(&local->scan_info, 0, sizeof(local->scan_info)); } kfree(local->hw_scan_req); @@ -354,11 +375,8 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) lockdep_is_held(&local->mtx)); if (scan_req != local->int_scan_req) { - struct cfg80211_scan_info info = { - .aborted = aborted, - }; - - cfg80211_scan_done(scan_req, &info); + local->scan_info.aborted = aborted; + cfg80211_scan_done(scan_req, &local->scan_info); } RCU_INIT_POINTER(local->scan_req, NULL); @@ -396,15 +414,19 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) ieee80211_start_next_roc(local); } -void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) +void ieee80211_scan_completed(struct ieee80211_hw *hw, + struct cfg80211_scan_info *info) { struct ieee80211_local *local = hw_to_local(hw); - trace_api_scan_completed(local, aborted); + trace_api_scan_completed(local, info); set_bit(SCAN_COMPLETED, &local->scanning); - if (aborted) + if (info->aborted) set_bit(SCAN_ABORTED, &local->scanning); + + memcpy(&local->scan_info, info, sizeof(*info)); + ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0); } EXPORT_SYMBOL(ieee80211_scan_completed); @@ -571,6 +593,9 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, local->hw_scan_req->req.ie = ies; local->hw_scan_req->req.flags = req->flags; eth_broadcast_addr(local->hw_scan_req->req.bssid); + local->hw_scan_req->req.duration = req->duration; + local->hw_scan_req->req.duration_mandatory = + req->duration_mandatory; local->hw_scan_band = 0; @@ -1078,6 +1103,7 @@ void ieee80211_scan_cancel(struct ieee80211_local *local) */ cancel_delayed_work(&local->scan_work); /* and clean up */ + memset(&local->scan_info, 0, sizeof(local->scan_info)); __ieee80211_scan_completed(&local->hw, true); out: mutex_unlock(&local->mtx); -- cgit v1.2.3 From 7d10f6b179bc82e6633a4521a4cd69ad6846723e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 5 Jul 2016 15:23:13 +0300 Subject: mac80211: report failure to start (partial) scan as scan abort Rather than reporting the scan as having completed, report it as being aborted. Signed-off-by: Johannes Berg --- net/mac80211/scan.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 8d4a9cd8a39a..070b40f15850 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -362,10 +362,11 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) if (rc == 0) return; - /* HW scan failed and is going to be reported as done, so clear - * old scan info. + /* HW scan failed and is going to be reported as aborted, + * so clear old scan info. */ memset(&local->scan_info, 0, sizeof(local->scan_info)); + aborted = true; } kfree(local->hw_scan_req); -- cgit v1.2.3 From 92b3a28a2b4b2fb777b64f0891a4d3388f617c15 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 5 Jul 2016 15:23:14 +0300 Subject: mac80211: parse wide bandwidth channel switch IE with workaround Continuing the workaround implemented in commit 23665aaf9170 ("mac80211: Interoperability workaround for 80+80 and 160 MHz channels") use the same code to parse the Wide Bandwidth Channel Switch element by converting to VHT Operation element since the spec also just refers to that for parsing semantics, particularly with the workaround. While at it, remove some dead code - the IEEE80211_STA_DISABLE_40MHZ flag can never be set at this point since it's checked earlier and the wide_bw_chansw_ie pointer is set to NULL if it's set. Signed-off-by: Johannes Berg --- net/mac80211/spectmgmt.c | 45 +++++++++++++++++---------------------------- 1 file changed, 17 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c index 2ddc661f0988..97f4c9d6b54c 100644 --- a/net/mac80211/spectmgmt.c +++ b/net/mac80211/spectmgmt.c @@ -129,42 +129,31 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, } if (wide_bw_chansw_ie) { - new_vht_chandef.chan = new_chan; - new_vht_chandef.center_freq1 = - ieee80211_channel_to_frequency( + struct ieee80211_vht_operation vht_oper = { + .chan_width = + wide_bw_chansw_ie->new_channel_width, + .center_freq_seg1_idx = wide_bw_chansw_ie->new_center_freq_seg0, - new_band); - - switch (wide_bw_chansw_ie->new_channel_width) { - default: - /* hmmm, ignore VHT and use HT if present */ - case IEEE80211_VHT_CHANWIDTH_USE_HT: + .center_freq_seg2_idx = + wide_bw_chansw_ie->new_center_freq_seg1, + /* .basic_mcs_set doesn't matter */ + }; + + /* default, for the case of IEEE80211_VHT_CHANWIDTH_USE_HT, + * to the previously parsed chandef + */ + new_vht_chandef = csa_ie->chandef; + + /* ignore if parsing fails */ + if (!ieee80211_chandef_vht_oper(&vht_oper, &new_vht_chandef)) new_vht_chandef.chan = NULL; - break; - case IEEE80211_VHT_CHANWIDTH_80MHZ: - new_vht_chandef.width = NL80211_CHAN_WIDTH_80; - break; - case IEEE80211_VHT_CHANWIDTH_160MHZ: - new_vht_chandef.width = NL80211_CHAN_WIDTH_160; - break; - case IEEE80211_VHT_CHANWIDTH_80P80MHZ: - /* field is otherwise reserved */ - new_vht_chandef.center_freq2 = - ieee80211_channel_to_frequency( - wide_bw_chansw_ie->new_center_freq_seg1, - new_band); - new_vht_chandef.width = NL80211_CHAN_WIDTH_80P80; - break; - } + if (sta_flags & IEEE80211_STA_DISABLE_80P80MHZ && new_vht_chandef.width == NL80211_CHAN_WIDTH_80P80) ieee80211_chandef_downgrade(&new_vht_chandef); if (sta_flags & IEEE80211_STA_DISABLE_160MHZ && new_vht_chandef.width == NL80211_CHAN_WIDTH_160) ieee80211_chandef_downgrade(&new_vht_chandef); - if (sta_flags & IEEE80211_STA_DISABLE_40MHZ && - new_vht_chandef.width > NL80211_CHAN_WIDTH_20) - ieee80211_chandef_downgrade(&new_vht_chandef); } /* if VHT data is there validate & use it */ -- cgit v1.2.3 From 7d27a0ba7adc8ef30c2aae7592fce4c162aee4df Mon Sep 17 00:00:00 2001 From: Masashi Honma Date: Fri, 1 Jul 2016 10:19:34 +0900 Subject: cfg80211: Add mesh peer AID setting API Previously, mesh power management functionality works only with kernel MPM. Because user space MPM did not report mesh peer AID to kernel, the kernel could not identify the bit in TIM element. So this patch adds mesh peer AID setting API. Signed-off-by: Masashi Honma Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 1 + net/wireless/nl80211.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 0c12e4001f19..47e99ab8d97a 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -997,6 +997,7 @@ static void sta_apply_mesh_params(struct ieee80211_local *local, if (sta->mesh->plink_state != NL80211_PLINK_ESTAB) changed = mesh_plink_inc_estab_count(sdata); sta->mesh->plink_state = params->plink_state; + sta->mesh->aid = params->peer_aid; ieee80211_mps_sta_status_update(sta); changed |= ieee80211_mps_set_sta_local_pm(sta, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index c53b5462ed00..5782f718d567 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4446,6 +4446,12 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_STATE]); if (params.plink_state >= NUM_NL80211_PLINK_STATES) return -EINVAL; + if (info->attrs[NL80211_ATTR_MESH_PEER_AID]) { + params.peer_aid = nla_get_u16( + info->attrs[NL80211_ATTR_MESH_PEER_AID]); + if (params.peer_aid > IEEE80211_MAX_AID) + return -EINVAL; + } params.sta_modify_mask |= STATION_PARAM_APPLY_PLINK_STATE; } -- cgit v1.2.3 From be2cef49904b34dd5f75d96bbc8cd8341bab1bc0 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Fri, 3 Jun 2016 17:56:50 +0200 Subject: ipvs: count pre-established TCP states as active Some users observed that "least connection" distribution algorithm doesn't handle well bursts of TCP connections from reconnecting clients after a node or network failure. This is because the algorithm counts active connection as worth 256 inactive ones where for TCP, "active" only means TCP connections in ESTABLISHED state. In case of a connection burst, new connections are handled before previous ones have finished the three way handshaking so that all are still counted as "inactive", i.e. cheap ones. The become "active" quickly but at that time, all of them are already assigned to one real server (or few), resulting in highly unbalanced distribution. Address this by counting the "pre-established" states as "active". Signed-off-by: Michal Kubecek Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_proto_tcp.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index d7024b2ed769..5117bcb7d2f0 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -395,6 +395,20 @@ static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_LAST] = "BUG!", }; +static const bool tcp_state_active_table[IP_VS_TCP_S_LAST] = { + [IP_VS_TCP_S_NONE] = false, + [IP_VS_TCP_S_ESTABLISHED] = true, + [IP_VS_TCP_S_SYN_SENT] = true, + [IP_VS_TCP_S_SYN_RECV] = true, + [IP_VS_TCP_S_FIN_WAIT] = false, + [IP_VS_TCP_S_TIME_WAIT] = false, + [IP_VS_TCP_S_CLOSE] = false, + [IP_VS_TCP_S_CLOSE_WAIT] = false, + [IP_VS_TCP_S_LAST_ACK] = false, + [IP_VS_TCP_S_LISTEN] = false, + [IP_VS_TCP_S_SYNACK] = true, +}; + #define sNO IP_VS_TCP_S_NONE #define sES IP_VS_TCP_S_ESTABLISHED #define sSS IP_VS_TCP_S_SYN_SENT @@ -418,6 +432,13 @@ static const char * tcp_state_name(int state) return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?"; } +static bool tcp_state_active(int state) +{ + if (state >= IP_VS_TCP_S_LAST) + return false; + return tcp_state_active_table[state]; +} + static struct tcp_states_t tcp_states [] = { /* INPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ @@ -540,12 +561,12 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, if (dest) { if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (new_state != IP_VS_TCP_S_ESTABLISHED)) { + !tcp_state_active(new_state)) { atomic_dec(&dest->activeconns); atomic_inc(&dest->inactconns); cp->flags |= IP_VS_CONN_F_INACTIVE; } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && - (new_state == IP_VS_TCP_S_ESTABLISHED)) { + tcp_state_active(new_state)) { atomic_inc(&dest->activeconns); atomic_dec(&dest->inactconns); cp->flags &= ~IP_VS_CONN_F_INACTIVE; -- cgit v1.2.3 From a4770e1117f193c3e27f5f046cd4f8e2470f3b70 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 26 Jun 2016 14:55:23 -0700 Subject: Bluetooth: Switch SMP to crypto_cipher_encrypt_one() SMP does ECB crypto on stack buffers. This is complicated and fragile, and it will not work if the stack is virtually allocated. Switch to the crypto_cipher interface, which is simpler and safer. Signed-off-by: Andy Lutomirski Acked-by: Herbert Xu Acked-by: Johan Hedberg Tested-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 67 ++++++++++++++++++++++------------------------------- 1 file changed, 28 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 50976a6481f3..4c1a16a96ae5 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -22,9 +22,9 @@ #include #include +#include #include #include -#include #include #include @@ -88,7 +88,7 @@ struct smp_dev { u8 min_key_size; u8 max_key_size; - struct crypto_skcipher *tfm_aes; + struct crypto_cipher *tfm_aes; struct crypto_shash *tfm_cmac; }; @@ -127,7 +127,7 @@ struct smp_chan { u8 dhkey[32]; u8 mackey[16]; - struct crypto_skcipher *tfm_aes; + struct crypto_cipher *tfm_aes; struct crypto_shash *tfm_cmac; }; @@ -361,10 +361,8 @@ static int smp_h6(struct crypto_shash *tfm_cmac, const u8 w[16], * s1 and ah. */ -static int smp_e(struct crypto_skcipher *tfm, const u8 *k, u8 *r) +static int smp_e(struct crypto_cipher *tfm, const u8 *k, u8 *r) { - SKCIPHER_REQUEST_ON_STACK(req, tfm); - struct scatterlist sg; uint8_t tmp[16], data[16]; int err; @@ -378,7 +376,7 @@ static int smp_e(struct crypto_skcipher *tfm, const u8 *k, u8 *r) /* The most significant octet of key corresponds to k[0] */ swap_buf(k, tmp, 16); - err = crypto_skcipher_setkey(tfm, tmp, 16); + err = crypto_cipher_setkey(tfm, tmp, 16); if (err) { BT_ERR("cipher setkey failed: %d", err); return err; @@ -387,16 +385,7 @@ static int smp_e(struct crypto_skcipher *tfm, const u8 *k, u8 *r) /* Most significant octet of plaintextData corresponds to data[0] */ swap_buf(r, data, 16); - sg_init_one(&sg, data, 16); - - skcipher_request_set_tfm(req, tfm); - skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sg, &sg, 16, NULL); - - err = crypto_skcipher_encrypt(req); - skcipher_request_zero(req); - if (err) - BT_ERR("Encrypt data error %d", err); + crypto_cipher_encrypt_one(tfm, data, data); /* Most significant octet of encryptedData corresponds to data[0] */ swap_buf(data, r, 16); @@ -406,7 +395,7 @@ static int smp_e(struct crypto_skcipher *tfm, const u8 *k, u8 *r) return err; } -static int smp_c1(struct crypto_skcipher *tfm_aes, const u8 k[16], +static int smp_c1(struct crypto_cipher *tfm_aes, const u8 k[16], const u8 r[16], const u8 preq[7], const u8 pres[7], u8 _iat, const bdaddr_t *ia, u8 _rat, const bdaddr_t *ra, u8 res[16]) { @@ -455,7 +444,7 @@ static int smp_c1(struct crypto_skcipher *tfm_aes, const u8 k[16], return err; } -static int smp_s1(struct crypto_skcipher *tfm_aes, const u8 k[16], +static int smp_s1(struct crypto_cipher *tfm_aes, const u8 k[16], const u8 r1[16], const u8 r2[16], u8 _r[16]) { int err; @@ -471,7 +460,7 @@ static int smp_s1(struct crypto_skcipher *tfm_aes, const u8 k[16], return err; } -static int smp_ah(struct crypto_skcipher *tfm, const u8 irk[16], +static int smp_ah(struct crypto_cipher *tfm, const u8 irk[16], const u8 r[3], u8 res[3]) { u8 _res[16]; @@ -759,7 +748,7 @@ static void smp_chan_destroy(struct l2cap_conn *conn) kzfree(smp->slave_csrk); kzfree(smp->link_key); - crypto_free_skcipher(smp->tfm_aes); + crypto_free_cipher(smp->tfm_aes); crypto_free_shash(smp->tfm_cmac); /* Ensure that we don't leave any debug key around if debug key @@ -1359,9 +1348,9 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) if (!smp) return NULL; - smp->tfm_aes = crypto_alloc_skcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC); + smp->tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(smp->tfm_aes)) { - BT_ERR("Unable to create ECB crypto context"); + BT_ERR("Unable to create AES crypto context"); kzfree(smp); return NULL; } @@ -1369,7 +1358,7 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) smp->tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0); if (IS_ERR(smp->tfm_cmac)) { BT_ERR("Unable to create CMAC crypto context"); - crypto_free_skcipher(smp->tfm_aes); + crypto_free_cipher(smp->tfm_aes); kzfree(smp); return NULL; } @@ -3120,7 +3109,7 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) { struct l2cap_chan *chan; struct smp_dev *smp; - struct crypto_skcipher *tfm_aes; + struct crypto_cipher *tfm_aes; struct crypto_shash *tfm_cmac; if (cid == L2CAP_CID_SMP_BREDR) { @@ -3132,9 +3121,9 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) if (!smp) return ERR_PTR(-ENOMEM); - tfm_aes = crypto_alloc_skcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC); + tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm_aes)) { - BT_ERR("Unable to create ECB crypto context"); + BT_ERR("Unable to create AES crypto context"); kzfree(smp); return ERR_CAST(tfm_aes); } @@ -3142,7 +3131,7 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0); if (IS_ERR(tfm_cmac)) { BT_ERR("Unable to create CMAC crypto context"); - crypto_free_skcipher(tfm_aes); + crypto_free_cipher(tfm_aes); kzfree(smp); return ERR_CAST(tfm_cmac); } @@ -3156,7 +3145,7 @@ create_chan: chan = l2cap_chan_create(); if (!chan) { if (smp) { - crypto_free_skcipher(smp->tfm_aes); + crypto_free_cipher(smp->tfm_aes); crypto_free_shash(smp->tfm_cmac); kzfree(smp); } @@ -3203,7 +3192,7 @@ static void smp_del_chan(struct l2cap_chan *chan) smp = chan->data; if (smp) { chan->data = NULL; - crypto_free_skcipher(smp->tfm_aes); + crypto_free_cipher(smp->tfm_aes); crypto_free_shash(smp->tfm_cmac); kzfree(smp); } @@ -3440,7 +3429,7 @@ void smp_unregister(struct hci_dev *hdev) #if IS_ENABLED(CONFIG_BT_SELFTEST_SMP) -static int __init test_ah(struct crypto_skcipher *tfm_aes) +static int __init test_ah(struct crypto_cipher *tfm_aes) { const u8 irk[16] = { 0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34, @@ -3460,7 +3449,7 @@ static int __init test_ah(struct crypto_skcipher *tfm_aes) return 0; } -static int __init test_c1(struct crypto_skcipher *tfm_aes) +static int __init test_c1(struct crypto_cipher *tfm_aes) { const u8 k[16] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -3490,7 +3479,7 @@ static int __init test_c1(struct crypto_skcipher *tfm_aes) return 0; } -static int __init test_s1(struct crypto_skcipher *tfm_aes) +static int __init test_s1(struct crypto_cipher *tfm_aes) { const u8 k[16] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -3686,7 +3675,7 @@ static const struct file_operations test_smp_fops = { .llseek = default_llseek, }; -static int __init run_selftests(struct crypto_skcipher *tfm_aes, +static int __init run_selftests(struct crypto_cipher *tfm_aes, struct crypto_shash *tfm_cmac) { ktime_t calltime, delta, rettime; @@ -3764,27 +3753,27 @@ done: int __init bt_selftest_smp(void) { - struct crypto_skcipher *tfm_aes; + struct crypto_cipher *tfm_aes; struct crypto_shash *tfm_cmac; int err; - tfm_aes = crypto_alloc_skcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC); + tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm_aes)) { - BT_ERR("Unable to create ECB crypto context"); + BT_ERR("Unable to create AES crypto context"); return PTR_ERR(tfm_aes); } tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm_cmac)) { BT_ERR("Unable to create CMAC crypto context"); - crypto_free_skcipher(tfm_aes); + crypto_free_cipher(tfm_aes); return PTR_ERR(tfm_cmac); } err = run_selftests(tfm_aes, tfm_cmac); crypto_free_shash(tfm_cmac); - crypto_free_skcipher(tfm_aes); + crypto_free_cipher(tfm_aes); return err; } -- cgit v1.2.3 From 929946a471c1d5c6c595b4094f4c56bdfceee9c7 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 21 Jun 2016 10:27:18 +0200 Subject: 6lowpan: ndisc: fix double read unlock This patch removes a double unlock case to accessing neighbour private data. Reported-by: Dan Carpenter Signed-off-by: Alexander Aring Reviewed-by: Stefan Schmidt Signed-off-by: Marcel Holtmann --- net/6lowpan/ndisc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/6lowpan/ndisc.c b/net/6lowpan/ndisc.c index ae1d4199aa4c..030504e031b6 100644 --- a/net/6lowpan/ndisc.c +++ b/net/6lowpan/ndisc.c @@ -135,8 +135,9 @@ static int lowpan_ndisc_opt_addr_space(const struct net_device *dev, read_unlock_bh(&neigh->lock); addr_space += __ndisc_opt_addr_space(IEEE802154_SHORT_ADDR_LEN, 0); *ha = ha_buf; + } else { + read_unlock_bh(&neigh->lock); } - read_unlock_bh(&neigh->lock); break; case NDISC_NEIGHBOUR_ADVERTISEMENT: case NDISC_NEIGHBOUR_SOLICITATION: -- cgit v1.2.3 From 966be9e7909d616b03e644acd8a83f09bf023c5c Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 21 Jun 2016 10:27:19 +0200 Subject: 6lowpan: ndisc: add missing 802.15.4 only check This patch adds a missing check to handle short address parsing for 802.15.4 6LoWPAN only. Signed-off-by: Alexander Aring Reviewed-by: Stefan Schmidt Signed-off-by: Marcel Holtmann --- net/6lowpan/ndisc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/6lowpan/ndisc.c b/net/6lowpan/ndisc.c index 030504e031b6..79c5fa9665fc 100644 --- a/net/6lowpan/ndisc.c +++ b/net/6lowpan/ndisc.c @@ -47,6 +47,9 @@ static int lowpan_ndisc_parse_options(const struct net_device *dev, struct nd_opt_hdr *nd_opt, struct ndisc_options *ndopts) { + if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154)) + return 0; + switch (nd_opt->nd_opt_type) { case ND_OPT_SOURCE_LL_ADDR: case ND_OPT_TARGET_LL_ADDR: -- cgit v1.2.3 From 66e5c2672cd11b9310008099faf6a4ffb9dfb6d0 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sat, 18 Jun 2016 10:45:34 +0200 Subject: ieee802154: add netns support This patch adds netns support for 802.15.4 subsystem. Most parts are copy&pasted from wireless subsystem, it has the identically userspace API. Cc: Nicolas Dichtel Reviewed-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/core.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++- net/ieee802154/core.h | 2 ++ net/ieee802154/nl802154.c | 54 ++++++++++++++++++++++++++++++++---- 3 files changed, 120 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index c35fdfa6d04e..cb7176cd4cd6 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -140,6 +140,8 @@ wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size) rdev->wpan_phy.dev.class = &wpan_phy_class; rdev->wpan_phy.dev.platform_data = rdev; + wpan_phy_net_set(&rdev->wpan_phy, &init_net); + init_waitqueue_head(&rdev->dev_wait); return &rdev->wpan_phy; @@ -207,6 +209,49 @@ void wpan_phy_free(struct wpan_phy *phy) } EXPORT_SYMBOL(wpan_phy_free); +int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, + struct net *net) +{ + struct wpan_dev *wpan_dev; + int err = 0; + + list_for_each_entry(wpan_dev, &rdev->wpan_dev_list, list) { + if (!wpan_dev->netdev) + continue; + wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL; + err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d"); + if (err) + break; + wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL; + } + + if (err) { + /* failed -- clean up to old netns */ + net = wpan_phy_net(&rdev->wpan_phy); + + list_for_each_entry_continue_reverse(wpan_dev, + &rdev->wpan_dev_list, + list) { + if (!wpan_dev->netdev) + continue; + wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL; + err = dev_change_net_namespace(wpan_dev->netdev, net, + "wpan%d"); + WARN_ON(err); + wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL; + } + + return err; + } + + wpan_phy_net_set(&rdev->wpan_phy, net); + + err = device_rename(&rdev->wpan_phy.dev, dev_name(&rdev->wpan_phy.dev)); + WARN_ON(err); + + return 0; +} + void cfg802154_dev_free(struct cfg802154_registered_device *rdev) { kfree(rdev); @@ -286,14 +331,34 @@ static struct notifier_block cfg802154_netdev_notifier = { .notifier_call = cfg802154_netdev_notifier_call, }; +static void __net_exit cfg802154_pernet_exit(struct net *net) +{ + struct cfg802154_registered_device *rdev; + + rtnl_lock(); + list_for_each_entry(rdev, &cfg802154_rdev_list, list) { + if (net_eq(wpan_phy_net(&rdev->wpan_phy), net)) + WARN_ON(cfg802154_switch_netns(rdev, &init_net)); + } + rtnl_unlock(); +} + +static struct pernet_operations cfg802154_pernet_ops = { + .exit = cfg802154_pernet_exit, +}; + static int __init wpan_phy_class_init(void) { int rc; - rc = wpan_phy_sysfs_init(); + rc = register_pernet_device(&cfg802154_pernet_ops); if (rc) goto err; + rc = wpan_phy_sysfs_init(); + if (rc) + goto err_sysfs; + rc = register_netdevice_notifier(&cfg802154_netdev_notifier); if (rc) goto err_nl; @@ -315,6 +380,8 @@ err_notifier: unregister_netdevice_notifier(&cfg802154_netdev_notifier); err_nl: wpan_phy_sysfs_exit(); +err_sysfs: + unregister_pernet_device(&cfg802154_pernet_ops); err: return rc; } @@ -326,6 +393,7 @@ static void __exit wpan_phy_class_exit(void) ieee802154_nl_exit(); unregister_netdevice_notifier(&cfg802154_netdev_notifier); wpan_phy_sysfs_exit(); + unregister_pernet_device(&cfg802154_pernet_ops); } module_exit(wpan_phy_class_exit); diff --git a/net/ieee802154/core.h b/net/ieee802154/core.h index 231fade959f3..81141f58d079 100644 --- a/net/ieee802154/core.h +++ b/net/ieee802154/core.h @@ -38,6 +38,8 @@ wpan_phy_to_rdev(struct wpan_phy *wpan_phy) extern struct list_head cfg802154_rdev_list; extern int cfg802154_rdev_list_generation; +int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, + struct net *net); /* free object */ void cfg802154_dev_free(struct cfg802154_registered_device *rdev); struct cfg802154_registered_device * diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 116187b5c267..d90a4ed5b8a0 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -80,7 +80,8 @@ __cfg802154_wpan_dev_from_attrs(struct net *netns, struct nlattr **attrs) list_for_each_entry(rdev, &cfg802154_rdev_list, list) { struct wpan_dev *wpan_dev; - /* TODO netns compare */ + if (wpan_phy_net(&rdev->wpan_phy) != netns) + continue; if (have_wpan_dev_id && rdev->wpan_phy_idx != wpan_phy_idx) continue; @@ -175,7 +176,8 @@ __cfg802154_rdev_from_attrs(struct net *netns, struct nlattr **attrs) if (!rdev) return ERR_PTR(-ENODEV); - /* TODO netns compare */ + if (netns != wpan_phy_net(&rdev->wpan_phy)) + return ERR_PTR(-ENODEV); return rdev; } @@ -233,6 +235,8 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = { [NL802154_ATTR_ACKREQ_DEFAULT] = { .type = NLA_U8 }, + [NL802154_ATTR_PID] = { .type = NLA_U32 }, + [NL802154_ATTR_NETNS_FD] = { .type = NLA_U32 }, #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL [NL802154_ATTR_SEC_ENABLED] = { .type = NLA_U8, }, [NL802154_ATTR_SEC_OUT_LEVEL] = { .type = NLA_U32, }, @@ -590,7 +594,6 @@ static int nl802154_dump_wpan_phy_parse(struct sk_buff *skb, struct cfg802154_registered_device *rdev; int ifidx = nla_get_u32(tb[NL802154_ATTR_IFINDEX]); - /* TODO netns */ netdev = __dev_get_by_index(&init_net, ifidx); if (!netdev) return -ENODEV; @@ -629,7 +632,8 @@ nl802154_dump_wpan_phy(struct sk_buff *skb, struct netlink_callback *cb) } list_for_each_entry(rdev, &cfg802154_rdev_list, list) { - /* TODO net ns compare */ + if (!net_eq(wpan_phy_net(&rdev->wpan_phy), sock_net(skb->sk))) + continue; if (++idx <= state->start) continue; if (state->filter_wpan_phy != -1 && @@ -871,7 +875,8 @@ nl802154_dump_interface(struct sk_buff *skb, struct netlink_callback *cb) rtnl_lock(); list_for_each_entry(rdev, &cfg802154_rdev_list, list) { - /* TODO netns compare */ + if (!net_eq(wpan_phy_net(&rdev->wpan_phy), sock_net(skb->sk))) + continue; if (wp_idx < wp_start) { wp_idx++; continue; @@ -1271,6 +1276,37 @@ nl802154_set_ackreq_default(struct sk_buff *skb, struct genl_info *info) return rdev_set_ackreq_default(rdev, wpan_dev, ackreq); } +static int nl802154_wpan_phy_netns(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net *net; + int err; + + if (info->attrs[NL802154_ATTR_PID]) { + u32 pid = nla_get_u32(info->attrs[NL802154_ATTR_PID]); + + net = get_net_ns_by_pid(pid); + } else if (info->attrs[NL802154_ATTR_NETNS_FD]) { + u32 fd = nla_get_u32(info->attrs[NL802154_ATTR_NETNS_FD]); + + net = get_net_ns_by_fd(fd); + } else { + return -EINVAL; + } + + if (IS_ERR(net)) + return PTR_ERR(net); + + err = 0; + + /* check if anything to do */ + if (!net_eq(wpan_phy_net(&rdev->wpan_phy), net)) + err = cfg802154_switch_netns(rdev, net); + + put_net(net); + return err; +} + #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL static const struct nla_policy nl802154_dev_addr_policy[NL802154_DEV_ADDR_ATTR_MAX + 1] = { [NL802154_DEV_ADDR_ATTR_PAN_ID] = { .type = NLA_U16 }, @@ -2261,6 +2297,14 @@ static const struct genl_ops nl802154_ops[] = { .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, + { + .cmd = NL802154_CMD_SET_WPAN_PHY_NETNS, + .doit = nl802154_wpan_phy_netns, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | + NL802154_FLAG_NEED_RTNL, + }, { .cmd = NL802154_CMD_SET_PAN_ID, .doit = nl802154_set_pan_id, -- cgit v1.2.3 From 1c5bf998b3dca0599a2cce885619ffc06fc594df Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sat, 18 Jun 2016 10:45:35 +0200 Subject: ieee802154: allow netns create of lowpan interface This patch reverts commit f9d1ce8f81eb ("ieee802154: fix netns settings"). The lowpan interface need to be created inside the net namespace where the wpan interface is available. The wpan namespace can be changed only by nl802154 before. Without this patch it's not possible to create a lowpan interface for a wpan interface which isn't inside init_net namespace. Cc: Nicolas Dichtel Reviewed-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/6lowpan/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 935ab932e841..d7efbf0dad20 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -130,8 +130,7 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev, pr_debug("adding new link\n"); - if (!tb[IFLA_LINK] || - !net_eq(dev_net(ldev), &init_net)) + if (!tb[IFLA_LINK]) return -EINVAL; /* find and hold wpan device */ wdev = dev_get_by_index(dev_net(ldev), nla_get_u32(tb[IFLA_LINK])); -- cgit v1.2.3 From b5f34f9420b50c9b5876b9a2b68e96be6d629054 Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Mon, 27 Jun 2016 11:01:12 -0500 Subject: Bluetooth: Fix bt_sock_recvmsg return value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If recvmsg is called with a destination buffer that is too small to receive the contents of skb in its entirety, the return value from recvmsg was inconsistent with common SOCK_SEQPACKET or SOCK_DGRAM semantics. If destination buffer provided by userspace is too small (e.g. len < copied), then MSG_TRUNC flag is set and copied is returned. Instead, it should return the length of the message, which is consistent with how other datagram based sockets act. Quoting 'man recv': "All three calls return the length of the message on successful comple‐ tion. If a message is too long to fit in the supplied buffer, excess bytes may be discarded depending on the type of socket the message is received from." and "MSG_TRUNC (since Linux 2.2) For raw (AF_PACKET), Internet datagram (since Linux 2.4.27/2.6.8), netlink (since Linux 2.6.22), and UNIX datagram (since Linux 3.4) sockets: return the real length of the packet or datagram, even when it was longer than the passed buffer." Signed-off-by: Denis Kenzior Signed-off-by: Marcel Holtmann --- net/bluetooth/af_bluetooth.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 3df7aefb7663..ece45e0683fd 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -215,6 +215,7 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, struct sock *sk = sock->sk; struct sk_buff *skb; size_t copied; + size_t skblen; int err; BT_DBG("sock %p sk %p len %zu", sock, sk, len); @@ -230,6 +231,7 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, return err; } + skblen = skb->len; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; @@ -248,6 +250,9 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, skb_free_datagram(sk, skb); + if (msg->msg_flags & MSG_TRUNC) + copied = skblen; + return err ? : copied; } EXPORT_SYMBOL(bt_sock_recvmsg); -- cgit v1.2.3 From 83871f8ccdfa8f3edab15d432aa4f3eb82953343 Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Mon, 27 Jun 2016 11:01:13 -0500 Subject: Bluetooth: Fix hci_sock_recvmsg return value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If recvmsg is called with a destination buffer that is too small to receive the contents of skb in its entirety, the return value from recvmsg was inconsistent with common SOCK_SEQPACKET or SOCK_DGRAM semantics. If destination buffer provided by userspace is too small (e.g. len < copied), then MSG_TRUNC flag is set and copied is returned. Instead, it should return the length of the message, which is consistent with how other datagram based sockets act. Quoting 'man recv': "All three calls return the length of the message on successful comple‐ tion. If a message is too long to fit in the supplied buffer, excess bytes may be discarded depending on the type of socket the message is received from." and "MSG_TRUNC (since Linux 2.2) For raw (AF_PACKET), Internet datagram (since Linux 2.4.27/2.6.8), netlink (since Linux 2.6.22), and UNIX datagram (since Linux 3.4) sockets: return the real length of the packet or datagram, even when it was longer than the passed buffer." Signed-off-by: Denis Kenzior Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_sock.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 1298d723c0e0..12e9294b02e0 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1048,6 +1048,7 @@ static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg, struct sock *sk = sock->sk; struct sk_buff *skb; int copied, err; + unsigned int skblen; BT_DBG("sock %p, sk %p", sock, sk); @@ -1064,6 +1065,7 @@ static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg, if (!skb) return err; + skblen = skb->len; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; @@ -1089,6 +1091,9 @@ static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg, skb_free_datagram(sk, skb); + if (msg->msg_flags & MSG_TRUNC) + copied = skblen; + return err ? : copied; } -- cgit v1.2.3 From 0ea0b9af9b7599ada307258dc841f4300873e8a1 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 6 Jul 2016 23:32:26 +0200 Subject: ieee802154: 6lowpan: fix intra pan id check The RIOT-OS stack does send intra-pan frames but don't set the intra pan flag inside the mac header. It seems this is valid frame addressing but inefficient. Anyway this patch adds a new function for intra pan addressing, doesn't matter if intra pan flag or source and destination are the same. The newly introduction function will be used to check on intra pan addressing for 6lowpan. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/6lowpan/rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ieee802154/6lowpan/rx.c b/net/ieee802154/6lowpan/rx.c index ef185dd4110d..649e7d45e88f 100644 --- a/net/ieee802154/6lowpan/rx.c +++ b/net/ieee802154/6lowpan/rx.c @@ -262,7 +262,7 @@ static inline bool lowpan_rx_h_check(struct sk_buff *skb) /* check on ieee802154 conform 6LoWPAN header */ if (!ieee802154_is_data(fc) || - !ieee802154_is_intra_pan(fc)) + !ieee802154_skb_is_intra_pan_addressing(fc, skb)) return false; /* check if we can dereference the dispatch */ -- cgit v1.2.3 From 9e262f5037b95a9ccc508debec2715e70559cc81 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 6 Jul 2016 23:32:28 +0200 Subject: 6lowpan: ndisc: set invalid unicast short addr to unspec When receiving neighbour information with short address option field we should check the complete range of invalid short addresses and set it to one invalid address setting which is the unspecified address. This address is also used when by creating at first a new neighbour entry to indicate no short address is set. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/6lowpan/ndisc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/6lowpan/ndisc.c b/net/6lowpan/ndisc.c index 79c5fa9665fc..86450b7e2899 100644 --- a/net/6lowpan/ndisc.c +++ b/net/6lowpan/ndisc.c @@ -97,10 +97,13 @@ static void lowpan_ndisc_802154_update(struct neighbour *n, u32 flags, } write_lock_bh(&n->lock); - if (lladdr_short) + if (lladdr_short) { ieee802154_be16_to_le16(&neigh->short_addr, lladdr_short); - else + if (!lowpan_802154_is_valid_src_short_addr(neigh->short_addr)) + neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); + } else { neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); + } write_unlock_bh(&n->lock); } -- cgit v1.2.3 From bba7eb5d9b4ebccacd30331121ee29764212a29d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 Jul 2016 16:22:20 +0200 Subject: hfsc: reduce hfsc_sched to 14 cachelines hfsc_sched is huge (size: 920, cachelines: 15), but we can get it to 14 cachelines by placing level after filter_cnt (covering 4 byte hole) and reducing period/nactive/flags to u32 (period is just a counter, incremented when class becomes active -- 2**32 is plenty for this purpose, also, long is only 32bit wide on 32bit platforms anyway). cl_vtperiod is exported to userspace via tc_hfsc_stats, but its period member is already u32, so no precision is lost there either. Cc: Michal Soltys Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/sched/sch_hfsc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index dff92ea772fe..3ddc7bd74ecb 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -115,9 +115,9 @@ struct hfsc_class { struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; struct gnet_stats_rate_est64 rate_est; - unsigned int level; /* class level in hierarchy */ struct tcf_proto __rcu *filter_list; /* filter list */ unsigned int filter_cnt; /* filter count */ + unsigned int level; /* class level in hierarchy */ struct hfsc_sched *sched; /* scheduler data */ struct hfsc_class *cl_parent; /* parent class */ @@ -165,10 +165,10 @@ struct hfsc_class { struct runtime_sc cl_virtual; /* virtual curve */ struct runtime_sc cl_ulimit; /* upperlimit curve */ - unsigned long cl_flags; /* which curves are valid */ - unsigned long cl_vtperiod; /* vt period sequence number */ - unsigned long cl_parentperiod;/* parent's vt period sequence number*/ - unsigned long cl_nactive; /* number of active children */ + u8 cl_flags; /* which curves are valid */ + u32 cl_vtperiod; /* vt period sequence number */ + u32 cl_parentperiod;/* parent's vt period sequence number*/ + u32 cl_nactive; /* number of active children */ }; struct hfsc_sched { -- cgit v1.2.3 From d390238c4fba7c87a3bcd859ce3373c864eb7b02 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Wed, 6 Jul 2016 20:03:54 -0400 Subject: net: dsa: initialize the routing table The routing table of every switch in a tree is currently initialized to all zeros. This is an issue since 0 is a valid port number. Add a DSA_RTABLE_NONE=-1 constant to initialize the signed values of the routing table pointing to other switches. This fixes the device mapping of the mv88e6xxx driver where the port pointing to the switch itself and to non-existent switches was wrongly configured to be 0. It is now set to the expected 0xf value. Signed-off-by: Vivien Didelot Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/dsa.c | 6 ++++++ net/dsa/dsa2.c | 7 ++++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 766d2a525ada..7e68bc6bc853 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -774,11 +774,17 @@ static int dsa_of_probe(struct device *dev) chip_index = -1; for_each_available_child_of_node(np, child) { + int i; + chip_index++; cd = &pd->chip[chip_index]; cd->of_node = child; + /* Initialize the routing table */ + for (i = 0; i < DSA_MAX_SWITCHES; ++i) + cd->rtable[i] = DSA_RTABLE_NONE; + /* When assigning the host device, increment its refcount */ cd->host_dev = get_device(&mdio_bus->dev); diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 83b95fc4cede..78e4c0131c30 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -595,7 +595,7 @@ static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np) struct device_node *ports = dsa_get_ports(ds, np); struct dsa_switch_tree *dst; u32 tree, index; - int err; + int i, err; err = dsa_parse_member(np, &tree, &index); if (err) @@ -622,6 +622,11 @@ static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np) ds->dst = dst; ds->index = index; + + /* Initialize the routing table */ + for (i = 0; i < DSA_MAX_SWITCHES; ++i) + ds->rtable[i] = DSA_RTABLE_NONE; + dsa_dst_add_ds(dst, ds, index); err = dsa_dst_complete(dst); -- cgit v1.2.3 From f1533cce60d1f84378c1dd925f9ef1038fa93507 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 7 Jul 2016 09:39:29 -0300 Subject: sctp: fix panic when sending auth chunks When we introduced GSO support, if using auth the auth chunk was being left queued on the packet even after the final segment was generated. Later on sctp_transmit_packet it calls sctp_packet_reset, which zeroed the packet len while not accounting for this left-over. This caused more space to be used the next packet due to the chunk still being queued, but space which wasn't allocated as its size wasn't accounted. The fix is to only queue it back when we know that we are going to generate another segment. Fixes: 90017accff61 ("sctp: Add GSO support") Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/output.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sctp/output.c b/net/sctp/output.c index 1541a91d6d9d..2e9223bb1b3a 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -582,9 +582,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) */ pkt_size -= WORD_ROUND(chunk->skb->len); - if (chunk == packet->auth && !list_empty(&packet->chunk_list)) - list_add(&chunk->list, &packet->chunk_list); - else if (!sctp_chunk_is_data(chunk)) + if (!sctp_chunk_is_data(chunk) && chunk != packet->auth) sctp_chunk_free(chunk); if (!pkt_size) @@ -605,6 +603,18 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) (struct sctp_auth_chunk *)auth, gfp); + if (packet->auth) { + if (!list_empty(&packet->chunk_list)) { + /* We will generate more packets, so re-queue + * auth chunk. + */ + list_add(&chunk->list, &packet->chunk_list); + } else { + sctp_chunk_free(packet->auth); + packet->auth = NULL; + } + } + if (!gso) break; @@ -735,6 +745,8 @@ err: } goto out; nomem: + if (packet->auth && list_empty(&packet->auth->list)) + sctp_chunk_free(packet->auth); err = -ENOMEM; goto err; } -- cgit v1.2.3 From 2a0be139868cae2465f9ed5b599203fa4f8e06ca Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 5 Jul 2016 14:30:12 +0200 Subject: Bluetooth: Remove connection link attributes The connection link attributes are not used and expose no valuable information. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sysfs.c | 45 --------------------------------------------- 1 file changed, 45 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index 555982a78a58..6bfdca8541d6 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -7,50 +7,6 @@ static struct class *bt_class; -static inline char *link_typetostr(int type) -{ - switch (type) { - case ACL_LINK: - return "ACL"; - case SCO_LINK: - return "SCO"; - case ESCO_LINK: - return "eSCO"; - case LE_LINK: - return "LE"; - default: - return "UNKNOWN"; - } -} - -static ssize_t show_link_type(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hci_conn *conn = to_hci_conn(dev); - return sprintf(buf, "%s\n", link_typetostr(conn->type)); -} - -static ssize_t show_link_address(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hci_conn *conn = to_hci_conn(dev); - return sprintf(buf, "%pMR\n", &conn->dst); -} - -#define LINK_ATTR(_name, _mode, _show, _store) \ -struct device_attribute link_attr_##_name = __ATTR(_name, _mode, _show, _store) - -static LINK_ATTR(type, S_IRUGO, show_link_type, NULL); -static LINK_ATTR(address, S_IRUGO, show_link_address, NULL); - -static struct attribute *bt_link_attrs[] = { - &link_attr_type.attr, - &link_attr_address.attr, - NULL -}; - -ATTRIBUTE_GROUPS(bt_link); - static void bt_link_release(struct device *dev) { struct hci_conn *conn = to_hci_conn(dev); @@ -59,7 +15,6 @@ static void bt_link_release(struct device *dev) static struct device_type bt_link = { .name = "link", - .groups = bt_link_groups, .release = bt_link_release, }; -- cgit v1.2.3 From e14dbe72033152135eb3bae212228728089d4dd9 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 5 Jul 2016 14:30:13 +0200 Subject: Bluetooth: Remove controller device attributes The controller device attributes are not used and expose no valuable information. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sysfs.c | 54 ----------------------------------------------- 1 file changed, 54 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index 6bfdca8541d6..ca7a35ebaefb 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -79,59 +79,6 @@ void hci_conn_del_sysfs(struct hci_conn *conn) hci_dev_put(hdev); } -static inline char *host_typetostr(int type) -{ - switch (type) { - case HCI_BREDR: - return "BR/EDR"; - case HCI_AMP: - return "AMP"; - default: - return "UNKNOWN"; - } -} - -static ssize_t show_type(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hci_dev *hdev = to_hci_dev(dev); - return sprintf(buf, "%s\n", host_typetostr(hdev->dev_type)); -} - -static ssize_t show_name(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hci_dev *hdev = to_hci_dev(dev); - char name[HCI_MAX_NAME_LENGTH + 1]; - int i; - - for (i = 0; i < HCI_MAX_NAME_LENGTH; i++) - name[i] = hdev->dev_name[i]; - - name[HCI_MAX_NAME_LENGTH] = '\0'; - return sprintf(buf, "%s\n", name); -} - -static ssize_t show_address(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hci_dev *hdev = to_hci_dev(dev); - return sprintf(buf, "%pMR\n", &hdev->bdaddr); -} - -static DEVICE_ATTR(type, S_IRUGO, show_type, NULL); -static DEVICE_ATTR(name, S_IRUGO, show_name, NULL); -static DEVICE_ATTR(address, S_IRUGO, show_address, NULL); - -static struct attribute *bt_host_attrs[] = { - &dev_attr_type.attr, - &dev_attr_name.attr, - &dev_attr_address.attr, - NULL -}; - -ATTRIBUTE_GROUPS(bt_host); - static void bt_host_release(struct device *dev) { struct hci_dev *hdev = to_hci_dev(dev); @@ -141,7 +88,6 @@ static void bt_host_release(struct device *dev) static struct device_type bt_host = { .name = "host", - .groups = bt_host_groups, .release = bt_host_release, }; -- cgit v1.2.3 From ca8bee5dde1f02c2dbe8c8453dce27f2dfafb21c Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 5 Jul 2016 14:30:14 +0200 Subject: Bluetooth: Rename HCI_BREDR into HCI_PRIMARY The HCI_BREDR naming is confusing since it actually stands for Primary Bluetooth Controller. Which is a term that has been used in the latest standard. However from a legacy point of view there only really have been Basic Rate (BR) and Enhanced Data Rate (EDR). Recent versions of Bluetooth introduced Low Energy (LE) and made this terminology a little bit confused since Dual Mode Controllers include BR/EDR and LE. To simplify this the name HCI_PRIMARY stands for the Primary Controller which can be a single mode or dual mode controller. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_conn.c | 2 +- net/bluetooth/hci_core.c | 28 +++++++++++++--------------- net/bluetooth/hci_event.c | 2 +- net/bluetooth/hci_sock.c | 2 +- net/bluetooth/l2cap_core.c | 2 +- net/bluetooth/mgmt.c | 16 ++++++++-------- 6 files changed, 25 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index bf9f8a801a2e..3809617aa98d 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -625,7 +625,7 @@ struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src) list_for_each_entry(d, &hci_dev_list, list) { if (!test_bit(HCI_UP, &d->flags) || hci_dev_test_flag(d, HCI_USER_CHANNEL) || - d->dev_type != HCI_BREDR) + d->dev_type != HCI_PRIMARY) continue; /* Simple routing: diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 45a9fc68c677..98f6c3770736 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -260,14 +260,12 @@ static int hci_init1_req(struct hci_request *req, unsigned long opt) hci_reset_req(req, 0); switch (hdev->dev_type) { - case HCI_BREDR: + case HCI_PRIMARY: bredr_init(req); break; - case HCI_AMP: amp_init1(req); break; - default: BT_ERR("Unknown device type %d", hdev->dev_type); break; @@ -791,11 +789,11 @@ static int __hci_init(struct hci_dev *hdev) if (err < 0) return err; - /* HCI_BREDR covers both single-mode LE, BR/EDR and dual-mode + /* HCI_PRIMARY covers both single-mode LE, BR/EDR and dual-mode * BR/EDR/LE type controllers. AMP controllers only need the * first two stages of init. */ - if (hdev->dev_type != HCI_BREDR) + if (hdev->dev_type != HCI_PRIMARY) return 0; err = __hci_req_sync(hdev, hci_init3_req, 0, HCI_INIT_TIMEOUT, NULL); @@ -1202,7 +1200,7 @@ int hci_inquiry(void __user *arg) goto done; } - if (hdev->dev_type != HCI_BREDR) { + if (hdev->dev_type != HCI_PRIMARY) { err = -EOPNOTSUPP; goto done; } @@ -1307,7 +1305,7 @@ static int hci_dev_do_open(struct hci_dev *hdev) * since AMP controllers do not have an address. */ if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && - hdev->dev_type == HCI_BREDR && + hdev->dev_type == HCI_PRIMARY && !bacmp(&hdev->bdaddr, BDADDR_ANY) && !bacmp(&hdev->static_addr, BDADDR_ANY)) { ret = -EADDRNOTAVAIL; @@ -1402,7 +1400,7 @@ static int hci_dev_do_open(struct hci_dev *hdev) !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && hci_dev_test_flag(hdev, HCI_MGMT) && - hdev->dev_type == HCI_BREDR) { + hdev->dev_type == HCI_PRIMARY) { ret = __hci_req_hci_power_on(hdev); mgmt_power_on(hdev, ret); } @@ -1563,7 +1561,7 @@ int hci_dev_do_close(struct hci_dev *hdev) auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF); - if (!auto_off && hdev->dev_type == HCI_BREDR && + if (!auto_off && hdev->dev_type == HCI_PRIMARY && hci_dev_test_flag(hdev, HCI_MGMT)) __mgmt_power_off(hdev); @@ -1802,7 +1800,7 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg) goto done; } - if (hdev->dev_type != HCI_BREDR) { + if (hdev->dev_type != HCI_PRIMARY) { err = -EOPNOTSUPP; goto done; } @@ -2043,7 +2041,7 @@ static void hci_power_on(struct work_struct *work) */ if (hci_dev_test_flag(hdev, HCI_RFKILLED) || hci_dev_test_flag(hdev, HCI_UNCONFIGURED) || - (hdev->dev_type == HCI_BREDR && + (hdev->dev_type == HCI_PRIMARY && !bacmp(&hdev->bdaddr, BDADDR_ANY) && !bacmp(&hdev->static_addr, BDADDR_ANY))) { hci_dev_clear_flag(hdev, HCI_AUTO_OFF); @@ -3030,7 +3028,7 @@ int hci_register_dev(struct hci_dev *hdev) * so the index can be used as the AMP controller ID. */ switch (hdev->dev_type) { - case HCI_BREDR: + case HCI_PRIMARY: id = ida_simple_get(&hci_index_ida, 0, 0, GFP_KERNEL); break; case HCI_AMP: @@ -3090,7 +3088,7 @@ int hci_register_dev(struct hci_dev *hdev) hci_dev_set_flag(hdev, HCI_SETUP); hci_dev_set_flag(hdev, HCI_AUTO_OFF); - if (hdev->dev_type == HCI_BREDR) { + if (hdev->dev_type == HCI_PRIMARY) { /* Assume BR/EDR support until proven otherwise (such as * through reading supported features during init. */ @@ -3415,7 +3413,7 @@ static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue, hci_skb_pkt_type(skb) = HCI_ACLDATA_PKT; switch (hdev->dev_type) { - case HCI_BREDR: + case HCI_PRIMARY: hci_add_acl_hdr(skb, conn->handle, flags); break; case HCI_AMP: @@ -3826,7 +3824,7 @@ static void hci_sched_acl(struct hci_dev *hdev) BT_DBG("%s", hdev->name); /* No ACL link over BR/EDR controller */ - if (!hci_conn_num(hdev, ACL_LINK) && hdev->dev_type == HCI_BREDR) + if (!hci_conn_num(hdev, ACL_LINK) && hdev->dev_type == HCI_PRIMARY) return; /* No AMP link over AMP controller */ diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index d4b3dd5413be..3fb95c47243c 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3249,7 +3249,7 @@ static struct hci_conn *__hci_conn_lookup_handle(struct hci_dev *hdev, struct hci_chan *chan; switch (hdev->dev_type) { - case HCI_BREDR: + case HCI_PRIMARY: return hci_conn_hash_lookup_handle(hdev, handle); case HCI_AMP: chan = hci_chan_lookup_handle(hdev, handle); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 12e9294b02e0..6ef8a01a9ad4 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -676,7 +676,7 @@ static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) return -EOPNOTSUPP; - if (hdev->dev_type != HCI_BREDR) + if (hdev->dev_type != HCI_PRIMARY) return -EOPNOTSUPP; switch (cmd) { diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index eb4f5f24cbe3..54ceb1f2cc9a 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -7468,7 +7468,7 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) int len; /* For AMP controller do not create l2cap conn */ - if (!conn && hcon->hdev->dev_type != HCI_BREDR) + if (!conn && hcon->hdev->dev_type != HCI_PRIMARY) goto drop; if (!conn) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 9e4b931588cf..7983ec8d4c60 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -359,7 +359,7 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data, count = 0; list_for_each_entry(d, &hci_dev_list, list) { - if (d->dev_type == HCI_BREDR && + if (d->dev_type == HCI_PRIMARY && !hci_dev_test_flag(d, HCI_UNCONFIGURED)) count++; } @@ -384,7 +384,7 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data, if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks)) continue; - if (d->dev_type == HCI_BREDR && + if (d->dev_type == HCI_PRIMARY && !hci_dev_test_flag(d, HCI_UNCONFIGURED)) { rp->index[count++] = cpu_to_le16(d->id); BT_DBG("Added hci%u", d->id); @@ -419,7 +419,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev, count = 0; list_for_each_entry(d, &hci_dev_list, list) { - if (d->dev_type == HCI_BREDR && + if (d->dev_type == HCI_PRIMARY && hci_dev_test_flag(d, HCI_UNCONFIGURED)) count++; } @@ -444,7 +444,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev, if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks)) continue; - if (d->dev_type == HCI_BREDR && + if (d->dev_type == HCI_PRIMARY && hci_dev_test_flag(d, HCI_UNCONFIGURED)) { rp->index[count++] = cpu_to_le16(d->id); BT_DBG("Added hci%u", d->id); @@ -479,7 +479,7 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev, count = 0; list_for_each_entry(d, &hci_dev_list, list) { - if (d->dev_type == HCI_BREDR || d->dev_type == HCI_AMP) + if (d->dev_type == HCI_PRIMARY || d->dev_type == HCI_AMP) count++; } @@ -503,7 +503,7 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev, if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks)) continue; - if (d->dev_type == HCI_BREDR) { + if (d->dev_type == HCI_PRIMARY) { if (hci_dev_test_flag(d, HCI_UNCONFIGURED)) rp->entry[count].type = 0x01; else @@ -6366,7 +6366,7 @@ void mgmt_index_added(struct hci_dev *hdev) return; switch (hdev->dev_type) { - case HCI_BREDR: + case HCI_PRIMARY: if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { mgmt_index_event(MGMT_EV_UNCONF_INDEX_ADDED, hdev, NULL, 0, HCI_MGMT_UNCONF_INDEX_EVENTS); @@ -6399,7 +6399,7 @@ void mgmt_index_removed(struct hci_dev *hdev) return; switch (hdev->dev_type) { - case HCI_BREDR: + case HCI_PRIMARY: mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status); if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { -- cgit v1.2.3 From a65056ecf4b48be0d0284a7b6a57b6dace10b843 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 6 Jul 2016 12:12:21 -0700 Subject: net: bridge: extend MLD/IGMP query stats MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As was suggested this patch adds support for the different versions of MLD and IGMP query types. Since the user visible structure is still in net-next we can augment it instead of adding netlink attributes. The distinction between the different IGMP/MLD query types is done as suggested in Section 7.1, RFC 3376 [1] and Section 8.1, RFC 3810 [2] based on query payload size and code for IGMP. Since all IGMP packets go through multicast_rcv() and it uses ip_mc_check_igmp/ipv6_mc_check_mld we can be sure that at least the ip/ipv6 header can be directly used. [1] https://tools.ietf.org/html/rfc3376#section-7 [2] https://tools.ietf.org/html/rfc3810#section-8.1 Suggested-by: Linus Lüssing Signed-off-by: Nikolay Aleksandrov Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_forward.c | 7 ++----- net/bridge/br_input.c | 2 +- net/bridge/br_multicast.c | 48 ++++++++++++++++++++++++++++++++++++----------- net/bridge/br_private.h | 5 +++-- 4 files changed, 43 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 6c196037d818..d610644368b9 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -199,7 +199,6 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb, bool unicast) { u8 igmp_type = br_multicast_igmp_type(skb); - __be16 proto = skb->protocol; struct net_bridge_port *prev; struct net_bridge_port *p; @@ -221,7 +220,7 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb, if (IS_ERR(prev)) goto out; if (prev == p) - br_multicast_count(p->br, p, proto, igmp_type, + br_multicast_count(p->br, p, skb, igmp_type, BR_MCAST_DIR_TX); } @@ -266,8 +265,6 @@ static void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *prev = NULL; struct net_bridge_port_group *p; - __be16 proto = skb->protocol; - struct hlist_node *rp; rp = rcu_dereference(hlist_first_rcu(&br->router_list)); @@ -286,7 +283,7 @@ static void br_multicast_flood(struct net_bridge_mdb_entry *mdst, if (IS_ERR(prev)) goto out; if (prev == port) - br_multicast_count(port->br, port, proto, igmp_type, + br_multicast_count(port->br, port, skb, igmp_type, BR_MCAST_DIR_TX); if ((unsigned long)lport >= (unsigned long)port) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 786602bc0567..a7817e6f306f 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -61,7 +61,7 @@ static int br_pass_frame_up(struct sk_buff *skb) if (!skb) return NET_RX_DROP; /* update the multicast stats if the packet is IGMP/MLD */ - br_multicast_count(br, NULL, skb->protocol, br_multicast_igmp_type(skb), + br_multicast_count(br, NULL, skb, br_multicast_igmp_type(skb), BR_MCAST_DIR_TX); return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index e405eef0ae2e..a5423a1eec05 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -843,14 +843,14 @@ static void __br_multicast_send_query(struct net_bridge *br, if (port) { skb->dev = port->dev; - br_multicast_count(br, port, skb->protocol, igmp_type, + br_multicast_count(br, port, skb, igmp_type, BR_MCAST_DIR_TX); NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, dev_net(port->dev), NULL, skb, NULL, skb->dev, br_dev_queue_push_xmit); } else { br_multicast_select_own_querier(br, ip, skb); - br_multicast_count(br, port, skb->protocol, igmp_type, + br_multicast_count(br, port, skb, igmp_type, BR_MCAST_DIR_RX); netif_rx(skb); } @@ -1676,7 +1676,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, if (skb_trimmed && skb_trimmed != skb) kfree_skb(skb_trimmed); - br_multicast_count(br, port, skb->protocol, BR_INPUT_SKB_CB(skb)->igmp, + br_multicast_count(br, port, skb, BR_INPUT_SKB_CB(skb)->igmp, BR_MCAST_DIR_RX); return err; @@ -1725,7 +1725,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, if (skb_trimmed && skb_trimmed != skb) kfree_skb(skb_trimmed); - br_multicast_count(br, port, skb->protocol, BR_INPUT_SKB_CB(skb)->igmp, + br_multicast_count(br, port, skb, BR_INPUT_SKB_CB(skb)->igmp, BR_MCAST_DIR_RX); return err; @@ -2251,13 +2251,16 @@ unlock: EXPORT_SYMBOL_GPL(br_multicast_has_querier_adjacent); static void br_mcast_stats_add(struct bridge_mcast_stats __percpu *stats, - __be16 proto, u8 type, u8 dir) + const struct sk_buff *skb, u8 type, u8 dir) { struct bridge_mcast_stats *pstats = this_cpu_ptr(stats); + __be16 proto = skb->protocol; + unsigned int t_len; u64_stats_update_begin(&pstats->syncp); switch (proto) { case htons(ETH_P_IP): + t_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb); switch (type) { case IGMP_HOST_MEMBERSHIP_REPORT: pstats->mstats.igmp_v1reports[dir]++; @@ -2269,7 +2272,21 @@ static void br_mcast_stats_add(struct bridge_mcast_stats __percpu *stats, pstats->mstats.igmp_v3reports[dir]++; break; case IGMP_HOST_MEMBERSHIP_QUERY: - pstats->mstats.igmp_queries[dir]++; + if (t_len != sizeof(struct igmphdr)) { + pstats->mstats.igmp_v3queries[dir]++; + } else { + unsigned int offset = skb_transport_offset(skb); + struct igmphdr *ih, _ihdr; + + ih = skb_header_pointer(skb, offset, + sizeof(_ihdr), &_ihdr); + if (!ih) + break; + if (!ih->code) + pstats->mstats.igmp_v1queries[dir]++; + else + pstats->mstats.igmp_v2queries[dir]++; + } break; case IGMP_HOST_LEAVE_MESSAGE: pstats->mstats.igmp_leaves[dir]++; @@ -2278,6 +2295,9 @@ static void br_mcast_stats_add(struct bridge_mcast_stats __percpu *stats, break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): + t_len = ntohs(ipv6_hdr(skb)->payload_len) + + sizeof(struct ipv6hdr); + t_len -= skb_network_header_len(skb); switch (type) { case ICMPV6_MGM_REPORT: pstats->mstats.mld_v1reports[dir]++; @@ -2286,7 +2306,10 @@ static void br_mcast_stats_add(struct bridge_mcast_stats __percpu *stats, pstats->mstats.mld_v2reports[dir]++; break; case ICMPV6_MGM_QUERY: - pstats->mstats.mld_queries[dir]++; + if (t_len != sizeof(struct mld_msg)) + pstats->mstats.mld_v2queries[dir]++; + else + pstats->mstats.mld_v1queries[dir]++; break; case ICMPV6_MGM_REDUCTION: pstats->mstats.mld_leaves[dir]++; @@ -2299,7 +2322,7 @@ static void br_mcast_stats_add(struct bridge_mcast_stats __percpu *stats, } void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, - __be16 proto, u8 type, u8 dir) + const struct sk_buff *skb, u8 type, u8 dir) { struct bridge_mcast_stats __percpu *stats; @@ -2314,7 +2337,7 @@ void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, if (WARN_ON(!stats)) return; - br_mcast_stats_add(stats, proto, type, dir); + br_mcast_stats_add(stats, skb, type, dir); } int br_multicast_init_stats(struct net_bridge *br) @@ -2359,14 +2382,17 @@ void br_multicast_get_stats(const struct net_bridge *br, memcpy(&temp, &cpu_stats->mstats, sizeof(temp)); } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); - mcast_stats_add_dir(tdst.igmp_queries, temp.igmp_queries); + mcast_stats_add_dir(tdst.igmp_v1queries, temp.igmp_v1queries); + mcast_stats_add_dir(tdst.igmp_v2queries, temp.igmp_v2queries); + mcast_stats_add_dir(tdst.igmp_v3queries, temp.igmp_v3queries); mcast_stats_add_dir(tdst.igmp_leaves, temp.igmp_leaves); mcast_stats_add_dir(tdst.igmp_v1reports, temp.igmp_v1reports); mcast_stats_add_dir(tdst.igmp_v2reports, temp.igmp_v2reports); mcast_stats_add_dir(tdst.igmp_v3reports, temp.igmp_v3reports); tdst.igmp_parse_errors += temp.igmp_parse_errors; - mcast_stats_add_dir(tdst.mld_queries, temp.mld_queries); + mcast_stats_add_dir(tdst.mld_v1queries, temp.mld_v1queries); + mcast_stats_add_dir(tdst.mld_v2queries, temp.mld_v2queries); mcast_stats_add_dir(tdst.mld_leaves, temp.mld_leaves); mcast_stats_add_dir(tdst.mld_v1reports, temp.mld_v1reports); mcast_stats_add_dir(tdst.mld_v2reports, temp.mld_v2reports); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 4dc851166ad1..40f200947ddc 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -586,7 +586,7 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port, int type); void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, - __be16 proto, u8 type, u8 dir); + const struct sk_buff *skb, u8 type, u8 dir); int br_multicast_init_stats(struct net_bridge *br); void br_multicast_get_stats(const struct net_bridge *br, const struct net_bridge_port *p, @@ -719,7 +719,8 @@ static inline void br_mdb_uninit(void) static inline void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, - __be16 proto, u8 type, u8 dir) + const struct sk_buff *skb, + u8 type, u8 dir) { } -- cgit v1.2.3 From 8afe97e5d4165c9d181d42504af3f96c8427659a Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 7 Jul 2016 07:56:12 +0200 Subject: tunnels: support MPLS over IPv4 tunnels Extend tunnel support to MPLS over IPv4. The implementation extends the existing differentiation between IPIP and IPv6 over IPv4 to also cover MPLS over IPv4. Signed-off-by: Simon Horman Reviewed-by: Dinan Gunawardena Signed-off-by: David S. Miller --- net/ipv4/tunnel4.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 67 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index 0d0171830620..45cd4253583a 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -16,11 +17,14 @@ static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly; static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly; +static struct xfrm_tunnel __rcu *tunnelmpls4_handlers __read_mostly; static DEFINE_MUTEX(tunnel4_mutex); static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family) { - return (family == AF_INET) ? &tunnel4_handlers : &tunnel64_handlers; + return (family == AF_INET) ? &tunnel4_handlers : + (family == AF_INET6) ? &tunnel64_handlers : + &tunnelmpls4_handlers; } int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family) @@ -125,6 +129,26 @@ drop: } #endif +#if IS_ENABLED(CONFIG_MPLS) +static int tunnelmpls4_rcv(struct sk_buff *skb) +{ + struct xfrm_tunnel *handler; + + if (!pskb_may_pull(skb, sizeof(struct mpls_label))) + goto drop; + + for_each_tunnel_rcu(tunnelmpls4_handlers, handler) + if (!handler->handler(skb)) + return 0; + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + +drop: + kfree_skb(skb); + return 0; +} +#endif + static void tunnel4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; @@ -145,6 +169,17 @@ static void tunnel64_err(struct sk_buff *skb, u32 info) } #endif +#if IS_ENABLED(CONFIG_MPLS) +static void tunnelmpls4_err(struct sk_buff *skb, u32 info) +{ + struct xfrm_tunnel *handler; + + for_each_tunnel_rcu(tunnelmpls4_handlers, handler) + if (!handler->err_handler(skb, info)) + break; +} +#endif + static const struct net_protocol tunnel4_protocol = { .handler = tunnel4_rcv, .err_handler = tunnel4_err, @@ -161,24 +196,46 @@ static const struct net_protocol tunnel64_protocol = { }; #endif +#if IS_ENABLED(CONFIG_MPLS) +static const struct net_protocol tunnelmpls4_protocol = { + .handler = tunnelmpls4_rcv, + .err_handler = tunnelmpls4_err, + .no_policy = 1, + .netns_ok = 1, +}; +#endif + static int __init tunnel4_init(void) { - if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) { - pr_err("%s: can't add protocol\n", __func__); - return -EAGAIN; - } + if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) + goto err_ipip; #if IS_ENABLED(CONFIG_IPV6) - if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) { - pr_err("tunnel64 init: can't add protocol\n"); - inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); - return -EAGAIN; - } + if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) + goto err_ipv6; +#endif +#if IS_ENABLED(CONFIG_MPLS) + if (inet_add_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS)) + goto err_mpls; #endif return 0; + +#if IS_ENABLED(CONFIG_IPV6) +err_mpls: + inet_del_protocol(&tunnel4_protocol, IPPROTO_IPV6); +#endif +err_ipv6: + inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); +err_ipip: + pr_err("%s: can't add protocol\n", __func__); + return -EAGAIN; } static void __exit tunnel4_fini(void) { +#if IS_ENABLED(CONFIG_MPLS) + if (inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS)) + pr_err("tunnelmpls4 close: can't remove protocol\n"); +#endif #if IS_ENABLED(CONFIG_IPV6) if (inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6)) pr_err("tunnel64 close: can't remove protocol\n"); -- cgit v1.2.3 From 49dbe7ae2168b3a933ecea1118fc0515c186bd64 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 7 Jul 2016 07:56:13 +0200 Subject: sit: support MPLS over IPv4 Extend the SIT driver to support MPLS over IPv4. This implementation extends existing support for IPv6 over IPv4 and IPv4 over IPv4. Signed-off-by: Simon Horman Reviewed-by: Dinan Gunawardena Signed-off-by: David S. Miller --- net/ipv6/sit.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 77 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 917a5cd4b8fc..182b6a9be29d 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -688,12 +688,19 @@ out: return 0; } -static const struct tnl_ptk_info tpi = { +static const struct tnl_ptk_info ipip_tpi = { /* no tunnel info required for ipip. */ .proto = htons(ETH_P_IP), }; -static int ipip_rcv(struct sk_buff *skb) +#if IS_ENABLED(CONFIG_MPLS) +static const struct tnl_ptk_info mplsip_tpi = { + /* no tunnel info required for mplsip. */ + .proto = htons(ETH_P_MPLS_UC), +}; +#endif + +static int sit_tunnel_rcv(struct sk_buff *skb, u8 ipproto) { const struct iphdr *iph; struct ip_tunnel *tunnel; @@ -702,15 +709,23 @@ static int ipip_rcv(struct sk_buff *skb) tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, iph->saddr, iph->daddr); if (tunnel) { - if (tunnel->parms.iph.protocol != IPPROTO_IPIP && + const struct tnl_ptk_info *tpi; + + if (tunnel->parms.iph.protocol != ipproto && tunnel->parms.iph.protocol != 0) goto drop; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; - if (iptunnel_pull_header(skb, 0, tpi.proto, false)) +#if IS_ENABLED(CONFIG_MPLS) + if (ipproto == IPPROTO_MPLS) + tpi = &mplsip_tpi; + else +#endif + tpi = &ipip_tpi; + if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; - return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error); + return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error); } return 1; @@ -720,6 +735,18 @@ drop: return 0; } +static int ipip_rcv(struct sk_buff *skb) +{ + return sit_tunnel_rcv(skb, IPPROTO_IPIP); +} + +#if IS_ENABLED(CONFIG_MPLS) +static int mplsip_rcv(struct sk_buff *skb) +{ + return sit_tunnel_rcv(skb, IPPROTO_MPLS); +} +#endif + /* * If the IPv6 address comes from 6rd / 6to4 (RFC 3056) addr space this function * stores the embedded IPv4 address in v4dst and returns true. @@ -958,7 +985,8 @@ tx_error: return NETDEV_TX_OK; } -static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t sit_tunnel_xmit__(struct sk_buff *skb, + struct net_device *dev, u8 ipproto) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tiph = &tunnel->parms.iph; @@ -966,9 +994,9 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) goto tx_error; - skb_set_inner_ipproto(skb, IPPROTO_IPIP); + skb_set_inner_ipproto(skb, ipproto); - ip_tunnel_xmit(skb, dev, tiph, IPPROTO_IPIP); + ip_tunnel_xmit(skb, dev, tiph, ipproto); return NETDEV_TX_OK; tx_error: kfree_skb(skb); @@ -981,11 +1009,16 @@ static netdev_tx_t sit_tunnel_xmit(struct sk_buff *skb, { switch (skb->protocol) { case htons(ETH_P_IP): - ipip_tunnel_xmit(skb, dev); + sit_tunnel_xmit__(skb, dev, IPPROTO_IPIP); break; case htons(ETH_P_IPV6): ipip6_tunnel_xmit(skb, dev); break; +#if IS_ENABLED(CONFIG_MPLS) + case htons(ETH_P_MPLS_UC): + sit_tunnel_xmit__(skb, dev, IPPROTO_MPLS); + break; +#endif default: goto tx_err; } @@ -1093,6 +1126,16 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t, } #endif +bool ipip6_valid_ip_proto(u8 ipproto) +{ + return ipproto == IPPROTO_IPV6 || + ipproto == IPPROTO_IPIP || +#if IS_ENABLED(CONFIG_MPLS) + ipproto == IPPROTO_MPLS || +#endif + ipproto == 0; +} + static int ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { @@ -1152,9 +1195,7 @@ ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) goto done; err = -EINVAL; - if (p.iph.protocol != IPPROTO_IPV6 && - p.iph.protocol != IPPROTO_IPIP && - p.iph.protocol != 0) + if (!ipip6_valid_ip_proto(p.iph.protocol)) goto done; if (p.iph.version != 4 || p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) @@ -1379,9 +1420,7 @@ static int ipip6_validate(struct nlattr *tb[], struct nlattr *data[]) return 0; proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); - if (proto != IPPROTO_IPV6 && - proto != IPPROTO_IPIP && - proto != 0) + if (!ipip6_valid_ip_proto(proto)) return -EINVAL; return 0; @@ -1723,6 +1762,14 @@ static struct xfrm_tunnel ipip_handler __read_mostly = { .priority = 2, }; +#if IS_ENABLED(CONFIG_MPLS) +static struct xfrm_tunnel mplsip_handler __read_mostly = { + .handler = mplsip_rcv, + .err_handler = ipip6_err, + .priority = 2, +}; +#endif + static void __net_exit sit_destroy_tunnels(struct net *net, struct list_head *head) { @@ -1818,6 +1865,9 @@ static void __exit sit_cleanup(void) rtnl_link_unregister(&sit_link_ops); xfrm4_tunnel_deregister(&sit_handler, AF_INET6); xfrm4_tunnel_deregister(&ipip_handler, AF_INET); +#if IS_ENABLED(CONFIG_MPLS) + xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS); +#endif unregister_pernet_device(&sit_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ @@ -1827,7 +1877,7 @@ static int __init sit_init(void) { int err; - pr_info("IPv6 over IPv4 tunneling driver\n"); + pr_info("IPv6, IPv4 and MPLS over IPv4 tunneling driver\n"); err = register_pernet_device(&sit_net_ops); if (err < 0) @@ -1842,6 +1892,13 @@ static int __init sit_init(void) pr_info("%s: can't register ip4ip4\n", __func__); goto xfrm_tunnel4_failed; } +#if IS_ENABLED(CONFIG_MPLS) + err = xfrm4_tunnel_register(&mplsip_handler, AF_MPLS); + if (err < 0) { + pr_info("%s: can't register mplsip\n", __func__); + goto xfrm_tunnel_mpls_failed; + } +#endif err = rtnl_link_register(&sit_link_ops); if (err < 0) goto rtnl_link_failed; @@ -1850,6 +1907,10 @@ out: return err; rtnl_link_failed: +#if IS_ENABLED(CONFIG_MPLS) + xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS); +xfrm_tunnel_mpls_failed: +#endif xfrm4_tunnel_deregister(&ipip_handler, AF_INET); xfrm_tunnel4_failed: xfrm4_tunnel_deregister(&sit_handler, AF_INET6); -- cgit v1.2.3 From 1b69e7e6c4da1e84edc2496fa91db289e5e493b0 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 7 Jul 2016 07:56:14 +0200 Subject: ipip: support MPLS over IPv4 Extend the IPIP driver to support MPLS over IPv4. The implementation is an extension of existing support for IPv4 over IPv4 and is based of multiple inner-protocol support for the SIT driver. Signed-off-by: Simon Horman Reviewed-by: Dinan Gunawardena Signed-off-by: David S. Miller --- net/ipv4/ipip.c | 137 +++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 121 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 978370132f29..4ae3f8e6c6cc 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -148,14 +148,14 @@ static int ipip_err(struct sk_buff *skb, u32 info) if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, dev_net(skb->dev), info, - t->parms.link, 0, IPPROTO_IPIP, 0); + t->parms.link, 0, iph->protocol, 0); err = 0; goto out; } if (type == ICMP_REDIRECT) { ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0, - IPPROTO_IPIP, 0); + iph->protocol, 0); err = 0; goto out; } @@ -177,12 +177,19 @@ out: return err; } -static const struct tnl_ptk_info tpi = { +static const struct tnl_ptk_info ipip_tpi = { /* no tunnel info required for ipip. */ .proto = htons(ETH_P_IP), }; -static int ipip_rcv(struct sk_buff *skb) +#if IS_ENABLED(CONFIG_MPLS) +static const struct tnl_ptk_info mplsip_tpi = { + /* no tunnel info required for mplsip. */ + .proto = htons(ETH_P_MPLS_UC), +}; +#endif + +static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto) { struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); @@ -193,11 +200,23 @@ static int ipip_rcv(struct sk_buff *skb) tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, iph->saddr, iph->daddr, 0); if (tunnel) { + const struct tnl_ptk_info *tpi; + + if (tunnel->parms.iph.protocol != ipproto && + tunnel->parms.iph.protocol != 0) + goto drop; + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; - if (iptunnel_pull_header(skb, 0, tpi.proto, false)) +#if IS_ENABLED(CONFIG_MPLS) + if (ipproto == IPPROTO_MPLS) + tpi = &mplsip_tpi; + else +#endif + tpi = &ipip_tpi; + if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; - return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error); + return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error); } return -1; @@ -207,24 +226,51 @@ drop: return 0; } +static int ipip_rcv(struct sk_buff *skb) +{ + return ipip_tunnel_rcv(skb, IPPROTO_IPIP); +} + +#if IS_ENABLED(CONFIG_MPLS) +static int mplsip_rcv(struct sk_buff *skb) +{ + return ipip_tunnel_rcv(skb, IPPROTO_MPLS); +} +#endif + /* * This function assumes it is being called from dev_queue_xmit() * and that skb is filled properly by that function. */ -static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, + struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tiph = &tunnel->parms.iph; + u8 ipproto; + + switch (skb->protocol) { + case htons(ETH_P_IP): + ipproto = IPPROTO_IPIP; + break; +#if IS_ENABLED(CONFIG_MPLS) + case htons(ETH_P_MPLS_UC): + ipproto = IPPROTO_MPLS; + break; +#endif + default: + goto tx_error; + } - if (unlikely(skb->protocol != htons(ETH_P_IP))) + if (tiph->protocol != ipproto && tiph->protocol != 0) goto tx_error; if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) goto tx_error; - skb_set_inner_ipproto(skb, IPPROTO_IPIP); + skb_set_inner_ipproto(skb, ipproto); - ip_tunnel_xmit(skb, dev, tiph, tiph->protocol); + ip_tunnel_xmit(skb, dev, tiph, ipproto); return NETDEV_TX_OK; tx_error: @@ -234,6 +280,20 @@ tx_error: return NETDEV_TX_OK; } +static bool ipip_tunnel_ioctl_verify_protocol(u8 ipproto) +{ + switch (ipproto) { + case 0: + case IPPROTO_IPIP: +#if IS_ENABLED(CONFIG_MPLS) + case IPPROTO_MPLS: +#endif + return true; + } + + return false; +} + static int ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { @@ -244,7 +304,8 @@ ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EFAULT; if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { - if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || + if (p.iph.version != 4 || + !ipip_tunnel_ioctl_verify_protocol(p.iph.protocol) || p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) return -EINVAL; } @@ -301,10 +362,23 @@ static int ipip_tunnel_init(struct net_device *dev) tunnel->tun_hlen = 0; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; - tunnel->parms.iph.protocol = IPPROTO_IPIP; return ip_tunnel_init(dev); } +static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + u8 proto; + + if (!data || !data[IFLA_IPTUN_PROTO]) + return 0; + + proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); + if (proto != IPPROTO_IPIP && proto != IPPROTO_MPLS && proto != 0) + return -EINVAL; + + return 0; +} + static void ipip_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm *parms) { @@ -335,6 +409,9 @@ static void ipip_netlink_parms(struct nlattr *data[], if (data[IFLA_IPTUN_TOS]) parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]); + if (data[IFLA_IPTUN_PROTO]) + parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]); + if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC])) parms->iph.frag_off = htons(IP_DF); } @@ -427,6 +504,8 @@ static size_t ipip_get_size(const struct net_device *dev) nla_total_size(1) + /* IFLA_IPTUN_TOS */ nla_total_size(1) + + /* IFLA_IPTUN_PROTO */ + nla_total_size(1) + /* IFLA_IPTUN_PMTUDISC */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_TYPE */ @@ -450,6 +529,7 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_in_addr(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) || nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) || nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) || + nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) || nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, !!(parm->iph.frag_off & htons(IP_DF)))) goto nla_put_failure; @@ -476,6 +556,7 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_REMOTE] = { .type = NLA_U32 }, [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, [IFLA_IPTUN_TOS] = { .type = NLA_U8 }, + [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, @@ -489,6 +570,7 @@ static struct rtnl_link_ops ipip_link_ops __read_mostly = { .policy = ipip_policy, .priv_size = sizeof(struct ip_tunnel), .setup = ipip_tunnel_setup, + .validate = ipip_tunnel_validate, .newlink = ipip_newlink, .changelink = ipip_changelink, .dellink = ip_tunnel_dellink, @@ -503,6 +585,14 @@ static struct xfrm_tunnel ipip_handler __read_mostly = { .priority = 1, }; +#if IS_ENABLED(CONFIG_MPLS) +static struct xfrm_tunnel mplsip_handler __read_mostly = { + .handler = mplsip_rcv, + .err_handler = ipip_err, + .priority = 1, +}; +#endif + static int __net_init ipip_init_net(struct net *net) { return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0"); @@ -525,7 +615,7 @@ static int __init ipip_init(void) { int err; - pr_info("ipip: IPv4 over IPv4 tunneling driver\n"); + pr_info("ipip: IPv4 and MPLS over IPv4 tunneling driver\n"); err = register_pernet_device(&ipip_net_ops); if (err < 0) @@ -533,8 +623,15 @@ static int __init ipip_init(void) err = xfrm4_tunnel_register(&ipip_handler, AF_INET); if (err < 0) { pr_info("%s: can't register tunnel\n", __func__); - goto xfrm_tunnel_failed; + goto xfrm_tunnel_ipip_failed; + } +#if IS_ENABLED(CONFIG_MPLS) + err = xfrm4_tunnel_register(&mplsip_handler, AF_MPLS); + if (err < 0) { + pr_info("%s: can't register tunnel\n", __func__); + goto xfrm_tunnel_mplsip_failed; } +#endif err = rtnl_link_register(&ipip_link_ops); if (err < 0) goto rtnl_link_failed; @@ -543,8 +640,13 @@ out: return err; rtnl_link_failed: +#if IS_ENABLED(CONFIG_MPLS) + xfrm4_tunnel_deregister(&mplsip_handler, AF_INET); +xfrm_tunnel_mplsip_failed: + +#endif xfrm4_tunnel_deregister(&ipip_handler, AF_INET); -xfrm_tunnel_failed: +xfrm_tunnel_ipip_failed: unregister_pernet_device(&ipip_net_ops); goto out; } @@ -554,7 +656,10 @@ static void __exit ipip_fini(void) rtnl_link_unregister(&ipip_link_ops); if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) pr_info("%s: can't deregister tunnel\n", __func__); - +#if IS_ENABLED(CONFIG_MPLS) + if (xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS)) + pr_info("%s: can't deregister tunnel\n", __func__); +#endif unregister_pernet_device(&ipip_net_ops); } -- cgit v1.2.3 From 407f31be9ddfbcc51ae8054c1218db00c08b92e9 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 7 Jul 2016 07:56:15 +0200 Subject: mpls: allow routes on ipip and sit devices Allow MPLS routes on IPIP and SIT devices now that they support forwarding MPLS packets. Signed-off-by: Simon Horman Reviewed-by: Dinan Gunawardena Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index e9beaa58573c..5c161e7759b5 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1009,10 +1009,12 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, unsigned int flags; if (event == NETDEV_REGISTER) { - /* For now just support Ethernet and IPGRE devices */ + /* For now just support Ethernet, IPGRE, SIT and IPIP devices */ if (dev->type == ARPHRD_ETHER || dev->type == ARPHRD_LOOPBACK || - dev->type == ARPHRD_IPGRE) { + dev->type == ARPHRD_IPGRE || + dev->type == ARPHRD_SIT || + dev->type == ARPHRD_TUNNEL) { mdev = mpls_add_dev(dev); if (IS_ERR(mdev)) return notifier_from_errno(PTR_ERR(mdev)); -- cgit v1.2.3 From 1db19db7f5ff4ddd3b1b6dd2092a87298ee5bd0b Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 7 Jul 2016 18:01:32 +0200 Subject: net: tracepoint napi:napi_poll add work and budget An important information for the napi_poll tracepoint is knowing the work done (packets processed) by the napi_poll() call. Add both the work done and budget, as they are related. Handle trace_napi_poll() param change in dropwatch/drop_monitor and in python perf script netdev-times.py in backward compat way, as python fortunately supports optional parameter handling. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- net/core/drop_monitor.c | 3 ++- net/core/netpoll.c | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index b92d63bfde7a..7894e406c806 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4972,7 +4972,7 @@ bool sk_busy_loop(struct sock *sk, int nonblock) if (test_bit(NAPI_STATE_SCHED, &napi->state)) { rc = napi->poll(napi, BUSY_POLL_BUDGET); - trace_napi_poll(napi); + trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); if (rc == BUSY_POLL_BUDGET) { napi_complete_done(napi, rc); napi_schedule(napi); @@ -5128,7 +5128,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) work = 0; if (test_bit(NAPI_STATE_SCHED, &n->state)) { work = n->poll(n, weight); - trace_napi_poll(n); + trace_napi_poll(n, work, weight); } WARN_ON_ONCE(work > weight); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 252e155c837b..d6b3b579560d 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -187,7 +187,8 @@ static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *locatio trace_drop_common(skb, location); } -static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi) +static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi, + int work, int budget) { struct dm_hw_stat_delta *new_stat; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 94acfc89ad97..53599bd0c82d 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -163,7 +163,7 @@ static void poll_one_napi(struct napi_struct *napi) */ work = napi->poll(napi, 0); WARN_ONCE(work, "%pF exceeded budget in poll\n", napi->poll); - trace_napi_poll(napi); + trace_napi_poll(napi, work, 0); clear_bit(NAPI_STATE_NPSVC, &napi->state); } -- cgit v1.2.3 From fa17806cde76fb1087532f07e72aa757a30e0500 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Jul 2016 05:18:24 +0200 Subject: ipv4: do not abuse GFP_ATOMIC in inet_netconf_notify_devconf() inet_forward_change() runs with RTNL held. We are allowed to sleep if required. If we use __in_dev_get_rtnl() instead of __in_dev_get_rcu(), we no longer have to use GFP_ATOMIC allocations in inet_netconf_notify_devconf(), meaning we are less likely to miss notifications under memory pressure, and wont touch precious memory reserves either and risk dropping incoming packets. inet_netconf_get_devconf() can also use GFP_KERNEL allocation. Fixes: edc9e748934c ("rtnl/ipv4: use netconf msg to advertise forwarding status") Fixes: 9e5511106f99 ("rtnl/ipv4: add support of RTM_GETNETCONF") Signed-off-by: Eric Dumazet Cc: Nicolas Dichtel Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index e333bc86bd39..415e117967c7 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1834,7 +1834,7 @@ void inet_netconf_notify_devconf(struct net *net, int type, int ifindex, struct sk_buff *skb; int err = -ENOBUFS; - skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_ATOMIC); + skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_KERNEL); if (!skb) goto errout; @@ -1846,7 +1846,7 @@ void inet_netconf_notify_devconf(struct net *net, int type, int ifindex, kfree_skb(skb); goto errout; } - rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_KERNEL); return; errout: if (err < 0) @@ -1903,7 +1903,7 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb, } err = -ENOBUFS; - skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_ATOMIC); + skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL); if (!skb) goto errout; @@ -2027,16 +2027,16 @@ static void inet_forward_change(struct net *net) for_each_netdev(net, dev) { struct in_device *in_dev; + if (on) dev_disable_lro(dev); - rcu_read_lock(); - in_dev = __in_dev_get_rcu(dev); + + in_dev = __in_dev_get_rtnl(dev); if (in_dev) { IN_DEV_CONF_SET(in_dev, FORWARDING, on); inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, dev->ifindex, &in_dev->cnf); } - rcu_read_unlock(); } } -- cgit v1.2.3 From 927265bc6cd6374c9bafc43408ece4e92311b149 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Jul 2016 05:46:04 +0200 Subject: ipv6: do not abuse GFP_ATOMIC in inet6_netconf_notify_devconf() All inet6_netconf_notify_devconf() callers are in process context, so we can use GFP_KERNEL allocations if we take care of not holding a rwlock while not needed in ip6mr (we hold RTNL there) Fixes: d67b8c616b48 ("netconf: advertise mc_forwarding status") Fixes: f3a1bfb11ccb ("rtnl/ipv6: use netconf msg to advertise forwarding status") Signed-off-by: Eric Dumazet Cc: Nicolas Dichtel Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 4 ++-- net/ipv6/ip6mr.c | 13 +++++++------ 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a1f6b7b31531..24f1b0898e40 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -547,7 +547,7 @@ void inet6_netconf_notify_devconf(struct net *net, int type, int ifindex, struct sk_buff *skb; int err = -ENOBUFS; - skb = nlmsg_new(inet6_netconf_msgsize_devconf(type), GFP_ATOMIC); + skb = nlmsg_new(inet6_netconf_msgsize_devconf(type), GFP_KERNEL); if (!skb) goto errout; @@ -559,7 +559,7 @@ void inet6_netconf_notify_devconf(struct net *net, int type, int ifindex, kfree_skb(skb); goto errout; } - rtnl_notify(skb, net, 0, RTNLGRP_IPV6_NETCONF, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_NETCONF, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV6_NETCONF, err); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 487ef3bc7bbc..c7ca0f5d1a3b 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1592,14 +1592,15 @@ static int ip6mr_sk_init(struct mr6_table *mrt, struct sock *sk) if (likely(mrt->mroute6_sk == NULL)) { mrt->mroute6_sk = sk; net->ipv6.devconf_all->mc_forwarding++; - inet6_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, - NETCONFA_IFINDEX_ALL, - net->ipv6.devconf_all); - } - else + } else { err = -EADDRINUSE; + } write_unlock_bh(&mrt_lock); + if (!err) + inet6_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, + NETCONFA_IFINDEX_ALL, + net->ipv6.devconf_all); rtnl_unlock(); return err; @@ -1617,11 +1618,11 @@ int ip6mr_sk_done(struct sock *sk) write_lock_bh(&mrt_lock); mrt->mroute6_sk = NULL; net->ipv6.devconf_all->mc_forwarding--; + write_unlock_bh(&mrt_lock); inet6_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); - write_unlock_bh(&mrt_lock); mroute_clean_tables(mrt, false); err = 0; -- cgit v1.2.3 From 1d984c2e03c1fb21539a9f50627e312788512013 Mon Sep 17 00:00:00 2001 From: Thierry Escande Date: Fri, 8 Jul 2016 15:52:39 +0200 Subject: NFC: digital: Fix handling of saved PDU sk_buff pointers This patch fixes the way an I-PDU is saved in case it needs to be sent again. It is now copied using pskb_copy() and not simply referenced using skb_get() since it could be modified by the driver. digital_in_send_saved_skb() and digital_tg_send_saved_skb() still get a reference on the saved skb which is re-sent but release it if the send operation fails. That way the caller doesn't have to take care about skb ref in case of error. RTOX supervisor PDU must not be saved as this can override a previously saved I-PDU that should be re-sent later on. Signed-off-by: Thierry Escande Signed-off-by: Samuel Ortiz --- net/nfc/digital_dep.c | 59 +++++++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c index b62c85dc12a2..804585cb3f8e 100644 --- a/net/nfc/digital_dep.c +++ b/net/nfc/digital_dep.c @@ -524,8 +524,7 @@ static int digital_in_send_ack(struct nfc_digital_dev *ddev, ddev->skb_add_crc(skb); - ddev->saved_skb = skb_get(skb); - ddev->saved_skb_len = skb->len; + ddev->saved_skb = pskb_copy(skb, GFP_KERNEL); rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res, data_exch); @@ -627,16 +626,10 @@ static int digital_in_send_rtox(struct nfc_digital_dev *ddev, ddev->skb_add_crc(skb); - ddev->saved_skb = skb_get(skb); - ddev->saved_skb_len = skb->len; - rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res, data_exch); - if (rc) { + if (rc) kfree_skb(skb); - kfree_skb(ddev->saved_skb); - ddev->saved_skb = NULL; - } return rc; } @@ -644,11 +637,19 @@ static int digital_in_send_rtox(struct nfc_digital_dev *ddev, static int digital_in_send_saved_skb(struct nfc_digital_dev *ddev, struct digital_data_exch *data_exch) { + int rc; + + if (!ddev->saved_skb) + return -EINVAL; + skb_get(ddev->saved_skb); - skb_push(ddev->saved_skb, ddev->saved_skb_len); - return digital_in_send_cmd(ddev, ddev->saved_skb, 1500, - digital_in_recv_dep_res, data_exch); + rc = digital_in_send_cmd(ddev, ddev->saved_skb, 1500, + digital_in_recv_dep_res, data_exch); + if (rc) + kfree_skb(ddev->saved_skb); + + return rc; } static void digital_in_recv_dep_res(struct nfc_digital_dev *ddev, void *arg, @@ -812,17 +813,12 @@ static void digital_in_recv_dep_res(struct nfc_digital_dev *ddev, void *arg, case DIGITAL_NFC_DEP_PFB_SUPERVISOR_PDU: if (!DIGITAL_NFC_DEP_PFB_IS_TIMEOUT(pfb)) { /* ATN */ rc = digital_in_send_saved_skb(ddev, data_exch); - if (rc) { - kfree_skb(ddev->saved_skb); + if (rc) goto error; - } return; } - kfree_skb(ddev->saved_skb); - ddev->saved_skb = NULL; - rc = digital_in_send_rtox(ddev, data_exch, resp->data[0]); if (rc) goto error; @@ -876,8 +872,7 @@ int digital_in_send_dep_req(struct nfc_digital_dev *ddev, ddev->skb_add_crc(tmp_skb); - ddev->saved_skb = skb_get(tmp_skb); - ddev->saved_skb_len = tmp_skb->len; + ddev->saved_skb = pskb_copy(tmp_skb, GFP_KERNEL); rc = digital_in_send_cmd(ddev, tmp_skb, 1500, digital_in_recv_dep_res, data_exch); @@ -956,8 +951,7 @@ static int digital_tg_send_ack(struct nfc_digital_dev *ddev, ddev->skb_add_crc(skb); - ddev->saved_skb = skb_get(skb); - ddev->saved_skb_len = skb->len; + ddev->saved_skb = pskb_copy(skb, GFP_KERNEL); rc = digital_tg_send_cmd(ddev, skb, 1500, digital_tg_recv_dep_req, data_exch); @@ -1009,11 +1003,19 @@ static int digital_tg_send_atn(struct nfc_digital_dev *ddev) static int digital_tg_send_saved_skb(struct nfc_digital_dev *ddev) { + int rc; + + if (!ddev->saved_skb) + return -EINVAL; + skb_get(ddev->saved_skb); - skb_push(ddev->saved_skb, ddev->saved_skb_len); - return digital_tg_send_cmd(ddev, ddev->saved_skb, 1500, - digital_tg_recv_dep_req, NULL); + rc = digital_tg_send_cmd(ddev, ddev->saved_skb, 1500, + digital_tg_recv_dep_req, NULL); + if (rc) + kfree_skb(ddev->saved_skb); + + return rc; } static void digital_tg_recv_dep_req(struct nfc_digital_dev *ddev, void *arg, @@ -1163,10 +1165,8 @@ static void digital_tg_recv_dep_req(struct nfc_digital_dev *ddev, void *arg, ddev->atn_count = 0; rc = digital_tg_send_saved_skb(ddev); - if (rc) { - kfree_skb(ddev->saved_skb); + if (rc) goto exit; - } } return; @@ -1235,8 +1235,7 @@ int digital_tg_send_dep_res(struct nfc_digital_dev *ddev, struct sk_buff *skb) ddev->skb_add_crc(tmp_skb); - ddev->saved_skb = skb_get(tmp_skb); - ddev->saved_skb_len = tmp_skb->len; + ddev->saved_skb = pskb_copy(tmp_skb, GFP_KERNEL); rc = digital_tg_send_cmd(ddev, tmp_skb, 1500, digital_tg_recv_dep_req, NULL); -- cgit v1.2.3 From e8e7f4217564fc115b60a9373646afb193aa08cf Mon Sep 17 00:00:00 2001 From: Thierry Escande Date: Fri, 8 Jul 2016 15:52:40 +0200 Subject: NFC: digital: Remove useless call to skb_reserve() When allocating chained I-PDUs, there is no need to call skb_reserve() since it's already done by digital_alloc_skb() and contains enough room for the driver head and tail data. Signed-off-by: Thierry Escande Signed-off-by: Samuel Ortiz --- net/nfc/digital_dep.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c index 804585cb3f8e..ed3a52971d65 100644 --- a/net/nfc/digital_dep.c +++ b/net/nfc/digital_dep.c @@ -190,8 +190,6 @@ digital_send_dep_data_prep(struct nfc_digital_dev *ddev, struct sk_buff *skb, return ERR_PTR(-ENOMEM); } - skb_reserve(new_skb, ddev->tx_headroom + NFC_HEADER_SIZE + - DIGITAL_NFC_DEP_REQ_RES_HEADROOM); memcpy(skb_put(new_skb, ddev->remote_payload_max), skb->data, ddev->remote_payload_max); skb_pull(skb, ddev->remote_payload_max); -- cgit v1.2.3 From f23a9868b1c45e77ec6082eb95508885111ffda1 Mon Sep 17 00:00:00 2001 From: Thierry Escande Date: Fri, 8 Jul 2016 15:52:41 +0200 Subject: NFC: digital: Fix target DEP_REQ I-PDU handling after ATN PDU When the initiator sends a DEP_REQ I-PDU, the target device may not reply in a timely manner. In this case the initiator device must send an attention PDU (ATN) and if the recipient replies with an ATN PDU in return, then the last I-PDU must be sent again by the initiator. This patch fixes how the target handles I-PDU received after an ATN PDU has been received. There are 2 possible cases: - The target has received the initial DEP_REQ and sends back the DEP_RES but the initiator did not receive it. In this case, after the initiator has sent an ATN PDU and the target replied it (with an ATN as well), the initiator sends the saved skb of the initial DEP_REQ again and the target replies with the saved skb of the initial DEP_RES. - Or the target did not even received the initial DEP_REQ. In this case, after the ATN PDUs exchange, the initiator sends the saved skb and the target simply passes it up, just as usual. This behavior is controlled using the atn_count and the PNI field of the digital device structure. Signed-off-by: Thierry Escande Signed-off-by: Samuel Ortiz --- net/nfc/digital_dep.c | 45 +++++++++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c index ed3a52971d65..1778c23751d4 100644 --- a/net/nfc/digital_dep.c +++ b/net/nfc/digital_dep.c @@ -1086,22 +1086,38 @@ static void digital_tg_recv_dep_req(struct nfc_digital_dev *ddev, void *arg, case DIGITAL_NFC_DEP_PFB_I_PDU: pr_debug("DIGITAL_NFC_DEP_PFB_I_PDU\n"); - if ((ddev->atn_count && (DIGITAL_NFC_DEP_PFB_PNI(pfb - 1) != - ddev->curr_nfc_dep_pni)) || - (DIGITAL_NFC_DEP_PFB_PNI(pfb) != ddev->curr_nfc_dep_pni)) { - PROTOCOL_ERR("14.12.3.4"); - rc = -EIO; - goto exit; - } - if (ddev->atn_count) { + /* The target has received (and replied to) at least one + * ATN DEP_REQ. + */ ddev->atn_count = 0; - rc = digital_tg_send_saved_skb(ddev); - if (rc) - goto exit; + /* pni of resp PDU equal to the target current pni - 1 + * means resp is the previous DEP_REQ PDU received from + * the initiator so the target replies with saved_skb + * which is the previous DEP_RES saved in + * digital_tg_send_dep_res(). + */ + if (DIGITAL_NFC_DEP_PFB_PNI(pfb) == + DIGITAL_NFC_DEP_PFB_PNI(ddev->curr_nfc_dep_pni - 1)) { + rc = digital_tg_send_saved_skb(ddev); + if (rc) + goto exit; - return; + goto free_resp; + } + + /* atn_count > 0 and PDU pni != curr_nfc_dep_pni - 1 + * means the target probably did not received the last + * DEP_REQ PDU sent by the initiator. The target + * fallbacks to normal processing then. + */ + } + + if (DIGITAL_NFC_DEP_PFB_PNI(pfb) != ddev->curr_nfc_dep_pni) { + PROTOCOL_ERR("14.12.3.4"); + rc = -EIO; + goto exit; } kfree_skb(ddev->saved_skb); @@ -1197,6 +1213,11 @@ exit: if (rc) kfree_skb(resp); + + return; + +free_resp: + dev_kfree_skb(resp); } int digital_tg_send_dep_res(struct nfc_digital_dev *ddev, struct sk_buff *skb) -- cgit v1.2.3 From 482333b277de181ce80c833d84f2598e2527b267 Mon Sep 17 00:00:00 2001 From: Thierry Escande Date: Fri, 8 Jul 2016 15:52:42 +0200 Subject: NFC: digital: Fix ACK & NACK PDUs handling in target mode When the target receives a NACK PDU, it re-sends the last sent PDU. ACK PDUs are received by the target as a reply from the initiator to chained I-PDUs. There are 3 cases to handle: - If the target has previously received 1 or more ATN PDUs and the PNI in the ACK PDU is equal to the target PNI - 1, then it means that the initiator did not received the last issued PDU from the target. In this case it re-sends this PDU. - If the target has received 1 or more ATN PDUs but the ACK PNI is not the target PNI - 1, then this means that this ACK is the reply of the previous chained I-PDU sent by the target. The target did not received it on the first attempt and it is being re-sent by the initiator. The process continues as usual. - No ATN PDU received before this ACK PDU. This is the reply of a chained I-PDU. The target keeps on processing its chained I-PDU. The code has been refactored to avoid too many indentation levels. Also, ACK and NACK PDUs were not freed. This is now fixed. Signed-off-by: Thierry Escande Signed-off-by: Samuel Ortiz --- net/nfc/digital_dep.c | 71 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c index 1778c23751d4..e0268777ab18 100644 --- a/net/nfc/digital_dep.c +++ b/net/nfc/digital_dep.c @@ -1141,49 +1141,64 @@ static void digital_tg_recv_dep_req(struct nfc_digital_dev *ddev, void *arg, rc = 0; break; case DIGITAL_NFC_DEP_PFB_ACK_NACK_PDU: - if (!DIGITAL_NFC_DEP_NACK_BIT_SET(pfb)) { /* ACK */ - if ((ddev->atn_count && - (DIGITAL_NFC_DEP_PFB_PNI(pfb - 1) != - ddev->curr_nfc_dep_pni)) || - (DIGITAL_NFC_DEP_PFB_PNI(pfb) != - ddev->curr_nfc_dep_pni) || - !ddev->chaining_skb || !ddev->saved_skb) { + if (DIGITAL_NFC_DEP_NACK_BIT_SET(pfb)) { /* NACK */ + if (DIGITAL_NFC_DEP_PFB_PNI(pfb + 1) != + ddev->curr_nfc_dep_pni) { rc = -EIO; goto exit; } - if (ddev->atn_count) { - ddev->atn_count = 0; + ddev->atn_count = 0; + + rc = digital_tg_send_saved_skb(ddev); + if (rc) + goto exit; + + goto free_resp; + } + + /* ACK */ + if (ddev->atn_count) { + /* The target has previously recevied one or more ATN + * PDUs. + */ + ddev->atn_count = 0; + /* If the ACK PNI is equal to the target PNI - 1 means + * that the initiator did not receive the previous PDU + * sent by the target so re-send it. + */ + if (DIGITAL_NFC_DEP_PFB_PNI(pfb + 1) == + ddev->curr_nfc_dep_pni) { rc = digital_tg_send_saved_skb(ddev); if (rc) goto exit; - return; + goto free_resp; } - kfree_skb(ddev->saved_skb); - ddev->saved_skb = NULL; + /* Otherwise, the target did not receive the previous + * ACK PDU from the initiator. Fallback to normal + * processing of chained PDU then. + */ + } - rc = digital_tg_send_dep_res(ddev, ddev->chaining_skb); - if (rc) - goto exit; - } else { /* NACK */ - if ((DIGITAL_NFC_DEP_PFB_PNI(pfb + 1) != - ddev->curr_nfc_dep_pni) || - !ddev->saved_skb) { - rc = -EIO; - goto exit; - } + /* Keep on sending chained PDU */ + if (!ddev->chaining_skb || + DIGITAL_NFC_DEP_PFB_PNI(pfb) != + ddev->curr_nfc_dep_pni) { + rc = -EIO; + goto exit; + } - ddev->atn_count = 0; + kfree_skb(ddev->saved_skb); + ddev->saved_skb = NULL; - rc = digital_tg_send_saved_skb(ddev); - if (rc) - goto exit; - } + rc = digital_tg_send_dep_res(ddev, ddev->chaining_skb); + if (rc) + goto exit; - return; + goto free_resp; case DIGITAL_NFC_DEP_PFB_SUPERVISOR_PDU: if (DIGITAL_NFC_DEP_PFB_IS_TIMEOUT(pfb)) { rc = -EINVAL; -- cgit v1.2.3 From e073eb6797191abe2fe30ca643ab0cc3d8e1e534 Mon Sep 17 00:00:00 2001 From: Thierry Escande Date: Fri, 8 Jul 2016 15:52:43 +0200 Subject: NFC: digital: Rework ACK PDU handling in initiator mode With this patch, ACK PDU sk_buffs are now freed and code has been refactored for better errors handling. Signed-off-by: Thierry Escande Signed-off-by: Samuel Ortiz --- net/nfc/digital_dep.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c index e0268777ab18..03bfc74745f7 100644 --- a/net/nfc/digital_dep.c +++ b/net/nfc/digital_dep.c @@ -782,6 +782,12 @@ static void digital_in_recv_dep_res(struct nfc_digital_dev *ddev, void *arg, break; case DIGITAL_NFC_DEP_PFB_ACK_NACK_PDU: + if (DIGITAL_NFC_DEP_NACK_BIT_SET(pfb)) { + PROTOCOL_ERR("14.12.4.5"); + rc = -EIO; + goto exit; + } + if (DIGITAL_NFC_DEP_PFB_PNI(pfb) != ddev->curr_nfc_dep_pni) { PROTOCOL_ERR("14.12.3.3"); rc = -EIO; @@ -791,22 +797,25 @@ static void digital_in_recv_dep_res(struct nfc_digital_dev *ddev, void *arg, ddev->curr_nfc_dep_pni = DIGITAL_NFC_DEP_PFB_PNI(ddev->curr_nfc_dep_pni + 1); - if (ddev->chaining_skb && !DIGITAL_NFC_DEP_NACK_BIT_SET(pfb)) { - kfree_skb(ddev->saved_skb); - ddev->saved_skb = NULL; + if (!ddev->chaining_skb) { + PROTOCOL_ERR("14.12.4.3"); + rc = -EIO; + goto exit; + } - rc = digital_in_send_dep_req(ddev, NULL, - ddev->chaining_skb, - ddev->data_exch); - if (rc) - goto error; + /* The initiator has received a valid ACK. Free the last sent + * PDU and keep on sending chained skb. + */ + kfree_skb(ddev->saved_skb); + ddev->saved_skb = NULL; - return; - } + rc = digital_in_send_dep_req(ddev, NULL, + ddev->chaining_skb, + ddev->data_exch); + if (rc) + goto error; - pr_err("Received a ACK/NACK PDU\n"); - rc = -EINVAL; - goto exit; + goto free_resp; case DIGITAL_NFC_DEP_PFB_SUPERVISOR_PDU: if (!DIGITAL_NFC_DEP_PFB_IS_TIMEOUT(pfb)) { /* ATN */ @@ -839,6 +848,11 @@ error: if (rc) kfree_skb(resp); + + return; + +free_resp: + dev_kfree_skb(resp); } int digital_in_send_dep_req(struct nfc_digital_dev *ddev, -- cgit v1.2.3 From e200f008ace69eebac0a1432dc9e24ab5cd0d029 Mon Sep 17 00:00:00 2001 From: Thierry Escande Date: Fri, 8 Jul 2016 15:52:44 +0200 Subject: NFC: digital: Free supervisor PDUs This patch frees the RTOX resp sk_buff in initiator mode. It also makes use of the free_resp exit point for ATN supervisor PDUs in both initiator and target mode. Signed-off-by: Thierry Escande Signed-off-by: Samuel Ortiz --- net/nfc/digital_dep.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c index 03bfc74745f7..ba52a5dbf3cc 100644 --- a/net/nfc/digital_dep.c +++ b/net/nfc/digital_dep.c @@ -823,15 +823,14 @@ static void digital_in_recv_dep_res(struct nfc_digital_dev *ddev, void *arg, if (rc) goto error; - return; + goto free_resp; } rc = digital_in_send_rtox(ddev, data_exch, resp->data[0]); if (rc) goto error; - kfree_skb(resp); - return; + goto free_resp; } exit: @@ -1225,8 +1224,7 @@ static void digital_tg_recv_dep_req(struct nfc_digital_dev *ddev, void *arg, ddev->atn_count++; - kfree_skb(resp); - return; + goto free_resp; } rc = nfc_tm_data_received(ddev->nfc_dev, resp); -- cgit v1.2.3 From 1a09c56f545c8ff8d338a38c7c40d79f4165a94c Mon Sep 17 00:00:00 2001 From: Thierry Escande Date: Fri, 8 Jul 2016 15:52:45 +0200 Subject: NFC: digital: Add support for NFC DEP Response Waiting Time When sending an ATR_REQ, the initiator must wait for the ATR_RES at least 'RWT(nfcdep,activation) + dRWT(nfcdep)' and no more than 'RWT(nfcdep,activation) + dRWT(nfcdep) + dT(nfcdep,initiator)'. This gives a timeout value between 1237 ms and 1337 ms. This patch defines DIGITAL_ATR_RES_RWT to 1337 used for the timeout value of ATR_REQ command. For other DEP PDUs, the initiator must wait between 'RWT + dRWT(nfcdep)' and 'RWT + dRWT(nfcdep) + dT(nfcdep,initiator)' where RWT is given by the following formula: '(256 * 16 / f(c)) * 2^wt' where wt is the value of the TO field in the ATR_RES response and is in the range between 0 and 14. This patch declares a mapping table for wt values and gives RWT max values between 100 ms and 5049 ms. This patch also defines DIGITAL_ATR_RES_TO_WT, the maximum wt value in target mode, to 8. Signed-off-by: Thierry Escande Signed-off-by: Samuel Ortiz --- net/nfc/digital_dep.c | 71 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 55 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c index ba52a5dbf3cc..6cf2eeb2e865 100644 --- a/net/nfc/digital_dep.c +++ b/net/nfc/digital_dep.c @@ -35,6 +35,8 @@ #define DIGITAL_ATR_REQ_MIN_SIZE 16 #define DIGITAL_ATR_REQ_MAX_SIZE 64 +#define DIGITAL_ATR_RES_TO_WT(s) ((s) & 0xF) + #define DIGITAL_DID_MAX 14 #define DIGITAL_PAYLOAD_SIZE_MAX 254 @@ -122,6 +124,37 @@ static const u8 digital_payload_bits_map[4] = { [3] = 254 }; +/* Response Waiting Time for ATR_RES PDU in ms + * + * RWT(ATR_RES) = RWT(nfcdep,activation) + dRWT(nfcdep) + dT(nfcdep,initiator) + * + * with: + * RWT(nfcdep,activation) = 4096 * 2^12 / f(c) s + * dRWT(nfcdep) = 16 / f(c) s + * dT(nfcdep,initiator) = 100 ms + * f(c) = 13560000 Hz + */ +#define DIGITAL_ATR_RES_RWT 1337 + +/* Response Waiting Time for other DEP PDUs in ms + * + * max_rwt = rwt + dRWT(nfcdep) + dT(nfcdep,initiator) + * + * with: + * rwt = (256 * 16 / f(c)) * 2^wt s + * dRWT(nfcdep) = 16 / f(c) s + * dT(nfcdep,initiator) = 100 ms + * f(c) = 13560000 Hz + * 0 <= wt <= 14 (given by the target by the TO field of ATR_RES response) + */ +#define DIGITAL_NFC_DEP_IN_MAX_WT 14 +#define DIGITAL_NFC_DEP_TG_MAX_WT 8 +static const u16 digital_rwt_map[DIGITAL_NFC_DEP_IN_MAX_WT + 1] = { + 100, 101, 101, 102, 105, + 110, 119, 139, 177, 255, + 409, 719, 1337, 2575, 5049, +}; + static u8 digital_payload_bits_to_size(u8 payload_bits) { if (payload_bits >= ARRAY_SIZE(digital_payload_bits_map)) @@ -366,8 +399,8 @@ static int digital_in_send_psl_req(struct nfc_digital_dev *ddev, ddev->skb_add_crc(skb); - rc = digital_in_send_cmd(ddev, skb, 500, digital_in_recv_psl_res, - target); + rc = digital_in_send_cmd(ddev, skb, ddev->dep_rwt, + digital_in_recv_psl_res, target); if (rc) kfree_skb(skb); @@ -380,6 +413,7 @@ static void digital_in_recv_atr_res(struct nfc_digital_dev *ddev, void *arg, struct nfc_target *target = arg; struct digital_atr_res *atr_res; u8 gb_len, payload_bits; + u8 wt; int rc; if (IS_ERR(resp)) { @@ -409,6 +443,11 @@ static void digital_in_recv_atr_res(struct nfc_digital_dev *ddev, void *arg, atr_res = (struct digital_atr_res *)resp->data; + wt = DIGITAL_ATR_RES_TO_WT(atr_res->to); + if (wt > DIGITAL_NFC_DEP_IN_MAX_WT) + wt = DIGITAL_NFC_DEP_IN_MAX_WT; + ddev->dep_rwt = digital_rwt_map[wt]; + payload_bits = DIGITAL_PAYLOAD_PP_TO_BITS(atr_res->pp); ddev->remote_payload_max = digital_payload_bits_to_size(payload_bits); @@ -490,8 +529,8 @@ int digital_in_send_atr_req(struct nfc_digital_dev *ddev, ddev->skb_add_crc(skb); - rc = digital_in_send_cmd(ddev, skb, 500, digital_in_recv_atr_res, - target); + rc = digital_in_send_cmd(ddev, skb, DIGITAL_ATR_RES_RWT, + digital_in_recv_atr_res, target); if (rc) kfree_skb(skb); @@ -524,8 +563,8 @@ static int digital_in_send_ack(struct nfc_digital_dev *ddev, ddev->saved_skb = pskb_copy(skb, GFP_KERNEL); - rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res, - data_exch); + rc = digital_in_send_cmd(ddev, skb, ddev->dep_rwt, + digital_in_recv_dep_res, data_exch); if (rc) { kfree_skb(skb); kfree_skb(ddev->saved_skb); @@ -559,8 +598,8 @@ static int digital_in_send_nack(struct nfc_digital_dev *ddev, ddev->skb_add_crc(skb); - rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res, - data_exch); + rc = digital_in_send_cmd(ddev, skb, ddev->dep_rwt, + digital_in_recv_dep_res, data_exch); if (rc) kfree_skb(skb); @@ -590,8 +629,8 @@ static int digital_in_send_atn(struct nfc_digital_dev *ddev, ddev->skb_add_crc(skb); - rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res, - data_exch); + rc = digital_in_send_cmd(ddev, skb, ddev->dep_rwt, + digital_in_recv_dep_res, data_exch); if (rc) kfree_skb(skb); @@ -624,8 +663,8 @@ static int digital_in_send_rtox(struct nfc_digital_dev *ddev, ddev->skb_add_crc(skb); - rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res, - data_exch); + rc = digital_in_send_cmd(ddev, skb, ddev->dep_rwt, + digital_in_recv_dep_res, data_exch); if (rc) kfree_skb(skb); @@ -642,7 +681,7 @@ static int digital_in_send_saved_skb(struct nfc_digital_dev *ddev, skb_get(ddev->saved_skb); - rc = digital_in_send_cmd(ddev, ddev->saved_skb, 1500, + rc = digital_in_send_cmd(ddev, ddev->saved_skb, ddev->dep_rwt, digital_in_recv_dep_res, data_exch); if (rc) kfree_skb(ddev->saved_skb); @@ -885,8 +924,8 @@ int digital_in_send_dep_req(struct nfc_digital_dev *ddev, ddev->saved_skb = pskb_copy(tmp_skb, GFP_KERNEL); - rc = digital_in_send_cmd(ddev, tmp_skb, 1500, digital_in_recv_dep_res, - data_exch); + rc = digital_in_send_cmd(ddev, tmp_skb, ddev->dep_rwt, + digital_in_recv_dep_res, data_exch); if (rc) { if (tmp_skb != skb) kfree_skb(tmp_skb); @@ -1465,7 +1504,7 @@ static int digital_tg_send_atr_res(struct nfc_digital_dev *ddev, atr_res->dir = DIGITAL_NFC_DEP_FRAME_DIR_IN; atr_res->cmd = DIGITAL_CMD_ATR_RES; memcpy(atr_res->nfcid3, atr_req->nfcid3, sizeof(atr_req->nfcid3)); - atr_res->to = 8; + atr_res->to = DIGITAL_NFC_DEP_TG_MAX_WT; ddev->local_payload_max = DIGITAL_PAYLOAD_SIZE_MAX; payload_bits = digital_payload_size_to_bits(ddev->local_payload_max); -- cgit v1.2.3 From d85a301c26621d3466956dc477c32c20c15a52ee Mon Sep 17 00:00:00 2001 From: Thierry Escande Date: Fri, 8 Jul 2016 15:52:46 +0200 Subject: NFC: digital: Fix RTOX supervisor PDU handling When the target needs more time to process the received PDU, it sends Response Timeout Extension (RTOX) PDU. When the initiator receives a RTOX PDU, it must reply with a RTOX PDU and extends the current rwt value with the formula: rwt_int = rwt * rtox This patch takes care of the rtox value passed by the target in the RTOX PDU and extends the timeout for the next response accordingly. Signed-off-by: Thierry Escande Signed-off-by: Samuel Ortiz --- net/nfc/digital_dep.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c index 6cf2eeb2e865..f864ce19e13d 100644 --- a/net/nfc/digital_dep.c +++ b/net/nfc/digital_dep.c @@ -65,6 +65,9 @@ #define DIGITAL_NFC_DEP_DID_BIT_SET(pfb) ((pfb) & DIGITAL_NFC_DEP_PFB_DID_BIT) #define DIGITAL_NFC_DEP_PFB_PNI(pfb) ((pfb) & 0x03) +#define DIGITAL_NFC_DEP_RTOX_VALUE(data) ((data) & 0x3F) +#define DIGITAL_NFC_DEP_RTOX_MAX 59 + #define DIGITAL_NFC_DEP_PFB_I_PDU 0x00 #define DIGITAL_NFC_DEP_PFB_ACK_NACK_PDU 0x40 #define DIGITAL_NFC_DEP_PFB_SUPERVISOR_PDU 0x80 @@ -643,6 +646,11 @@ static int digital_in_send_rtox(struct nfc_digital_dev *ddev, struct digital_dep_req_res *dep_req; struct sk_buff *skb; int rc; + u16 rwt_int; + + rwt_int = ddev->dep_rwt * rtox; + if (rwt_int > digital_rwt_map[DIGITAL_NFC_DEP_IN_MAX_WT]) + rwt_int = digital_rwt_map[DIGITAL_NFC_DEP_IN_MAX_WT]; skb = digital_skb_alloc(ddev, 1); if (!skb) @@ -663,7 +671,7 @@ static int digital_in_send_rtox(struct nfc_digital_dev *ddev, ddev->skb_add_crc(skb); - rc = digital_in_send_cmd(ddev, skb, ddev->dep_rwt, + rc = digital_in_send_cmd(ddev, skb, rwt_int, digital_in_recv_dep_res, data_exch); if (rc) kfree_skb(skb); @@ -697,6 +705,7 @@ static void digital_in_recv_dep_res(struct nfc_digital_dev *ddev, void *arg, u8 pfb; uint size; int rc; + u8 rtox; if (IS_ERR(resp)) { rc = PTR_ERR(resp); @@ -865,7 +874,20 @@ static void digital_in_recv_dep_res(struct nfc_digital_dev *ddev, void *arg, goto free_resp; } - rc = digital_in_send_rtox(ddev, data_exch, resp->data[0]); + if (ddev->atn_count || ddev->nack_count) { + PROTOCOL_ERR("14.12.4.4"); + rc = -EIO; + goto error; + } + + rtox = DIGITAL_NFC_DEP_RTOX_VALUE(resp->data[0]); + if (!rtox || rtox > DIGITAL_NFC_DEP_RTOX_MAX) { + PROTOCOL_ERR("14.8.4.1"); + rc = -EIO; + goto error; + } + + rc = digital_in_send_rtox(ddev, data_exch, rtox); if (rc) goto error; -- cgit v1.2.3 From 64b87639c9cbeb03e26bc65528416c961b1dde96 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 3 Jul 2016 13:18:43 +0800 Subject: netfilter: conntrack: fix race between nf_conntrack proc read and hash resize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we do "cat /proc/net/nf_conntrack", and meanwhile resize the conntrack hash table via /sys/module/nf_conntrack/parameters/hashsize, race will happen, because reader can observe a newly allocated hash but the old size (or vice versa). So oops will happen like follows: BUG: unable to handle kernel NULL pointer dereference at 0000000000000017 IP: [] seq_print_acct+0x11/0x50 [nf_conntrack] Call Trace: [] ? ct_seq_show+0x14e/0x340 [nf_conntrack] [] seq_read+0x2cc/0x390 [] proc_reg_read+0x42/0x70 [] __vfs_read+0x37/0x130 [] ? security_file_permission+0xa0/0xc0 [] vfs_read+0x95/0x140 [] SyS_read+0x55/0xc0 [] entry_SYSCALL_64_fastpath+0x1a/0xa4 It is very easy to reproduce this kernel crash. 1. open one shell and input the following cmds: while : ; do echo $RANDOM > /sys/module/nf_conntrack/parameters/hashsize done 2. open more shells and input the following cmds: while : ; do cat /proc/net/nf_conntrack done 3. just wait a monent, oops will happen soon. The solution in this patch is based on Florian's Commit 5e3c61f98175 ("netfilter: conntrack: fix lookup race during hash resize"). And add a wrapper function nf_conntrack_get_ht to get hash and hsize suggested by Florian Westphal. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c | 14 ++++++++++---- net/netfilter/nf_conntrack_core.c | 17 +++++++++++++++++ net/netfilter/nf_conntrack_standalone.c | 14 +++++++++----- 3 files changed, 36 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index c6f3c406f707..63923710f325 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -26,6 +26,8 @@ struct ct_iter_state { struct seq_net_private p; + struct hlist_nulls_head *hash; + unsigned int htable_size; unsigned int bucket; }; @@ -35,10 +37,10 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) struct hlist_nulls_node *n; for (st->bucket = 0; - st->bucket < nf_conntrack_htable_size; + st->bucket < st->htable_size; st->bucket++) { n = rcu_dereference( - hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket])); + hlist_nulls_first_rcu(&st->hash[st->bucket])); if (!is_a_nulls(n)) return n; } @@ -53,11 +55,11 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, head = rcu_dereference(hlist_nulls_next_rcu(head)); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= nf_conntrack_htable_size) + if (++st->bucket >= st->htable_size) return NULL; } head = rcu_dereference( - hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket])); + hlist_nulls_first_rcu(&st->hash[st->bucket])); } return head; } @@ -75,7 +77,11 @@ static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos) static void *ct_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { + struct ct_iter_state *st = seq->private; + rcu_read_lock(); + + nf_conntrack_get_ht(&st->hash, &st->htable_size); return ct_get_idx(seq, *pos); } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 153e33ffeeaa..1289e7e5e0de 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -460,6 +460,23 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, net_eq(net, nf_ct_net(ct)); } +/* must be called with rcu read lock held */ +void nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize) +{ + struct hlist_nulls_head *hptr; + unsigned int sequence, hsz; + + do { + sequence = read_seqcount_begin(&nf_conntrack_generation); + hsz = nf_conntrack_htable_size; + hptr = nf_conntrack_hash; + } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + + *hash = hptr; + *hsize = hsz; +} +EXPORT_SYMBOL_GPL(nf_conntrack_get_ht); + /* * Warning : * - Caller must take a reference on returned object diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 2aaa188ee961..958a1455ca7f 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -48,6 +48,8 @@ EXPORT_SYMBOL_GPL(print_tuple); struct ct_iter_state { struct seq_net_private p; + struct hlist_nulls_head *hash; + unsigned int htable_size; unsigned int bucket; u_int64_t time_now; }; @@ -58,9 +60,10 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) struct hlist_nulls_node *n; for (st->bucket = 0; - st->bucket < nf_conntrack_htable_size; + st->bucket < st->htable_size; st->bucket++) { - n = rcu_dereference(hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket])); + n = rcu_dereference( + hlist_nulls_first_rcu(&st->hash[st->bucket])); if (!is_a_nulls(n)) return n; } @@ -75,12 +78,11 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, head = rcu_dereference(hlist_nulls_next_rcu(head)); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= nf_conntrack_htable_size) + if (++st->bucket >= st->htable_size) return NULL; } head = rcu_dereference( - hlist_nulls_first_rcu( - &nf_conntrack_hash[st->bucket])); + hlist_nulls_first_rcu(&st->hash[st->bucket])); } return head; } @@ -102,6 +104,8 @@ static void *ct_seq_start(struct seq_file *seq, loff_t *pos) st->time_now = ktime_get_real_ns(); rcu_read_lock(); + + nf_conntrack_get_ht(&st->hash, &st->htable_size); return ct_get_idx(seq, *pos); } -- cgit v1.2.3 From 474803d37e7fb6291d22cb964014afe457ba5212 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 3 Jul 2016 13:18:44 +0800 Subject: netfilter: cttimeout: unlink timeout obj again when hash resize happen Imagine such situation, nf_conntrack_htable_size now is 4096, we are doing ctnl_untimeout, and iterate on 3000# bucket. Meanwhile, another user try to reduce hash size to 2048, then all nf_conn are removed to the new hashtable. When this hash resize operation finished, we still try to itreate ct begin from 3000# bucket, find nothing to do and just return. We may miss unlinking some timeout objects. And later we will end up with invalid references to timeout object that are already gone. So when we find that hash resize happened, try to unlink timeout objects from the 0# bucket again. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_cttimeout.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 3c84f14326f5..4cdcd969b64c 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -303,16 +303,24 @@ static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout) { struct nf_conntrack_tuple_hash *h; const struct hlist_nulls_node *nn; + unsigned int last_hsize; + spinlock_t *lock; int i; local_bh_disable(); - for (i = 0; i < nf_conntrack_htable_size; i++) { - nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); - if (i < nf_conntrack_htable_size) { - hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode) - untimeout(h, timeout); +restart: + last_hsize = nf_conntrack_htable_size; + for (i = 0; i < last_hsize; i++) { + lock = &nf_conntrack_locks[i % CONNTRACK_LOCKS]; + nf_conntrack_lock(lock); + if (last_hsize != nf_conntrack_htable_size) { + spin_unlock(lock); + goto restart; } - spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); + + hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode) + untimeout(h, timeout); + spin_unlock(lock); } local_bh_enable(); } -- cgit v1.2.3 From 8786a9716d028083f56f944996883f7d1a05919e Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 3 Jul 2016 13:18:45 +0800 Subject: netfilter: nf_ct_helper: unlink helper again when hash resize happen From: Liping Zhang Similar to ctnl_untimeout, when hash resize happened, we should try to do unhelp from the 0# bucket again. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_helper.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 3a1a88b9bafa..a4294e949cdc 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -409,6 +409,8 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) struct nf_conntrack_expect *exp; const struct hlist_node *next; const struct hlist_nulls_node *nn; + unsigned int last_hsize; + spinlock_t *lock; struct net *net; unsigned int i; @@ -446,13 +448,18 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) rtnl_unlock(); local_bh_disable(); - for (i = 0; i < nf_conntrack_htable_size; i++) { - nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); - if (i < nf_conntrack_htable_size) { - hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode) - unhelp(h, me); +restart: + last_hsize = nf_conntrack_htable_size; + for (i = 0; i < last_hsize; i++) { + lock = &nf_conntrack_locks[i % CONNTRACK_LOCKS]; + nf_conntrack_lock(lock); + if (last_hsize != nf_conntrack_htable_size) { + spin_unlock(lock); + goto restart; } - spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); + hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode) + unhelp(h, me); + spin_unlock(lock); } local_bh_enable(); } -- cgit v1.2.3 From 242922a027176cd260c5adce4ba6bbfa3a05190c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 3 Jul 2016 20:44:01 +0200 Subject: netfilter: conntrack: simplify early_drop We don't need to acquire the bucket lock during early drop, we can use lockless traveral just like ____nf_conntrack_find. The timer deletion serves as synchronization point, if another cpu attempts to evict same entry, only one will succeed with timer deletion. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 95 +++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 48 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 1289e7e5e0de..e0e9c9a0f5ba 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -834,67 +834,66 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken); /* There's a small race here where we may free a just-assured connection. Too bad: we're in trouble anyway. */ -static noinline int early_drop(struct net *net, unsigned int _hash) +static unsigned int early_drop_list(struct net *net, + struct hlist_nulls_head *head) { - /* Use oldest entry, which is roughly LRU */ struct nf_conntrack_tuple_hash *h; - struct nf_conn *tmp; struct hlist_nulls_node *n; - unsigned int i, hash, sequence; - struct nf_conn *ct = NULL; - spinlock_t *lockp; - bool ret = false; + unsigned int drops = 0; + struct nf_conn *tmp; - i = 0; + hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) { + tmp = nf_ct_tuplehash_to_ctrack(h); - local_bh_disable(); -restart: - sequence = read_seqcount_begin(&nf_conntrack_generation); - for (; i < NF_CT_EVICTION_RANGE; i++) { - hash = scale_hash(_hash++); - lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS]; - nf_conntrack_lock(lockp); - if (read_seqcount_retry(&nf_conntrack_generation, sequence)) { - spin_unlock(lockp); - goto restart; - } - hlist_nulls_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], - hnnode) { - tmp = nf_ct_tuplehash_to_ctrack(h); - - if (test_bit(IPS_ASSURED_BIT, &tmp->status) || - !net_eq(nf_ct_net(tmp), net) || - nf_ct_is_dying(tmp)) - continue; - - if (atomic_inc_not_zero(&tmp->ct_general.use)) { - ct = tmp; - break; - } - } + if (test_bit(IPS_ASSURED_BIT, &tmp->status) || + !net_eq(nf_ct_net(tmp), net) || + nf_ct_is_dying(tmp)) + continue; - spin_unlock(lockp); - if (ct) - break; + if (!atomic_inc_not_zero(&tmp->ct_general.use)) + continue; + + /* kill only if still in same netns -- might have moved due to + * SLAB_DESTROY_BY_RCU rules. + * + * We steal the timer reference. If that fails timer has + * already fired or someone else deleted it. Just drop ref + * and move to next entry. + */ + if (net_eq(nf_ct_net(tmp), net) && + nf_ct_is_confirmed(tmp) && + del_timer(&tmp->timeout) && + nf_ct_delete(tmp, 0, 0)) + drops++; + + nf_ct_put(tmp); } - local_bh_enable(); + return drops; +} - if (!ct) - return false; +static noinline int early_drop(struct net *net, unsigned int _hash) +{ + unsigned int i; - /* kill only if in same netns -- might have moved due to - * SLAB_DESTROY_BY_RCU rules - */ - if (net_eq(nf_ct_net(ct), net) && del_timer(&ct->timeout)) { - if (nf_ct_delete(ct, 0, 0)) { - NF_CT_STAT_INC_ATOMIC(net, early_drop); - ret = true; + for (i = 0; i < NF_CT_EVICTION_RANGE; i++) { + struct hlist_nulls_head *ct_hash; + unsigned hash, sequence, drops; + + do { + sequence = read_seqcount_begin(&nf_conntrack_generation); + hash = scale_hash(_hash++); + ct_hash = nf_conntrack_hash; + } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + + drops = early_drop_list(net, &ct_hash[hash]); + if (drops) { + NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops); + return true; } } - nf_ct_put(ct); - return ret; + return false; } static struct nf_conn * -- cgit v1.2.3 From 7c9664351980aaa6a4b8837a314360b3a4ad382a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 5 Jul 2016 12:07:23 +0200 Subject: netfilter: move nat hlist_head to nf_conn The nat extension structure is 32bytes in size on x86_64: struct nf_conn_nat { struct hlist_node bysource; /* 0 16 */ struct nf_conn * ct; /* 16 8 */ union nf_conntrack_nat_help help; /* 24 4 */ int masq_index; /* 28 4 */ /* size: 32, cachelines: 1, members: 4 */ /* last cacheline: 32 bytes */ }; The hlist is needed to quickly check for possible tuple collisions when installing a new nat binding. Storing this in the extension area has two drawbacks: 1. We need ct backpointer to get the conntrack struct from the extension. 2. When reallocation of extension area occurs we need to fixup the bysource hash head via hlist_replace_rcu. We can avoid both by placing the hlist_head in nf_conn and place nf_conn in the bysource hash rather than the extenstion. We can also remove the ->move support; no other extension needs it. Moving the entire nat extension into nf_conn would be possible as well but then we have to add yet another callback for deletion from the bysource hash table rather than just using nat extension ->destroy hook for this. nf_conn size doesn't increase due to aligment, followup patch replaces hlist_node with single pointer. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_extend.c | 15 ++------------- net/netfilter/nf_nat_core.c | 33 +++++++-------------------------- 2 files changed, 9 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index 1a9545965c0d..02bcf00c2492 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -73,7 +73,7 @@ void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id, size_t var_alloc_len, gfp_t gfp) { struct nf_ct_ext *old, *new; - int i, newlen, newoff; + int newlen, newoff; struct nf_ct_ext_type *t; /* Conntrack must not be confirmed to avoid races on reallocation. */ @@ -99,19 +99,8 @@ void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id, return NULL; if (new != old) { - for (i = 0; i < NF_CT_EXT_NUM; i++) { - if (!__nf_ct_ext_exist(old, i)) - continue; - - rcu_read_lock(); - t = rcu_dereference(nf_ct_ext_types[i]); - if (t && t->move) - t->move((void *)new + new->offset[i], - (void *)old + old->offset[i]); - rcu_read_unlock(); - } kfree_rcu(old, rcu); - ct->ext = new; + rcu_assign_pointer(ct->ext, new); } new->offset[id] = newoff; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 6877a396f8fc..692534701426 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -198,11 +198,9 @@ find_appropriate_src(struct net *net, const struct nf_nat_range *range) { unsigned int h = hash_by_src(net, tuple); - const struct nf_conn_nat *nat; const struct nf_conn *ct; - hlist_for_each_entry_rcu(nat, &nf_nat_bysource[h], bysource) { - ct = nat->ct; + hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { if (same_src(ct, tuple) && net_eq(net, nf_ct_net(ct)) && nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { @@ -435,8 +433,7 @@ nf_nat_setup_info(struct nf_conn *ct, spin_lock_bh(&nf_nat_lock); /* nf_conntrack_alter_reply might re-allocate extension aera */ nat = nfct_nat(ct); - nat->ct = ct; - hlist_add_head_rcu(&nat->bysource, + hlist_add_head_rcu(&ct->nat_bysource, &nf_nat_bysource[srchash]); spin_unlock_bh(&nf_nat_lock); } @@ -543,7 +540,7 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) if (nf_nat_proto_remove(ct, data)) return 1; - if (!nat || !nat->ct) + if (!nat) return 0; /* This netns is being destroyed, and conntrack has nat null binding. @@ -556,9 +553,8 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) return 1; spin_lock_bh(&nf_nat_lock); - hlist_del_rcu(&nat->bysource); + hlist_del_rcu(&ct->nat_bysource); ct->status &= ~IPS_NAT_DONE_MASK; - nat->ct = NULL; spin_unlock_bh(&nf_nat_lock); add_timer(&ct->timeout); @@ -688,27 +684,13 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT); - if (nat == NULL || nat->ct == NULL) + if (!nat) return; - NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE); - - spin_lock_bh(&nf_nat_lock); - hlist_del_rcu(&nat->bysource); - spin_unlock_bh(&nf_nat_lock); -} - -static void nf_nat_move_storage(void *new, void *old) -{ - struct nf_conn_nat *new_nat = new; - struct nf_conn_nat *old_nat = old; - struct nf_conn *ct = old_nat->ct; - - if (!ct || !(ct->status & IPS_SRC_NAT_DONE)) - return; + NF_CT_ASSERT(ct->status & IPS_SRC_NAT_DONE); spin_lock_bh(&nf_nat_lock); - hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource); + hlist_del_rcu(&ct->nat_bysource); spin_unlock_bh(&nf_nat_lock); } @@ -716,7 +698,6 @@ static struct nf_ct_ext_type nat_extend __read_mostly = { .len = sizeof(struct nf_conn_nat), .align = __alignof__(struct nf_conn_nat), .destroy = nf_nat_cleanup_conntrack, - .move = nf_nat_move_storage, .id = NF_CT_EXT_NAT, .flags = NF_CT_EXT_F_PREALLOC, }; -- cgit v1.2.3 From 870190a9ec9075205c0fa795a09fa931694a3ff1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 5 Jul 2016 12:07:24 +0200 Subject: netfilter: nat: convert nat bysrc hash to rhashtable It did use a fixed-size bucket list plus single lock to protect add/del. Unlike the main conntrack table we only need to add and remove keys. Convert it to rhashtable to get table autosizing and per-bucket locking. The maximum number of entries is -- as before -- tied to the number of conntracks so we do not need another upperlimit. The change does not handle rhashtable_remove_fast error, only possible "error" is -ENOENT, and that is something that can happen legitimetely, e.g. because nat module was inserted at a later time and no src manip took place yet. Tested with http-client-benchmark + httpterm with DNAT and SNAT rules in place. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 126 ++++++++++++++++++++++++-------------------- 1 file changed, 68 insertions(+), 58 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 692534701426..de31818417b8 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -30,17 +30,19 @@ #include #include -static DEFINE_SPINLOCK(nf_nat_lock); - static DEFINE_MUTEX(nf_nat_proto_mutex); static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] __read_mostly; static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] __read_mostly; -static struct hlist_head *nf_nat_bysource __read_mostly; -static unsigned int nf_nat_htable_size __read_mostly; -static unsigned int nf_nat_hash_rnd __read_mostly; +struct nf_nat_conn_key { + const struct net *net; + const struct nf_conntrack_tuple *tuple; + const struct nf_conntrack_zone *zone; +}; + +static struct rhashtable nf_nat_bysource_table; inline const struct nf_nat_l3proto * __nf_nat_l3proto_find(u8 family) @@ -119,19 +121,17 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) EXPORT_SYMBOL(nf_xfrm_me_harder); #endif /* CONFIG_XFRM */ -/* We keep an extra hash for each conntrack, for fast searching. */ -static inline unsigned int -hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) +static u32 nf_nat_bysource_hash(const void *data, u32 len, u32 seed) { - unsigned int hash; - - get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); + const struct nf_conntrack_tuple *t; + const struct nf_conn *ct = data; + t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; /* Original src, to ensure we map it consistently if poss. */ - hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), - tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n)); - return reciprocal_scale(hash, nf_nat_htable_size); + seed ^= net_hash_mix(nf_ct_net(ct)); + return jhash2((const u32 *)&t->src, sizeof(t->src) / sizeof(u32), + t->dst.protonum ^ seed); } /* Is this tuple already taken? (not by us) */ @@ -187,6 +187,26 @@ same_src(const struct nf_conn *ct, t->src.u.all == tuple->src.u.all); } +static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg, + const void *obj) +{ + const struct nf_nat_conn_key *key = arg->key; + const struct nf_conn *ct = obj; + + return same_src(ct, key->tuple) && + net_eq(nf_ct_net(ct), key->net) && + nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL); +} + +static struct rhashtable_params nf_nat_bysource_params = { + .head_offset = offsetof(struct nf_conn, nat_bysource), + .obj_hashfn = nf_nat_bysource_hash, + .obj_cmpfn = nf_nat_bysource_cmp, + .nelem_hint = 256, + .min_size = 1024, + .nulls_base = (1U << RHT_BASE_SHIFT), +}; + /* Only called for SRC manip */ static int find_appropriate_src(struct net *net, @@ -197,23 +217,23 @@ find_appropriate_src(struct net *net, struct nf_conntrack_tuple *result, const struct nf_nat_range *range) { - unsigned int h = hash_by_src(net, tuple); const struct nf_conn *ct; + struct nf_nat_conn_key key = { + .net = net, + .tuple = tuple, + .zone = zone + }; - hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { - if (same_src(ct, tuple) && - net_eq(net, nf_ct_net(ct)) && - nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { - /* Copy source part from reply tuple. */ - nf_ct_invert_tuplepr(result, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - result->dst = tuple->dst; - - if (in_range(l3proto, l4proto, result, range)) - return 1; - } - } - return 0; + ct = rhashtable_lookup_fast(&nf_nat_bysource_table, &key, + nf_nat_bysource_params); + if (!ct) + return 0; + + nf_ct_invert_tuplepr(result, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + result->dst = tuple->dst; + + return in_range(l3proto, l4proto, result, range); } /* For [FUTURE] fragmentation handling, we want the least-used @@ -385,7 +405,6 @@ nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype) { - struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; struct nf_conn_nat *nat; @@ -426,16 +445,13 @@ nf_nat_setup_info(struct nf_conn *ct, } if (maniptype == NF_NAT_MANIP_SRC) { - unsigned int srchash; - - srchash = hash_by_src(net, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - spin_lock_bh(&nf_nat_lock); - /* nf_conntrack_alter_reply might re-allocate extension aera */ - nat = nfct_nat(ct); - hlist_add_head_rcu(&ct->nat_bysource, - &nf_nat_bysource[srchash]); - spin_unlock_bh(&nf_nat_lock); + int err; + + err = rhashtable_insert_fast(&nf_nat_bysource_table, + &ct->nat_bysource, + nf_nat_bysource_params); + if (err) + return NF_DROP; } /* It's done. */ @@ -552,10 +568,10 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) if (!del_timer(&ct->timeout)) return 1; - spin_lock_bh(&nf_nat_lock); - hlist_del_rcu(&ct->nat_bysource); ct->status &= ~IPS_NAT_DONE_MASK; - spin_unlock_bh(&nf_nat_lock); + + rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource, + nf_nat_bysource_params); add_timer(&ct->timeout); @@ -687,11 +703,8 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) if (!nat) return; - NF_CT_ASSERT(ct->status & IPS_SRC_NAT_DONE); - - spin_lock_bh(&nf_nat_lock); - hlist_del_rcu(&ct->nat_bysource); - spin_unlock_bh(&nf_nat_lock); + rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource, + nf_nat_bysource_params); } static struct nf_ct_ext_type nat_extend __read_mostly = { @@ -826,16 +839,13 @@ static int __init nf_nat_init(void) { int ret; - /* Leave them the same for the moment. */ - nf_nat_htable_size = nf_conntrack_htable_size; - - nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); - if (!nf_nat_bysource) - return -ENOMEM; + ret = rhashtable_init(&nf_nat_bysource_table, &nf_nat_bysource_params); + if (ret) + return ret; ret = nf_ct_extend_register(&nat_extend); if (ret < 0) { - nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); + rhashtable_destroy(&nf_nat_bysource_table); printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); return ret; } @@ -859,7 +869,7 @@ static int __init nf_nat_init(void) return 0; cleanup_extend: - nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); + rhashtable_destroy(&nf_nat_bysource_table); nf_ct_extend_unregister(&nat_extend); return ret; } @@ -877,8 +887,8 @@ static void __exit nf_nat_cleanup(void) #endif for (i = 0; i < NFPROTO_NUMPROTO; i++) kfree(nf_nat_l4protos[i]); - synchronize_net(); - nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); + + rhashtable_destroy(&nf_nat_bysource_table); } MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 47c74456257d2e50ace8c473d358cf4b9c0a912f Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 5 Jul 2016 20:55:36 +0800 Subject: netfilter: physdev: physdev-is-out should not work with OUTPUT chain physdev_mt() will check skb->nf_bridge first, which was alloced in br_nf_pre_routing. So if we want to use --physdev-out and physdev-is-out, we need to match it in FORWARD or POSTROUTING chain. physdev_mt_check() only checked physdev-out and missed physdev-is-out. Fix it and update the debug message to make it clearer. Signed-off-by: Hangbin Liu Reviewed-by: Marcelo R Leitner Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_physdev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index 1caaccbc306c..e5f18988aee0 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -102,14 +102,14 @@ static int physdev_mt_check(const struct xt_mtchk_param *par) if (!(info->bitmask & XT_PHYSDEV_OP_MASK) || info->bitmask & ~XT_PHYSDEV_OP_MASK) return -EINVAL; - if (info->bitmask & XT_PHYSDEV_OP_OUT && + if (info->bitmask & (XT_PHYSDEV_OP_OUT | XT_PHYSDEV_OP_ISOUT) && (!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) || info->invert & XT_PHYSDEV_OP_BRIDGED) && par->hook_mask & ((1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | (1 << NF_INET_POST_ROUTING))) { - pr_info("using --physdev-out in the OUTPUT, FORWARD and " - "POSTROUTING chains for non-bridged traffic is not " - "supported anymore.\n"); + pr_info("using --physdev-out and --physdev-is-out are only" + "supported in the FORWARD and POSTROUTING chains with" + "bridged traffic.\n"); if (par->hook_mask & (1 << NF_INET_LOCAL_OUT)) return -EINVAL; } -- cgit v1.2.3 From 3f8b61b7f9aea414d162821817d89a7a6aae41c3 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Tue, 5 Jul 2016 23:23:00 +0800 Subject: netfilter: nft_ct: make byte/packet expr more friendly If we want to use ct packets expr, and add a rule like follows: # nft add rule filter input ct packets gt 1 counter We will find that no packets will hit it, because nf_conntrack_acct is disabled by default. So It will not work until we enable it manually via "echo 1 > /proc/sys/net/netfilter/nf_conntrack_acct". This is not friendly, so like xt_connbytes do, if the user want to use ct byte/packet expr, enable nf_conntrack_acct automatically. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_ct.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 137e308d5b24..7ce8fd7ace78 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -355,6 +355,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, if (err < 0) return err; + if (priv->key == NFT_CT_BYTES || priv->key == NFT_CT_PKTS) + nf_ct_set_acct(ctx->net, true); + return 0; } -- cgit v1.2.3 From 42a55769132fdf4f44bac1471b371d7f80bcde35 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 8 Jul 2016 14:41:49 +0200 Subject: netfilter: nf_tables: get rid of possible_net_t from set and basechain We can pass the netns pointer as parameter to the functions that need to gain access to it. From basechains, I didn't find any client for this field anymore so let's remove this too. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 10 ++++------ net/netfilter/nft_hash.c | 20 ++++++++++---------- net/netfilter/nft_lookup.c | 2 +- net/netfilter/nft_rbtree.c | 26 ++++++++++++++------------ 4 files changed, 29 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 18b7f8578ee0..0211eaec9060 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1405,7 +1405,6 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, rcu_assign_pointer(basechain->stats, stats); } - write_pnet(&basechain->pnet, net); basechain->type = type; chain = &basechain->chain; @@ -2841,7 +2840,6 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, } INIT_LIST_HEAD(&set->bindings); - write_pnet(&set->pnet, net); set->ops = ops; set->ktype = ktype; set->klen = desc.klen; @@ -3520,7 +3518,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err4; ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; - err = set->ops->insert(set, &elem); + err = set->ops->insert(ctx->net, set, &elem); if (err < 0) goto err5; @@ -3644,7 +3642,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, goto err3; } - priv = set->ops->deactivate(set, &elem); + priv = set->ops->deactivate(ctx->net, set, &elem); if (priv == NULL) { err = -ENOENT; goto err4; @@ -4018,7 +4016,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) case NFT_MSG_NEWSETELEM: te = (struct nft_trans_elem *)trans->data; - te->set->ops->activate(te->set, &te->elem); + te->set->ops->activate(net, te->set, &te->elem); nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, NFT_MSG_NEWSETELEM, 0); @@ -4143,7 +4141,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) case NFT_MSG_DELSETELEM: te = (struct nft_trans_elem *)trans->data; - te->set->ops->activate(te->set, &te->elem); + te->set->ops->activate(net, te->set, &te->elem); te->set->ndeact--; nft_trans_destroy(trans); diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index ea924816b7b8..564fa7929ed5 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -71,13 +71,13 @@ static inline int nft_hash_cmp(struct rhashtable_compare_arg *arg, return 0; } -static bool nft_hash_lookup(const struct nft_set *set, const u32 *key, - const struct nft_set_ext **ext) +static bool nft_hash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) { struct nft_hash *priv = nft_set_priv(set); const struct nft_hash_elem *he; struct nft_hash_cmp_arg arg = { - .genmask = nft_genmask_cur(read_pnet(&set->pnet)), + .genmask = nft_genmask_cur(net), .set = set, .key = key, }; @@ -125,13 +125,13 @@ err1: return false; } -static int nft_hash_insert(const struct nft_set *set, +static int nft_hash_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_hash *priv = nft_set_priv(set); struct nft_hash_elem *he = elem->priv; struct nft_hash_cmp_arg arg = { - .genmask = nft_genmask_next(read_pnet(&set->pnet)), + .genmask = nft_genmask_next(net), .set = set, .key = elem->key.val.data, }; @@ -140,20 +140,20 @@ static int nft_hash_insert(const struct nft_set *set, nft_hash_params); } -static void nft_hash_activate(const struct nft_set *set, +static void nft_hash_activate(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_hash_elem *he = elem->priv; - nft_set_elem_change_active(set, &he->ext); + nft_set_elem_change_active(net, set, &he->ext); nft_set_elem_clear_busy(&he->ext); } -static void *nft_hash_deactivate(const struct nft_set *set, +static void *nft_hash_deactivate(const struct net *net, + const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_hash *priv = nft_set_priv(set); - struct net *net = read_pnet(&set->pnet); struct nft_hash_elem *he; struct nft_hash_cmp_arg arg = { .genmask = nft_genmask_next(net), @@ -166,7 +166,7 @@ static void *nft_hash_deactivate(const struct nft_set *set, if (he != NULL) { if (!nft_set_elem_mark_busy(&he->ext) || !nft_is_active(net, &he->ext)) - nft_set_elem_change_active(set, &he->ext); + nft_set_elem_change_active(net, set, &he->ext); else he = NULL; } diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index b8d18f598569..e164325d1bc0 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -35,7 +35,7 @@ static void nft_lookup_eval(const struct nft_expr *expr, const struct nft_set_ext *ext; bool found; - found = set->ops->lookup(set, ®s->data[priv->sreg], &ext) ^ + found = set->ops->lookup(pkt->net, set, ®s->data[priv->sreg], &ext) ^ priv->invert; if (!found) { diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c index c0f638745adc..6473936d05c6 100644 --- a/net/netfilter/nft_rbtree.c +++ b/net/netfilter/nft_rbtree.c @@ -41,13 +41,13 @@ static bool nft_rbtree_equal(const struct nft_set *set, const void *this, return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0; } -static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key, - const struct nft_set_ext **ext) +static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) { const struct nft_rbtree *priv = nft_set_priv(set); const struct nft_rbtree_elem *rbe, *interval = NULL; + u8 genmask = nft_genmask_cur(net); const struct rb_node *parent; - u8 genmask = nft_genmask_cur(read_pnet(&set->pnet)); const void *this; int d; @@ -93,13 +93,13 @@ out: return false; } -static int __nft_rbtree_insert(const struct nft_set *set, +static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *new) { struct nft_rbtree *priv = nft_set_priv(set); + u8 genmask = nft_genmask_next(net); struct nft_rbtree_elem *rbe; struct rb_node *parent, **p; - u8 genmask = nft_genmask_next(read_pnet(&set->pnet)); int d; parent = NULL; @@ -132,14 +132,14 @@ static int __nft_rbtree_insert(const struct nft_set *set, return 0; } -static int nft_rbtree_insert(const struct nft_set *set, +static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_rbtree_elem *rbe = elem->priv; int err; spin_lock_bh(&nft_rbtree_lock); - err = __nft_rbtree_insert(set, rbe); + err = __nft_rbtree_insert(net, set, rbe); spin_unlock_bh(&nft_rbtree_lock); return err; @@ -156,21 +156,23 @@ static void nft_rbtree_remove(const struct nft_set *set, spin_unlock_bh(&nft_rbtree_lock); } -static void nft_rbtree_activate(const struct nft_set *set, +static void nft_rbtree_activate(const struct net *net, + const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_rbtree_elem *rbe = elem->priv; - nft_set_elem_change_active(set, &rbe->ext); + nft_set_elem_change_active(net, set, &rbe->ext); } -static void *nft_rbtree_deactivate(const struct nft_set *set, +static void *nft_rbtree_deactivate(const struct net *net, + const struct nft_set *set, const struct nft_set_elem *elem) { const struct nft_rbtree *priv = nft_set_priv(set); const struct rb_node *parent = priv->root.rb_node; struct nft_rbtree_elem *rbe, *this = elem->priv; - u8 genmask = nft_genmask_next(read_pnet(&set->pnet)); + u8 genmask = nft_genmask_next(net); int d; while (parent != NULL) { @@ -196,7 +198,7 @@ static void *nft_rbtree_deactivate(const struct nft_set *set, parent = parent->rb_right; continue; } - nft_set_elem_change_active(set, &rbe->ext); + nft_set_elem_change_active(net, set, &rbe->ext); return rbe; } } -- cgit v1.2.3 From c2b9b4fee8ab86f2bb657e5ac48d803879e92765 Mon Sep 17 00:00:00 2001 From: Toby DiPasquale Date: Mon, 11 Jul 2016 11:32:45 +0100 Subject: netfilter: nf_conntrack_h323: fix off-by-one in DecodeQ931 This patch corrects an off-by-one error in the DecodeQ931 function in the nf_conntrack_h323 module. This error could result in reading off the end of a Q.931 frame. Signed-off-by: Toby DiPasquale Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_h323_asn1.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c index bcd5ed6b7130..89b2e46925c4 100644 --- a/net/netfilter/nf_conntrack_h323_asn1.c +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -846,9 +846,10 @@ int DecodeQ931(unsigned char *buf, size_t sz, Q931 *q931) sz -= len; /* Message Type */ - if (sz < 1) + if (sz < 2) return H323_ERROR_BOUND; q931->MessageType = *p++; + sz--; PRINT("MessageType = %02X\n", q931->MessageType); if (*p & 0x80) { p++; -- cgit v1.2.3 From 28aa4c26fce2202db8d42ae76b639ca1d9a23d25 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 9 Jul 2016 19:47:40 +0800 Subject: sctp: add SCTP_PR_SUPPORTED on sctp sockopt According to section 4.5 of rfc7496, prsctp_enable should be per asoc. We will add prsctp_enable to both asoc and ep, and replace the places where it used net.sctp->prsctp_enable with asoc->prsctp_enable. ep->prsctp_enable will be initialized with net.sctp->prsctp_enable, and asoc->prsctp_enable will be initialized with ep->prsctp_enable. We can also modify it's value through sockopt SCTP_PR_SUPPORTED. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/associola.c | 1 + net/sctp/endpointola.c | 1 + net/sctp/sm_make_chunk.c | 12 ++++---- net/sctp/socket.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 88 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index e1849f3714ad..1c23060c41a6 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -268,6 +268,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a goto fail_init; asoc->active_key_id = ep->active_key_id; + asoc->prsctp_enable = ep->prsctp_enable; /* Save the hmacs and chunks list into this association */ if (ep->auth_hmacs_list) diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 9d494e35e7f9..1f03065686fe 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -163,6 +163,7 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, */ ep->auth_hmacs_list = auth_hmacs; ep->auth_chunk_list = auth_chunks; + ep->prsctp_enable = net->sctp.prsctp_enable; return ep; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 56f364d8f932..0e3045ef57fa 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -261,7 +261,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, chunksize += WORD_ROUND(SCTP_SAT_LEN(num_types)); chunksize += sizeof(ecap_param); - if (net->sctp.prsctp_enable) + if (asoc->prsctp_enable) chunksize += sizeof(prsctp_param); /* ADDIP: Section 4.2.7: @@ -355,7 +355,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, sctp_addto_param(retval, num_ext, extensions); } - if (net->sctp.prsctp_enable) + if (asoc->prsctp_enable) sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param); if (sp->adaptation_ind) { @@ -2024,8 +2024,8 @@ static void sctp_process_ext_param(struct sctp_association *asoc, for (i = 0; i < num_ext; i++) { switch (param.ext->chunks[i]) { case SCTP_CID_FWD_TSN: - if (net->sctp.prsctp_enable && !asoc->peer.prsctp_capable) - asoc->peer.prsctp_capable = 1; + if (asoc->prsctp_enable && !asoc->peer.prsctp_capable) + asoc->peer.prsctp_capable = 1; break; case SCTP_CID_AUTH: /* if the peer reports AUTH, assume that he @@ -2169,7 +2169,7 @@ static sctp_ierror_t sctp_verify_param(struct net *net, break; case SCTP_PARAM_FWD_TSN_SUPPORT: - if (net->sctp.prsctp_enable) + if (ep->prsctp_enable) break; goto fallthrough; @@ -2653,7 +2653,7 @@ do_addr_param: break; case SCTP_PARAM_FWD_TSN_SUPPORT: - if (net->sctp.prsctp_enable) { + if (asoc->prsctp_enable) { asoc->peer.prsctp_capable = 1; break; } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index cdabbd8219b1..7460ddebd9ce 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3661,6 +3661,39 @@ static int sctp_setsockopt_recvnxtinfo(struct sock *sk, return 0; } +static int sctp_setsockopt_pr_supported(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EINVAL; + + if (optlen != sizeof(params)) + goto out; + + if (copy_from_user(¶ms, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (asoc) { + asoc->prsctp_enable = !!params.assoc_value; + } else if (!params.assoc_id) { + struct sctp_sock *sp = sctp_sk(sk); + + sp->ep->prsctp_enable = !!params.assoc_value; + } else { + goto out; + } + + retval = 0; + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -3821,6 +3854,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_RECVNXTINFO: retval = sctp_setsockopt_recvnxtinfo(sk, optval, optlen); break; + case SCTP_PR_SUPPORTED: + retval = sctp_setsockopt_pr_supported(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -6166,6 +6202,47 @@ static int sctp_getsockopt_recvnxtinfo(struct sock *sk, int len, return 0; } +static int sctp_getsockopt_pr_supported(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (asoc) { + params.assoc_value = asoc->prsctp_enable; + } else if (!params.assoc_id) { + struct sctp_sock *sp = sctp_sk(sk); + + params.assoc_value = sp->ep->prsctp_enable; + } else { + retval = -EINVAL; + goto out; + } + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, ¶ms, len)) + goto out; + + retval = 0; + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -6319,6 +6396,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, case SCTP_RECVNXTINFO: retval = sctp_getsockopt_recvnxtinfo(sk, len, optval, optlen); break; + case SCTP_PR_SUPPORTED: + retval = sctp_getsockopt_pr_supported(sk, len, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; -- cgit v1.2.3 From f959fb442c35f4b61fea341401b8463dd0a1b959 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 9 Jul 2016 19:47:41 +0800 Subject: sctp: add SCTP_DEFAULT_PRINFO into sctp sockopt This patch adds SCTP_DEFAULT_PRINFO to sctp sockopt. It is used to set/get sctp Partially Reliable Policies' default params, which includes 3 policies (ttl, rtx, prio) and their values. Still, if we set policy params in sndinfo, we will use the params of sndinfo against chunks, instead of the default params. In this patch, we will use 5-8bit of sp/asoc->default_flags to store prsctp policies, and reuse asoc->default_timetolive to store their values. It means if we enable and set prsctp policy, prior ttl timeout in sctp will not work any more. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/socket.c | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 7460ddebd9ce..c03fe1b76706 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3694,6 +3694,47 @@ out: return retval; } +static int sctp_setsockopt_default_prinfo(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_default_prinfo info; + struct sctp_association *asoc; + int retval = -EINVAL; + + if (optlen != sizeof(info)) + goto out; + + if (copy_from_user(&info, optval, sizeof(info))) { + retval = -EFAULT; + goto out; + } + + if (info.pr_policy & ~SCTP_PR_SCTP_MASK) + goto out; + + if (info.pr_policy == SCTP_PR_SCTP_NONE) + info.pr_value = 0; + + asoc = sctp_id2assoc(sk, info.pr_assoc_id); + if (asoc) { + SCTP_PR_SET_POLICY(asoc->default_flags, info.pr_policy); + asoc->default_timetolive = info.pr_value; + } else if (!info.pr_assoc_id) { + struct sctp_sock *sp = sctp_sk(sk); + + SCTP_PR_SET_POLICY(sp->default_flags, info.pr_policy); + sp->default_timetolive = info.pr_value; + } else { + goto out; + } + + retval = 0; + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -3857,6 +3898,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_PR_SUPPORTED: retval = sctp_setsockopt_pr_supported(sk, optval, optlen); break; + case SCTP_DEFAULT_PRINFO: + retval = sctp_setsockopt_default_prinfo(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -6243,6 +6287,49 @@ out: return retval; } +static int sctp_getsockopt_default_prinfo(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_default_prinfo info; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(info)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(info); + if (copy_from_user(&info, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, info.pr_assoc_id); + if (asoc) { + info.pr_policy = SCTP_PR_POLICY(asoc->default_flags); + info.pr_value = asoc->default_timetolive; + } else if (!info.pr_assoc_id) { + struct sctp_sock *sp = sctp_sk(sk); + + info.pr_policy = SCTP_PR_POLICY(sp->default_flags); + info.pr_value = sp->default_timetolive; + } else { + retval = -EINVAL; + goto out; + } + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, &info, len)) + goto out; + + retval = 0; + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -6399,6 +6486,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, case SCTP_PR_SUPPORTED: retval = sctp_getsockopt_pr_supported(sk, len, optval, optlen); break; + case SCTP_DEFAULT_PRINFO: + retval = sctp_getsockopt_default_prinfo(sk, len, optval, + optlen); + break; default: retval = -ENOPROTOOPT; break; -- cgit v1.2.3 From 826d253d57b11f69add81c8086d2e7f1dce5ec77 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 9 Jul 2016 19:47:42 +0800 Subject: sctp: add SCTP_PR_ASSOC_STATUS on sctp sockopt This patch adds SCTP_PR_ASSOC_STATUS to sctp sockopt, which is used to dump the prsctp statistics info from the asoc. The prsctp statistics includes abandoned_sent/unsent from the asoc. abandoned_sent is the count of the packets we drop packets from retransmit/transmited queue, and abandoned_unsent is the count of the packets we drop from out_queue according to the policy. Note: another option for prsctp statistics dump described in rfc is SCTP_PR_STREAM_STATUS, which is used to dump the prsctp statistics info from each stream. But by now, linux doesn't yet have per stream statistics info, it needs rfc6525 to be implemented. As the prsctp statistics for each stream has to be based on per stream statistics, we will delay it until rfc6525 is done in linux. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/socket.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index c03fe1b76706..c3167c43a9c1 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -6330,6 +6330,64 @@ out: return retval; } +static int sctp_getsockopt_pr_assocstatus(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_prstatus params; + struct sctp_association *asoc; + int policy; + int retval = -EINVAL; + + if (len < sizeof(params)) + goto out; + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) { + retval = -EFAULT; + goto out; + } + + policy = params.sprstat_policy; + if (policy & ~SCTP_PR_SCTP_MASK) + goto out; + + asoc = sctp_id2assoc(sk, params.sprstat_assoc_id); + if (!asoc) + goto out; + + if (policy == SCTP_PR_SCTP_NONE) { + params.sprstat_abandoned_unsent = 0; + params.sprstat_abandoned_sent = 0; + for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) { + params.sprstat_abandoned_unsent += + asoc->abandoned_unsent[policy]; + params.sprstat_abandoned_sent += + asoc->abandoned_sent[policy]; + } + } else { + params.sprstat_abandoned_unsent = + asoc->abandoned_unsent[__SCTP_PR_INDEX(policy)]; + params.sprstat_abandoned_sent = + asoc->abandoned_sent[__SCTP_PR_INDEX(policy)]; + } + + if (put_user(len, optlen)) { + retval = -EFAULT; + goto out; + } + + if (copy_to_user(optval, ¶ms, len)) { + retval = -EFAULT; + goto out; + } + + retval = 0; + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -6490,6 +6548,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_default_prinfo(sk, len, optval, optlen); break; + case SCTP_PR_ASSOC_STATUS: + retval = sctp_getsockopt_pr_assocstatus(sk, len, optval, + optlen); + break; default: retval = -ENOPROTOOPT; break; -- cgit v1.2.3 From a6c2f792873aff332a4689717c3cd6104f46684c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 9 Jul 2016 19:47:43 +0800 Subject: sctp: implement prsctp TTL policy prsctp TTL policy is a policy to abandon chunks when they expire at the specific time in local stack. It's similar with expires_at in struct sctp_datamsg. This patch uses sinfo->sinfo_timetolive to set the specific time for TTL policy. sinfo->sinfo_timetolive is also used for msg->expires_at. So if prsctp_enable or TTL policy is not enabled, msg->expires_at still works as before. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/chunk.c | 20 +++++++++++++++++--- net/sctp/output.c | 2 ++ net/sctp/sm_make_chunk.c | 12 ++++++++++++ net/sctp/socket.c | 4 ++-- 4 files changed, 33 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 1eb94bf18ef4..2698d122e201 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -335,13 +335,27 @@ errout: /* Check whether this message has expired. */ int sctp_chunk_abandoned(struct sctp_chunk *chunk) { - struct sctp_datamsg *msg = chunk->msg; + if (!chunk->asoc->prsctp_enable || + !SCTP_PR_POLICY(chunk->sinfo.sinfo_flags)) { + struct sctp_datamsg *msg = chunk->msg; + + if (!msg->can_abandon) + return 0; + + if (time_after(jiffies, msg->expires_at)) + return 1; - if (!msg->can_abandon) return 0; + } - if (time_after(jiffies, msg->expires_at)) + if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) && + time_after(jiffies, chunk->prsctp_param)) { + if (chunk->sent_count) + chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++; + else + chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; return 1; + } return 0; } diff --git a/net/sctp/output.c b/net/sctp/output.c index 2e9223bb1b3a..7425f6c23888 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -316,6 +316,8 @@ static sctp_xmit_t __sctp_packet_append_chunk(struct sctp_packet *packet, packet->has_data = 1; /* timestamp the chunk for rtx purposes */ chunk->sent_at = jiffies; + /* Mainly used for prsctp RTX policy */ + chunk->sent_count++; break; case SCTP_CID_COOKIE_ECHO: packet->has_cookie_echo = 1; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 0e3045ef57fa..2c431eef3a50 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -711,6 +711,17 @@ nodata: return retval; } +static void sctp_set_prsctp_policy(struct sctp_chunk *chunk, + const struct sctp_sndrcvinfo *sinfo) +{ + if (!chunk->asoc->prsctp_enable) + return; + + if (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags)) + chunk->prsctp_param = + jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive); +} + /* Make a DATA chunk for the given association from the provided * parameters. However, do not populate the data payload. */ @@ -744,6 +755,7 @@ struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc, retval->subh.data_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp); memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo)); + sctp_set_prsctp_policy(retval, sinfo); nodata: return retval; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index c3167c43a9c1..08614296628a 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -7099,7 +7099,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) if (cmsgs->srinfo->sinfo_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | - SCTP_SACK_IMMEDIATELY | + SCTP_SACK_IMMEDIATELY | SCTP_PR_SCTP_MASK | SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; @@ -7123,7 +7123,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) if (cmsgs->sinfo->snd_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | - SCTP_SACK_IMMEDIATELY | + SCTP_SACK_IMMEDIATELY | SCTP_PR_SCTP_MASK | SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; -- cgit v1.2.3 From 01aadb3af6e1b10970c1f7e510b5097c8f725d64 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 9 Jul 2016 19:47:44 +0800 Subject: sctp: implement prsctp RTX policy prsctp RTX policy is a policy to abandon chunks when they are retransmitted beyond the max count. This patch uses sent_count to count how many times one chunk has been sent, and prsctp_param is the max rtx count, which is from sinfo->sinfo_timetolive in sctp_set_prsctp_policy(). So similar to TTL policy, if RTX policy is enabled, msg->expire_at won't work. Then in sctp_chunk_abandoned, this patch checks if chunk->sent_count is bigger than chunk->prsctp_param to abandon this chunk. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/chunk.c | 4 ++++ net/sctp/sm_make_chunk.c | 2 ++ 2 files changed, 6 insertions(+) (limited to 'net') diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 2698d122e201..b3692b55a8d2 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -355,6 +355,10 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) else chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; return 1; + } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) && + chunk->sent_count > chunk->prsctp_param) { + chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; + return 1; } return 0; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 2c431eef3a50..cfde934af5c5 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -720,6 +720,8 @@ static void sctp_set_prsctp_policy(struct sctp_chunk *chunk, if (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags)) chunk->prsctp_param = jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive); + else if (SCTP_PR_RTX_ENABLED(sinfo->sinfo_flags)) + chunk->prsctp_param = sinfo->sinfo_timetolive; } /* Make a DATA chunk for the given association from the provided -- cgit v1.2.3 From 8dbdf1f5b09cb22560e7c7173b52fe3c631046bd Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 9 Jul 2016 19:47:45 +0800 Subject: sctp: implement prsctp PRIO policy prsctp PRIO policy is a policy to abandon lower priority chunks when asoc doesn't have enough snd buffer, so that the current chunk with higher priority can be queued successfully. Similar to TTL/RTX policy, we will set the priority of the chunk to prsctp_param with sinfo->sinfo_timetolive in sctp_set_prsctp_policy(). So if PRIO policy is enabled, msg->expire_at won't work. asoc->sent_cnt_removable will record how many chunks can be checked to remove. If priority policy is enabled, when the chunk is queued into the out_queue, we will increase sent_cnt_removable. When the chunk is moved to abandon_queue or dequeue and free, we will decrease sent_cnt_removable. In sctp_sendmsg, we will check if there is enough snd buffer for current msg and if sent_cnt_removable is not 0. Then try to abandon chunks in sctp_prune_prsctp when sendmsg from the retransmit/transmited queue, and free chunks from out_queue in right order until the abandon+free size > msg_len - sctp_wfree. For the abandon size, we have to wait until it sends FORWARD TSN, receives the sack and the chunks are really freed. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/chunk.c | 1 + net/sctp/outqueue.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++ net/sctp/sm_make_chunk.c | 3 +- net/sctp/socket.c | 3 ++ 4 files changed, 105 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index b3692b55a8d2..a55e54738b81 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -360,6 +360,7 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; return 1; } + /* PRIO policy is processed by sendmsg, not here */ return 0; } diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 084718f9b3da..72e54a416af6 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -326,6 +326,9 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) sctp_chunk_hold(chunk); sctp_outq_tail_data(q, chunk); + if (chunk->asoc->prsctp_enable && + SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) + chunk->asoc->sent_cnt_removable++; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS); else @@ -372,6 +375,96 @@ static void sctp_insert_list(struct list_head *head, struct list_head *new) list_add_tail(new, head); } +static int sctp_prsctp_prune_sent(struct sctp_association *asoc, + struct sctp_sndrcvinfo *sinfo, + struct list_head *queue, int msg_len) +{ + struct sctp_chunk *chk, *temp; + + list_for_each_entry_safe(chk, temp, queue, transmitted_list) { + if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || + chk->prsctp_param <= sinfo->sinfo_timetolive) + continue; + + list_del_init(&chk->transmitted_list); + sctp_insert_list(&asoc->outqueue.abandoned, + &chk->transmitted_list); + + asoc->sent_cnt_removable--; + asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; + + if (!chk->tsn_gap_acked) { + if (chk->transport) + chk->transport->flight_size -= + sctp_data_size(chk); + asoc->outqueue.outstanding_bytes -= sctp_data_size(chk); + } + + msg_len -= SCTP_DATA_SNDSIZE(chk) + + sizeof(struct sk_buff) + + sizeof(struct sctp_chunk); + if (msg_len <= 0) + break; + } + + return msg_len; +} + +static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, + struct sctp_sndrcvinfo *sinfo, + struct list_head *queue, int msg_len) +{ + struct sctp_chunk *chk, *temp; + + list_for_each_entry_safe(chk, temp, queue, list) { + if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || + chk->prsctp_param <= sinfo->sinfo_timetolive) + continue; + + list_del_init(&chk->list); + asoc->sent_cnt_removable--; + asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; + + msg_len -= SCTP_DATA_SNDSIZE(chk) + + sizeof(struct sk_buff) + + sizeof(struct sctp_chunk); + sctp_chunk_free(chk); + if (msg_len <= 0) + break; + } + + return msg_len; +} + +/* Abandon the chunks according their priorities */ +void sctp_prsctp_prune(struct sctp_association *asoc, + struct sctp_sndrcvinfo *sinfo, int msg_len) +{ + struct sctp_transport *transport; + + if (!asoc->prsctp_enable || !asoc->sent_cnt_removable) + return; + + msg_len = sctp_prsctp_prune_sent(asoc, sinfo, + &asoc->outqueue.retransmit, + msg_len); + if (msg_len <= 0) + return; + + list_for_each_entry(transport, &asoc->peer.transport_addr_list, + transports) { + msg_len = sctp_prsctp_prune_sent(asoc, sinfo, + &transport->transmitted, + msg_len); + if (msg_len <= 0) + return; + } + + sctp_prsctp_prune_unsent(asoc, sinfo, + &asoc->outqueue.out_chunk_list, + msg_len); +} + /* Mark all the eligible packets on a transport for retransmission. */ void sctp_retransmit_mark(struct sctp_outq *q, struct sctp_transport *transport, @@ -962,6 +1055,9 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) /* Mark as failed send. */ sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM); + if (asoc->prsctp_enable && + SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) + asoc->sent_cnt_removable--; sctp_chunk_free(chunk); continue; } @@ -1251,6 +1347,9 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk) tsn = ntohl(tchunk->subh.data_hdr->tsn); if (TSN_lte(tsn, ctsn)) { list_del_init(&tchunk->transmitted_list); + if (asoc->prsctp_enable && + SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) + asoc->sent_cnt_removable--; sctp_chunk_free(tchunk); } } diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index cfde934af5c5..1c96f4740e67 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -720,7 +720,8 @@ static void sctp_set_prsctp_policy(struct sctp_chunk *chunk, if (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags)) chunk->prsctp_param = jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive); - else if (SCTP_PR_RTX_ENABLED(sinfo->sinfo_flags)) + else if (SCTP_PR_RTX_ENABLED(sinfo->sinfo_flags) || + SCTP_PR_PRIO_ENABLED(sinfo->sinfo_flags)) chunk->prsctp_param = sinfo->sinfo_timetolive; } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 08614296628a..71c7dc5ea62e 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1914,6 +1914,9 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) goto out_free; } + if (sctp_wspace(asoc) < msg_len) + sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc)); + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); if (!sctp_wspace(asoc)) { err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); -- cgit v1.2.3 From aa9667e7f626232cde5310435dc96bfbe796f05c Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Sun, 10 Jul 2016 10:20:11 +0900 Subject: tunnels: correct conditional build of MPLS and IPv6 Using a combination if #if conditionals and goto labels to unwind tunnel4_init seems unwieldy. This patch takes a simpler approach of directly unregistering previously registered protocols when an error occurs. This fixes a number of problems with the current implementation including the potential presence of labels when they are unused and the potential absence of unregister code when it is needed. Fixes: 8afe97e5d416 ("tunnels: support MPLS over IPv4 tunnels") Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- net/ipv4/tunnel4.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index 45cd4253583a..ec35eaa5c029 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -208,24 +208,25 @@ static const struct net_protocol tunnelmpls4_protocol = { static int __init tunnel4_init(void) { if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) - goto err_ipip; + goto err; #if IS_ENABLED(CONFIG_IPV6) - if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) - goto err_ipv6; + if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) { + inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); + goto err; + } #endif #if IS_ENABLED(CONFIG_MPLS) - if (inet_add_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS)) - goto err_mpls; + if (inet_add_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS)) { + inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); +#if IS_ENABLED(CONFIG_IPV6) + inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6); +#endif + goto err; + } #endif return 0; -#if IS_ENABLED(CONFIG_IPV6) -err_mpls: - inet_del_protocol(&tunnel4_protocol, IPPROTO_IPV6); -#endif -err_ipv6: - inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); -err_ipip: +err: pr_err("%s: can't add protocol\n", __func__); return -EAGAIN; } -- cgit v1.2.3 From d3fc0353f7c709a6a7fea340211eeb7bbc3e4c66 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Mon, 11 Jul 2016 16:37:51 -0400 Subject: ipv4: af_inet: make it explicitly non-modular The Makefile controlling compilation of this file is obj-y, meaning that it currently is never being built as a module. Since MODULE_ALIAS is a no-op for non-modular code, we can simply remove the MODULE_ALIAS_NETPROTO variant used here. We replace module.h with kmod.h since the file does make use of request_module() in order to load other modules from here. We don't have to worry about init.h coming in via the removed module.h since the file explicitly includes init.h already. Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Cc: netdev@vger.kernel.org Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d39e9e47a26e..55513e654d79 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -73,7 +73,7 @@ #include #include #include -#include +#include #include #include #include @@ -1916,6 +1916,3 @@ static int __init ipv4_proc_init(void) return 0; } #endif /* CONFIG_PROC_FS */ - -MODULE_ALIAS_NETPROTO(PF_INET); - -- cgit v1.2.3 From 3101e0fc1f6e809d38fbb5845c6c5eb0eefeda07 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Tue, 12 Jul 2016 19:45:00 +0800 Subject: netfilter: conntrack: protect early_drop by rcu read lock User can add ct entry via nfnetlink(IPCTNL_MSG_CT_NEW), and if the total number reach the nf_conntrack_max, we will try to drop some ct entries. But in this case(the main function call path is ctnetlink_create_conntrack -> nf_conntrack_alloc -> early_drop), rcu_read_lock is not held, so race with hash resize will happen. Fixes: 242922a02717 ("netfilter: conntrack: simplify early_drop") Cc: Florian Westphal Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index e0e9c9a0f5ba..2d46225501c1 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -880,6 +880,7 @@ static noinline int early_drop(struct net *net, unsigned int _hash) struct hlist_nulls_head *ct_hash; unsigned hash, sequence, drops; + rcu_read_lock(); do { sequence = read_seqcount_begin(&nf_conntrack_generation); hash = scale_hash(_hash++); @@ -887,6 +888,8 @@ static noinline int early_drop(struct net *net, unsigned int _hash) } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); drops = early_drop_list(net, &ct_hash[hash]); + rcu_read_unlock(); + if (drops) { NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops); return true; -- cgit v1.2.3 From 8addc0440bdba9280bf212b70f17f029c2801805 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 12 Jul 2016 11:21:17 +0000 Subject: rxrpc: Fix error handling in af_rxrpc_init() security initialized after alloc workqueue, so we should exit security before destroy workqueue in the error handing. Fixes: 648af7fca159 ("rxrpc: Absorb the rxkad security module") Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- net/rxrpc/af_rxrpc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index d6e4e3b69dc3..88effadd4b16 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -766,9 +766,9 @@ error_key_type: error_sock: proto_unregister(&rxrpc_proto); error_proto: - destroy_workqueue(rxrpc_workqueue); -error_security: rxrpc_exit_security(); +error_security: + destroy_workqueue(rxrpc_workqueue); error_work_queue: kmem_cache_destroy(rxrpc_call_jar); error_call_jar: -- cgit v1.2.3 From 85c22bad56ec0d0d8d8518414f97106233132e43 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 12 Jul 2016 15:24:10 +0000 Subject: net: dsa: Fix non static symbol warning Fixes the following sparse warning: net/dsa/dsa2.c:680:6: warning: symbol '_dsa_unregister_switch' was not declared. Should it be static? Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 78e4c0131c30..f30bad9678f0 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -677,7 +677,7 @@ int dsa_register_switch(struct dsa_switch *ds, struct device_node *np) } EXPORT_SYMBOL_GPL(dsa_register_switch); -void _dsa_unregister_switch(struct dsa_switch *ds) +static void _dsa_unregister_switch(struct dsa_switch *ds) { struct dsa_switch_tree *dst = ds->dst; -- cgit v1.2.3 From e5224f0fe2acddbc2fa9b419d8867ced7f5381fc Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 Jul 2016 18:05:03 +0200 Subject: devlink: add hardware messages tracing facility Define a tracepoint and allow user to trace messages going to and from hardware associated with devlink instance. Signed-off-by: Jiri Pirko Acked-by: Steven Rostedt Signed-off-by: David S. Miller --- net/core/devlink.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index b2e592a198c0..1b5063088f1a 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -26,6 +26,10 @@ #include #include #include +#define CREATE_TRACE_POINTS +#include + +EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); static LIST_HEAD(devlink_list); -- cgit v1.2.3 From 160b925163c0aabc2c2fbb7d58a75e38b7cd6a17 Mon Sep 17 00:00:00 2001 From: Szymon Janc Date: Tue, 12 Jul 2016 02:12:16 +0200 Subject: Bluetooth: Add Authentication Failed reason to Disconnected Mgmt event If link is disconnected due to Authentication Failure (PIN or Key Missing status) userspace will be notified about this with proper error code. Many LE profiles define "PIN or Key Missing" status as indication of remote lost bond so this allows userspace to take action on this. @ Device Connected: 88:63:DF:88:0E:83 (1) flags 0x0000 02 01 1a 05 03 0a 18 0d 18 0b 09 48 65 61 72 74 ...........Heart 20 52 61 74 65 Rate > HCI Event: Command Status (0x0f) plen 4 LE Read Remote Used Features (0x08|0x0016) ncmd 1 Status: Success (0x00) > ACL Data RX: Handle 3585 flags 0x02 dlen 11 ATT: Read By Group Type Request (0x10) len 6 Handle range: 0x0001-0xffff Attribute group type: Primary Service (0x2800) > HCI Event: LE Meta Event (0x3e) plen 12 LE Read Remote Used Features (0x04) Status: Success (0x00) Handle: 3585 Features: 0x01 0x00 0x00 0x00 0x00 0x00 0x00 0x00 LE Encryption < HCI Command: LE Start Encryption (0x08|0x0019) plen 28 Handle: 3585 Random number: 0x0000000000000000 Encrypted diversifier: 0x0000 Long term key: 26201cd479a0921b6f949f0b1fa8dc82 > HCI Event: Command Status (0x0f) plen 4 LE Start Encryption (0x08|0x0019) ncmd 1 Status: Success (0x00) > HCI Event: Encryption Change (0x08) plen 4 Status: PIN or Key Missing (0x06) Handle: 3585 Encryption: Disabled (0x00) < HCI Command: Disconnect (0x01|0x0006) plen 3 Handle: 3585 Reason: Authentication Failure (0x05) > HCI Event: Command Status (0x0f) plen 4 Disconnect (0x01|0x0006) ncmd 1 Status: Success (0x00) > HCI Event: Disconnect Complete (0x05) plen 4 Status: Success (0x00) Handle: 3585 Reason: Connection Terminated By Local Host (0x16) @ Device Disconnected: 88:63:DF:88:0E:83 (1) reason 4 @ Device Connected: C4:43:8F:A3:4D:83 (0) flags 0x0000 08 09 4e 65 78 75 73 20 35 ..Nexus 5 > HCI Event: Command Status (0x0f) plen 4 Authentication Requested (0x01|0x0011) ncmd 1 Status: Success (0x00) > HCI Event: Link Key Request (0x17) plen 6 Address: C4:43:8F:A3:4D:83 (LG Electronics) < HCI Command: Link Key Request Reply (0x01|0x000b) plen 22 Address: C4:43:8F:A3:4D:83 (LG Electronics) Link key: 080812e4aa97a863d11826f71f65a933 > HCI Event: Command Complete (0x0e) plen 10 Link Key Request Reply (0x01|0x000b) ncmd 1 Status: Success (0x00) Address: C4:43:8F:A3:4D:83 (LG Electronics) > HCI Event: Auth Complete (0x06) plen 3 Status: PIN or Key Missing (0x06) Handle: 75 @ Authentication Failed: C4:43:8F:A3:4D:83 (0) status 0x05 < HCI Command: Disconnect (0x01|0x0006) plen 3 Handle: 75 Reason: Remote User Terminated Connection (0x13) > HCI Event: Command Status (0x0f) plen 4 Disconnect (0x01|0x0006) ncmd 1 Status: Success (0x00) > HCI Event: Disconnect Complete (0x05) plen 4 Status: Success (0x00) Handle: 75 Reason: Connection Terminated By Local Host (0x16) @ Device Disconnected: C4:43:8F:A3:4D:83 (0) reason 4 Signed-off-by: Szymon Janc Signed-off-by: Johan Hedberg --- net/bluetooth/hci_event.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 3fb95c47243c..e17aacbc5630 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2332,7 +2332,7 @@ static u8 hci_to_mgmt_reason(u8 err) static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_disconn_complete *ev = (void *) skb->data; - u8 reason = hci_to_mgmt_reason(ev->reason); + u8 reason; struct hci_conn_params *params; struct hci_conn *conn; bool mgmt_connected; @@ -2355,6 +2355,12 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) conn->state = BT_CLOSED; mgmt_connected = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags); + + if (test_bit(HCI_CONN_AUTH_FAILURE, &conn->flags)) + reason = MGMT_DEV_DISCONN_AUTH_FAILURE; + else + reason = hci_to_mgmt_reason(ev->reason); + mgmt_device_disconnected(hdev, &conn->dst, conn->type, conn->dst_type, reason, mgmt_connected); @@ -2421,6 +2427,8 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) goto unlock; if (!ev->status) { + clear_bit(HCI_CONN_AUTH_FAILURE, &conn->flags); + if (!hci_conn_ssp_enabled(conn) && test_bit(HCI_CONN_REAUTH_PEND, &conn->flags)) { BT_INFO("re-auth of legacy device is not possible."); @@ -2429,6 +2437,9 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) conn->sec_level = conn->pending_sec_level; } } else { + if (ev->status == HCI_ERROR_PIN_OR_KEY_MISSING) + set_bit(HCI_CONN_AUTH_FAILURE, &conn->flags); + mgmt_auth_failed(conn, ev->status); } @@ -2613,6 +2624,9 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); if (ev->status && conn->state == BT_CONNECTED) { + if (ev->status == HCI_ERROR_PIN_OR_KEY_MISSING) + set_bit(HCI_CONN_AUTH_FAILURE, &conn->flags); + hci_disconnect(conn, HCI_ERROR_AUTH_FAILURE); hci_conn_drop(conn); goto unlock; -- cgit v1.2.3 From 87510973d6e137c33552b3365b5afbd5be81c5dd Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 13 Jul 2016 10:57:18 +0300 Subject: Bluetooth: Increment management interface revision Increment the mgmt revision due to the recently added new reason code for the Disconnected event. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 7983ec8d4c60..7639290b6de3 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -38,7 +38,7 @@ #include "mgmt_util.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 12 +#define MGMT_REVISION 13 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, -- cgit v1.2.3 From 9e238323799fb8c2add2b1de9a22edd4d4e51e30 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 13 Jul 2016 15:08:55 -0300 Subject: sctp: allow others to use sctp_input_cb We process input path in other files too and having access to it is nice, so move it to a header where it's shared. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/input.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index 6f8e676d285e..7a327ff71f08 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -90,17 +90,6 @@ static inline int sctp_rcv_checksum(struct net *net, struct sk_buff *skb) return 0; } -struct sctp_input_cb { - union { - struct inet_skb_parm h4; -#if IS_ENABLED(CONFIG_IPV6) - struct inet6_skb_parm h6; -#endif - } header; - struct sctp_chunk *chunk; -}; -#define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0])) - /* * This is the routine which IP calls when receiving an SCTP packet. */ -- cgit v1.2.3 From f5d258e60722142e88cb6f0f337d78bca67cf973 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 13 Jul 2016 15:08:56 -0300 Subject: sctp: reorder sctp_ulpevent and shrink msg_flags The next patch needs 8 bytes in there. sctp_ulpevent has a hole due to bad alignment; msg_flags is using 4 bytes while it actually uses only 2, so we shrink it, and iif member (4 bytes) which can be easily fetched from another place once the next patch is there, so we remove it and thus creating space for 8 bytes. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/ulpevent.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index d1e38308f615..706f5bc9f0c3 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -51,7 +51,7 @@ static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event); /* Initialize an ULP event from an given skb. */ static void sctp_ulpevent_init(struct sctp_ulpevent *event, - int msg_flags, + __u16 msg_flags, unsigned int len) { memset(event, 0, sizeof(struct sctp_ulpevent)); @@ -60,7 +60,7 @@ static void sctp_ulpevent_init(struct sctp_ulpevent *event, } /* Create a new sctp_ulpevent. */ -static struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags, +static struct sctp_ulpevent *sctp_ulpevent_new(int size, __u16 msg_flags, gfp_t gfp) { struct sctp_ulpevent *event; -- cgit v1.2.3 From 1f45f78f8e511203f03138f2ccde3d2cf90d2cbf Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 13 Jul 2016 15:08:57 -0300 Subject: sctp: allow GSO frags to access the chunk too SCTP will try to access original IP headers on sctp_recvmsg in order to copy the addresses used. There are also other places that do similar access to IP or even SCTP headers. But after 90017accff61 ("sctp: Add GSO support") they aren't always there because they are only present in the header skb. SCTP handles the queueing of incoming data by cloning the incoming skb and limiting to only the relevant payload. This clone has its cb updated to something different and it's then queued on socket rx queue. Thus we need to fix this in two moments. For rx path, not related to socket queue yet, this patch uses a partially copied sctp_input_cb to such GSO frags. This restores the ability to access the headers for this part of the code. Regarding the socket rx queue, it removes iif member from sctp_event and also add a chunk pointer on it. With these changes we're always able to reach the headers again. The biggest change here is that now the sctp_chunk struct and the original skb are only freed after the application consumed the buffer. Note however that the original payload was already like this due to the skb cloning. For iif, SCTP's IPv4 code doesn't use it, so no change is necessary. IPv6 now can fetch it directly from original's IPv6 CB as the original skb is still accessible. In the future we probably can simplify sctp_v*_skb_iif() stuff, as sctp_v4_skb_iif() was called but it's return value not used, and now it's not even called, but such cleanup is out of scope for this change. Fixes: 90017accff61 ("sctp: Add GSO support") Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/inqueue.c | 7 +++++++ net/sctp/ipv6.c | 9 ++++----- net/sctp/protocol.c | 1 + net/sctp/sm_statefuns.c | 3 ++- net/sctp/socket.c | 10 +++++++--- net/sctp/ulpevent.c | 10 +++++++++- 6 files changed, 30 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index edabbbdfca54..147d975b0455 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -218,6 +218,13 @@ new_skb: chunk->has_asconf = 0; chunk->end_of_packet = 0; chunk->ecn_ce_done = 0; + if (chunk->head_skb) { + struct sctp_input_cb + *cb = SCTP_INPUT_CB(chunk->skb), + *head_cb = SCTP_INPUT_CB(chunk->head_skb); + + cb->chunk = head_cb->chunk; + } } chunk->chunk_hdr = ch; diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 0657d18a85bf..ae6f1a2178ba 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -420,6 +420,7 @@ static void sctp_v6_from_skb(union sctp_addr *addr, struct sk_buff *skb, addr->v6.sin6_flowinfo = 0; /* FIXME */ addr->v6.sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif; + /* Always called on head skb, so this is safe */ sh = sctp_hdr(skb); if (is_saddr) { *port = sh->source; @@ -710,8 +711,7 @@ static int sctp_v6_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr) /* Where did this skb come from? */ static int sctp_v6_skb_iif(const struct sk_buff *skb) { - struct inet6_skb_parm *opt = (struct inet6_skb_parm *) skb->cb; - return opt->iif; + return IP6CB(skb)->iif; } /* Was this packet marked by Explicit Congestion Notification? */ @@ -780,15 +780,14 @@ static void sctp_inet6_skb_msgname(struct sk_buff *skb, char *msgname, if (ip_hdr(skb)->version == 4) { addr->v4.sin_family = AF_INET; addr->v4.sin_port = sh->source; - addr->v4.sin_addr.s_addr = ip_hdr(skb)->saddr; + addr->v4.sin_addr.s_addr = ip_hdr(skb)->saddr; } else { addr->v6.sin6_family = AF_INET6; addr->v6.sin6_flowinfo = 0; addr->v6.sin6_port = sh->source; addr->v6.sin6_addr = ipv6_hdr(skb)->saddr; if (ipv6_addr_type(&addr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) { - struct sctp_ulpevent *ev = sctp_skb2event(skb); - addr->v6.sin6_scope_id = ev->iif; + addr->v6.sin6_scope_id = sctp_v6_skb_iif(skb); } } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 3b56ae55aba3..1adb9270e317 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -240,6 +240,7 @@ static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb, port = &addr->v4.sin_port; addr->v4.sin_family = AF_INET; + /* Always called on head skb, so this is safe */ sh = sctp_hdr(skb); if (is_saddr) { *port = sh->source; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index f1f08c8f277b..5aabf42065e2 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -6125,7 +6125,8 @@ static int sctp_eat_data(const struct sctp_association *asoc, af = sctp_get_af_specific( ipver2af(ip_hdr(chunk->skb)->version)); - if (af && af->is_ce(chunk->skb) && asoc->peer.ecn_capable) { + if (af && af->is_ce(sctp_gso_headskb(chunk->skb)) && + asoc->peer.ecn_capable) { /* Do real work as sideffect. */ sctp_add_cmd_sf(commands, SCTP_CMD_ECN_CE, SCTP_U32(tsn)); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 71c7dc5ea62e..52fdd540a9ef 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2066,7 +2066,7 @@ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, { struct sctp_ulpevent *event = NULL; struct sctp_sock *sp = sctp_sk(sk); - struct sk_buff *skb; + struct sk_buff *skb, *head_skb; int copied; int err = 0; int skb_len; @@ -2102,12 +2102,16 @@ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (err) goto out_free; - sock_recv_ts_and_drops(msg, sk, skb); + if (event->chunk && event->chunk->head_skb) + head_skb = event->chunk->head_skb; + else + head_skb = skb; + sock_recv_ts_and_drops(msg, sk, head_skb); if (sctp_ulpevent_is_notification(event)) { msg->msg_flags |= MSG_NOTIFICATION; sp->pf->event_msgname(event, msg->msg_name, addr_len); } else { - sp->pf->skb_msgname(skb, msg->msg_name, addr_len); + sp->pf->skb_msgname(head_skb, msg->msg_name, addr_len); } /* Check if we allow SCTP_NXTINFO. */ diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 706f5bc9f0c3..f6219b164b42 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -701,6 +701,12 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, sctp_ulpevent_receive_data(event, asoc); + /* And hold the chunk as we need it for getting the IP headers + * later in recvmsg + */ + sctp_chunk_hold(chunk); + event->chunk = chunk; + event->stream = ntohs(chunk->subh.data_hdr->stream); event->ssn = ntohs(chunk->subh.data_hdr->ssn); event->ppid = chunk->subh.data_hdr->ppid; @@ -710,11 +716,11 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, } event->tsn = ntohl(chunk->subh.data_hdr->tsn); event->msg_flags |= chunk->chunk_hdr->flags; - event->iif = sctp_chunk_iif(chunk); return event; fail_mark: + sctp_chunk_put(chunk); kfree_skb(skb); fail: return NULL; @@ -1007,6 +1013,7 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) done: sctp_assoc_rwnd_increase(event->asoc, len); + sctp_chunk_put(event->chunk); sctp_ulpevent_release_owner(event); } @@ -1029,6 +1036,7 @@ static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event) } done: + sctp_chunk_put(event->chunk); sctp_ulpevent_release_owner(event); } -- cgit v1.2.3 From e7487c86dc5c4a528a7dbd9dc14f453a0de61a84 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 13 Jul 2016 15:08:58 -0300 Subject: sctp: avoid identifying address family many times for a chunk Identifying address family operations during rx path is not something expensive but it's ugly to the eye to have it done multiple times, specially when we already validated it during initial rx processing. This patch takes advantage of the now shared sctp_input_cb and make the pointer to the operations readily available. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/input.c | 1 + net/sctp/inqueue.c | 1 + net/sctp/sm_make_chunk.c | 20 ++++---------------- net/sctp/sm_statefuns.c | 7 ++----- 4 files changed, 8 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index 7a327ff71f08..30d72f7707b6 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -140,6 +140,7 @@ int sctp_rcv(struct sk_buff *skb) af = sctp_get_af_specific(family); if (unlikely(!af)) goto discard_it; + SCTP_INPUT_CB(skb)->af = af; /* Initialize local addresses for lookups. */ af->from_skb(&src, skb, 1); diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 147d975b0455..8fc773f9b59a 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -224,6 +224,7 @@ new_skb: *head_cb = SCTP_INPUT_CB(chunk->head_skb); cb->chunk = head_cb->chunk; + cb->af = head_cb->af; } } diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 1c96f4740e67..8c77b87a8565 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -108,14 +108,9 @@ static void sctp_control_set_owner_w(struct sctp_chunk *chunk) /* What was the inbound interface for this chunk? */ int sctp_chunk_iif(const struct sctp_chunk *chunk) { - struct sctp_af *af; - int iif = 0; - - af = sctp_get_af_specific(ipver2af(ip_hdr(chunk->skb)->version)); - if (af) - iif = af->skb_iif(chunk->skb); + struct sk_buff *skb = chunk->skb; - return iif; + return SCTP_INPUT_CB(skb)->af->skb_iif(skb); } /* RFC 2960 3.3.2 Initiation (INIT) (1) @@ -1600,7 +1595,6 @@ struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep, struct sctp_association *asoc; struct sk_buff *skb; sctp_scope_t scope; - struct sctp_af *af; /* Create the bare association. */ scope = sctp_scope(sctp_source(chunk)); @@ -1610,16 +1604,10 @@ struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep, asoc->temp = 1; skb = chunk->skb; /* Create an entry for the source address of the packet. */ - af = sctp_get_af_specific(ipver2af(ip_hdr(skb)->version)); - if (unlikely(!af)) - goto fail; - af->from_skb(&asoc->c.peer_addr, skb, 1); + SCTP_INPUT_CB(skb)->af->from_skb(&asoc->c.peer_addr, skb, 1); + nodata: return asoc; - -fail: - sctp_association_free(asoc); - return NULL; } /* Build a cookie representing asoc. diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 5aabf42065e2..b7c1f7f3c838 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -6119,13 +6119,10 @@ static int sctp_eat_data(const struct sctp_association *asoc, */ if (!chunk->ecn_ce_done) { - struct sctp_af *af; + struct sctp_af *af = SCTP_INPUT_CB(chunk->skb)->af; chunk->ecn_ce_done = 1; - af = sctp_get_af_specific( - ipver2af(ip_hdr(chunk->skb)->version)); - - if (af && af->is_ce(sctp_gso_headskb(chunk->skb)) && + if (af->is_ce(sctp_gso_headskb(chunk->skb)) && asoc->peer.ecn_capable) { /* Do real work as sideffect. */ sctp_add_cmd_sf(commands, SCTP_CMD_ECN_CE, -- cgit v1.2.3 From d9cef42529402f9fce10376b6e427a5137d90c3d Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 13 Jul 2016 15:08:59 -0300 Subject: sctp: do not clear chunk->ecn_ce_done flag We should not clear that flag when switching to a new skb from a GSO skb because it would cause ECN processing to happen multiple times per GSO skb, which is not wanted. Instead, let it be processed once per chunk. That is, in other words, once per IP header available. Fixes: 90017accff61 ("sctp: Add GSO support") Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/inqueue.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 8fc773f9b59a..942770675f4c 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -217,7 +217,6 @@ new_skb: chunk->auth = 0; chunk->has_asconf = 0; chunk->end_of_packet = 0; - chunk->ecn_ce_done = 0; if (chunk->head_skb) { struct sctp_input_cb *cb = SCTP_INPUT_CB(chunk->skb), -- cgit v1.2.3 From 2d47fd120d23390fea38c3c7cc5ee05a5b95c49f Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 13 Jul 2016 15:09:00 -0300 Subject: sctp: only check for ECN if peer is using it Currently only read-only checks are performed up to the point on where we check if peer is ECN capable, checks which we can avoid otherwise. The flag ecn_ce_done is only used to perform this check once per incoming packet, and nothing more. Thus this patch moves the peer check up. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/sm_statefuns.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index b7c1f7f3c838..d88bb2b0b699 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -6118,12 +6118,11 @@ static int sctp_eat_data(const struct sctp_association *asoc, * chunk later. */ - if (!chunk->ecn_ce_done) { + if (asoc->peer.ecn_capable && !chunk->ecn_ce_done) { struct sctp_af *af = SCTP_INPUT_CB(chunk->skb)->af; chunk->ecn_ce_done = 1; - if (af->is_ce(sctp_gso_headskb(chunk->skb)) && - asoc->peer.ecn_capable) { + if (af->is_ce(sctp_gso_headskb(chunk->skb))) { /* Do real work as sideffect. */ sctp_add_cmd_sf(commands, SCTP_CMD_ECN_CE, SCTP_U32(tsn)); -- cgit v1.2.3 From 8438884d4ab423161b974854ebb90c08219dd678 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 14 Jul 2016 10:32:43 +0300 Subject: net/switchdev: Export the same parent ID service function This helper serves to know if two switchdev port netdevices belong to the same HW ASIC, e.g to figure out if forwarding offload is possible between them. Signed-off-by: Or Gerlitz Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- net/switchdev/switchdev.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 59658b2e9cdf..a5fc9dd24aa9 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -1286,8 +1286,8 @@ void switchdev_fib_ipv4_abort(struct fib_info *fi) } EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort); -static bool switchdev_port_same_parent_id(struct net_device *a, - struct net_device *b) +bool switchdev_port_same_parent_id(struct net_device *a, + struct net_device *b) { struct switchdev_attr a_attr = { .orig_dev = a, @@ -1323,6 +1323,7 @@ static u32 switchdev_port_fwd_mark_get(struct net_device *dev, return dev->ifindex; } +EXPORT_SYMBOL_GPL(switchdev_port_same_parent_id); static void switchdev_port_fwd_mark_reset(struct net_device *group_dev, u32 old_mark, u32 *reset_mark) -- cgit v1.2.3 From a93d01f5777e99f24b5b3948e06673ada148337c Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 14 Jul 2016 03:51:01 -0700 Subject: RDS: TCP: avoid bad page reference in rds_tcp_listen_data_ready As the existing comments in rds_tcp_listen_data_ready() indicate, it is possible under some race-windows to get to this function with the accept() socket. If that happens, we could run into a sequence whereby thread 1 thread 2 rds_tcp_accept_one() thread sets up new_sock via ->accept(). The sk_user_data is now sock_def_readable data comes in for new_sock, ->sk_data_ready is called, and we land in rds_tcp_listen_data_ready rds_tcp_set_callbacks() takes the sk_callback_lock and sets up sk_user_data to be the cp read_lock sk_callback_lock ready = cp unlock sk_callback_lock page fault on ready In the above sequence, we end up with a panic on a bad page reference when trying to execute (*ready)(). Instead we need to call sock_def_readable() safely, which is what this patch achieves. Acked-by: Santosh Shilimkar Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/tcp.c | 7 +++++++ net/rds/tcp.h | 1 + net/rds/tcp_listen.c | 2 ++ 3 files changed, 10 insertions(+) (limited to 'net') diff --git a/net/rds/tcp.c b/net/rds/tcp.c index d24f6c142d03..b411bb764f07 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -551,6 +551,13 @@ static void rds_tcp_kill_sock(struct net *net) } } +void *rds_tcp_listen_sock_def_readable(struct net *net) +{ + struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + + return rtn->rds_tcp_listen_sock->sk->sk_user_data; +} + static int rds_tcp_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 1c3160faa963..9a1cc8906576 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -70,6 +70,7 @@ void rds_tcp_listen_stop(struct socket *); void rds_tcp_listen_data_ready(struct sock *sk); int rds_tcp_accept_one(struct socket *sock); int rds_tcp_keepalive(struct socket *sock); +void *rds_tcp_listen_sock_def_readable(struct net *net); /* tcp_recv.c */ int rds_tcp_recv_init(void); diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index ca975a217a49..73040e319e4b 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -183,6 +183,8 @@ void rds_tcp_listen_data_ready(struct sock *sk) */ if (sk->sk_state == TCP_LISTEN) rds_tcp_accept_work(sk); + else + ready = rds_tcp_listen_sock_def_readable(sock_net(sk)); out: read_unlock_bh(&sk->sk_callback_lock); -- cgit v1.2.3 From ac3615e7f3cffe2a1a6b25172dfd09e138593d82 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 14 Jul 2016 03:51:02 -0700 Subject: RDS: TCP: Reduce code duplication in rds_tcp_reset_callbacks() Some code duplication in rds_tcp_reset_callbacks() can be avoided by having the function call rds_tcp_restore_callbacks() and rds_tcp_set_callbacks(). Acked-by: Santosh Shilimkar Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/tcp.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/rds/tcp.c b/net/rds/tcp.c index b411bb764f07..0a683cfc4f23 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -168,35 +168,21 @@ void rds_tcp_reset_callbacks(struct socket *sock, wait_event(cp->cp_waitq, !test_bit(RDS_IN_XMIT, &cp->cp_flags)); lock_sock(osock->sk); /* reset receive side state for rds_tcp_data_recv() for osock */ + cancel_delayed_work_sync(&cp->cp_send_w); + cancel_delayed_work_sync(&cp->cp_recv_w); if (tc->t_tinc) { rds_inc_put(&tc->t_tinc->ti_inc); tc->t_tinc = NULL; } tc->t_tinc_hdr_rem = sizeof(struct rds_header); tc->t_tinc_data_rem = 0; - tc->t_sock = NULL; - - write_lock_bh(&osock->sk->sk_callback_lock); - - osock->sk->sk_user_data = NULL; - osock->sk->sk_data_ready = tc->t_orig_data_ready; - osock->sk->sk_write_space = tc->t_orig_write_space; - osock->sk->sk_state_change = tc->t_orig_state_change; - write_unlock_bh(&osock->sk->sk_callback_lock); + rds_tcp_restore_callbacks(osock, tc); release_sock(osock->sk); sock_release(osock); newsock: rds_send_path_reset(cp); lock_sock(sock->sk); - write_lock_bh(&sock->sk->sk_callback_lock); - tc->t_sock = sock; - tc->t_cpath = cp; - sock->sk->sk_user_data = cp; - sock->sk->sk_data_ready = rds_tcp_data_ready; - sock->sk->sk_write_space = rds_tcp_write_space; - sock->sk->sk_state_change = rds_tcp_state_change; - - write_unlock_bh(&sock->sk->sk_callback_lock); + rds_tcp_set_callbacks(sock, cp); release_sock(sock->sk); } -- cgit v1.2.3 From 5916e2c1554f3e36f770401c989c3c7fadf619ca Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 14 Jul 2016 03:51:03 -0700 Subject: RDS: TCP: Enable multipath RDS for TCP Use RDS probe-ping to compute how many paths may be used with the peer, and to synchronously start the multiple paths. If mprds is supported, hash outgoing traffic to one of multiple paths in rds_sendmsg() when multipath RDS is supported by the transport. CC: Santosh Shilimkar Signed-off-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/bind.c | 6 +++++ net/rds/connection.c | 17 ++++++------ net/rds/message.c | 1 + net/rds/rds.h | 25 +++++++++++++++-- net/rds/recv.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++ net/rds/send.c | 71 +++++++++++++++++++++++++++++++++++++++++++++--- net/rds/tcp.c | 2 +- net/rds/tcp_connect.c | 7 ++++- net/rds/tcp_listen.c | 63 +++++++++++++++++++++++++++++++++++++------ net/rds/tcp_send.c | 18 +++++++++---- net/rds/threads.c | 2 ++ 11 files changed, 257 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/rds/bind.c b/net/rds/bind.c index b22ea956522b..095f6ce583fe 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -81,6 +81,8 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) if (*port != 0) { rover = be16_to_cpu(*port); + if (rover == RDS_FLAG_PROBE_PORT) + return -EINVAL; last = rover; } else { rover = max_t(u16, prandom_u32(), 2); @@ -91,12 +93,16 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) if (rover == 0) rover++; + if (rover == RDS_FLAG_PROBE_PORT) + continue; key = ((u64)addr << 32) | cpu_to_be16(rover); if (rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms)) continue; rs->rs_bound_key = key; rs->rs_bound_addr = addr; + net_get_random_once(&rs->rs_hash_initval, + sizeof(rs->rs_hash_initval)); rs->rs_bound_port = cpu_to_be16(rover); rs->rs_bound_node.next = NULL; rds_sock_addref(rs); diff --git a/net/rds/connection.c b/net/rds/connection.c index 19a4fee5f4dd..f5058559bb08 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -155,7 +155,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, struct hlist_head *head = rds_conn_bucket(laddr, faddr); struct rds_transport *loop_trans; unsigned long flags; - int ret; + int ret, i; rcu_read_lock(); conn = rds_conn_lookup(net, head, laddr, faddr, trans); @@ -211,6 +211,12 @@ static struct rds_connection *__rds_conn_create(struct net *net, conn->c_trans = trans; + init_waitqueue_head(&conn->c_hs_waitq); + for (i = 0; i < RDS_MPATH_WORKERS; i++) { + __rds_conn_path_init(conn, &conn->c_path[i], + is_outgoing); + conn->c_path[i].cp_index = i; + } ret = trans->conn_alloc(conn, gfp); if (ret) { kmem_cache_free(rds_conn_slab, conn); @@ -263,14 +269,6 @@ static struct rds_connection *__rds_conn_create(struct net *net, kmem_cache_free(rds_conn_slab, conn); conn = found; } else { - int i; - - for (i = 0; i < RDS_MPATH_WORKERS; i++) { - __rds_conn_path_init(conn, &conn->c_path[i], - is_outgoing); - conn->c_path[i].cp_index = i; - } - hlist_add_head_rcu(&conn->c_hash_node, head); rds_cong_add_conn(conn); rds_conn_count++; @@ -668,6 +666,7 @@ EXPORT_SYMBOL_GPL(rds_conn_path_drop); void rds_conn_drop(struct rds_connection *conn) { + WARN_ON(conn->c_trans->t_mp_capable); rds_conn_path_drop(&conn->c_path[0]); } EXPORT_SYMBOL_GPL(rds_conn_drop); diff --git a/net/rds/message.c b/net/rds/message.c index 756c73729126..6cb91061556a 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -41,6 +41,7 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { [RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version), [RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma), [RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest), +[RDS_EXTHDR_NPATHS] = sizeof(u16), }; diff --git a/net/rds/rds.h b/net/rds/rds.h index 6ef07bd27227..b2d17f0fafa8 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -85,7 +85,9 @@ enum { #define RDS_RECV_REFILL 3 /* Max number of multipaths per RDS connection. Must be a power of 2 */ -#define RDS_MPATH_WORKERS 1 +#define RDS_MPATH_WORKERS 8 +#define RDS_MPATH_HASH(rs, n) (jhash_1word((rs)->rs_bound_port, \ + (rs)->rs_hash_initval) & ((n) - 1)) /* Per mpath connection state */ struct rds_conn_path { @@ -131,7 +133,8 @@ struct rds_connection { __be32 c_laddr; __be32 c_faddr; unsigned int c_loopback:1, - c_pad_to_32:31; + c_ping_triggered:1, + c_pad_to_32:30; int c_npaths; struct rds_connection *c_passive; struct rds_transport *c_trans; @@ -147,6 +150,7 @@ struct rds_connection { unsigned long c_map_queued; struct rds_conn_path c_path[RDS_MPATH_WORKERS]; + wait_queue_head_t c_hs_waitq; /* handshake waitq */ }; static inline @@ -166,6 +170,17 @@ void rds_conn_net_set(struct rds_connection *conn, struct net *net) #define RDS_FLAG_RETRANSMITTED 0x04 #define RDS_MAX_ADV_CREDIT 255 +/* RDS_FLAG_PROBE_PORT is the reserved sport used for sending a ping + * probe to exchange control information before establishing a connection. + * Currently the control information that is exchanged is the number of + * supported paths. If the peer is a legacy (older kernel revision) peer, + * it would return a pong message without additional control information + * that would then alert the sender that the peer was an older rev. + */ +#define RDS_FLAG_PROBE_PORT 1 +#define RDS_HS_PROBE(sport, dport) \ + ((sport == RDS_FLAG_PROBE_PORT && dport == 0) || \ + (sport == 0 && dport == RDS_FLAG_PROBE_PORT)) /* * Maximum space available for extension headers. */ @@ -225,6 +240,11 @@ struct rds_ext_header_rdma_dest { __be32 h_rdma_offset; }; +/* Extension header announcing number of paths. + * Implicit length = 2 bytes. + */ +#define RDS_EXTHDR_NPATHS 4 + #define __RDS_EXTHDR_MAX 16 /* for now */ struct rds_incoming { @@ -545,6 +565,7 @@ struct rds_sock { /* Socket options - in case there will be more */ unsigned char rs_recverr, rs_cong_monitor; + u32 rs_hash_initval; }; static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) diff --git a/net/rds/recv.c b/net/rds/recv.c index fed53a6c2890..cbfabdf3ff48 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -156,6 +156,67 @@ static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock } } +static void rds_recv_hs_exthdrs(struct rds_header *hdr, + struct rds_connection *conn) +{ + unsigned int pos = 0, type, len; + union { + struct rds_ext_header_version version; + u16 rds_npaths; + } buffer; + + while (1) { + len = sizeof(buffer); + type = rds_message_next_extension(hdr, &pos, &buffer, &len); + if (type == RDS_EXTHDR_NONE) + break; + /* Process extension header here */ + switch (type) { + case RDS_EXTHDR_NPATHS: + conn->c_npaths = min_t(int, RDS_MPATH_WORKERS, + buffer.rds_npaths); + break; + default: + pr_warn_ratelimited("ignoring unknown exthdr type " + "0x%x\n", type); + } + } + /* if RDS_EXTHDR_NPATHS was not found, default to a single-path */ + conn->c_npaths = max_t(int, conn->c_npaths, 1); +} + +/* rds_start_mprds() will synchronously start multiple paths when appropriate. + * The scheme is based on the following rules: + * + * 1. rds_sendmsg on first connect attempt sends the probe ping, with the + * sender's npaths (s_npaths) + * 2. rcvr of probe-ping knows the mprds_paths = min(s_npaths, r_npaths). It + * sends back a probe-pong with r_npaths. After that, if rcvr is the + * smaller ip addr, it starts rds_conn_path_connect_if_down on all + * mprds_paths. + * 3. sender gets woken up, and can move to rds_conn_path_connect_if_down. + * If it is the smaller ipaddr, rds_conn_path_connect_if_down can be + * called after reception of the probe-pong on all mprds_paths. + * Otherwise (sender of probe-ping is not the smaller ip addr): just call + * rds_conn_path_connect_if_down on the hashed path. (see rule 4) + * 4. when cp_index > 0, rds_connect_worker must only trigger + * a connection if laddr < faddr. + * 5. sender may end up queuing the packet on the cp. will get sent out later. + * when connection is completed. + */ +static void rds_start_mprds(struct rds_connection *conn) +{ + int i; + struct rds_conn_path *cp; + + if (conn->c_npaths > 1 && conn->c_laddr < conn->c_faddr) { + for (i = 1; i < conn->c_npaths; i++) { + cp = &conn->c_path[i]; + rds_conn_path_connect_if_down(cp); + } + } +} + /* * The transport must make sure that this is serialized against other * rx and conn reset on this specific conn. @@ -232,6 +293,20 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, } rds_stats_inc(s_recv_ping); rds_send_pong(cp, inc->i_hdr.h_sport); + /* if this is a handshake ping, start multipath if necessary */ + if (RDS_HS_PROBE(inc->i_hdr.h_sport, inc->i_hdr.h_dport)) { + rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn); + rds_start_mprds(cp->cp_conn); + } + goto out; + } + + if (inc->i_hdr.h_dport == RDS_FLAG_PROBE_PORT && + inc->i_hdr.h_sport == 0) { + rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn); + /* if this is a handshake pong, start multipath if necessary */ + rds_start_mprds(cp->cp_conn); + wake_up(&cp->cp_conn->c_hs_waitq); goto out; } diff --git a/net/rds/send.c b/net/rds/send.c index 5a9caf1da896..896626b9a0ef 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -963,6 +963,29 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, return ret; } +static void rds_send_ping(struct rds_connection *conn); + +static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn) +{ + int hash; + + if (conn->c_npaths == 0) + hash = RDS_MPATH_HASH(rs, RDS_MPATH_WORKERS); + else + hash = RDS_MPATH_HASH(rs, conn->c_npaths); + if (conn->c_npaths == 0 && hash != 0) { + rds_send_ping(conn); + + if (conn->c_npaths == 0) { + wait_event_interruptible(conn->c_hs_waitq, + (conn->c_npaths != 0)); + } + if (conn->c_npaths == 1) + hash = 0; + } + return hash; +} + int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) { struct sock *sk = sock->sk; @@ -1075,7 +1098,10 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) goto out; } - cpath = &conn->c_path[0]; + if (conn->c_trans->t_mp_capable) + cpath = &conn->c_path[rds_send_mprds_hash(rs, conn)]; + else + cpath = &conn->c_path[0]; rds_conn_path_connect_if_down(cpath); @@ -1135,10 +1161,16 @@ out: } /* - * Reply to a ping packet. + * send out a probe. Can be shared by rds_send_ping, + * rds_send_pong, rds_send_hb. + * rds_send_hb should use h_flags + * RDS_FLAG_HB_PING|RDS_FLAG_ACK_REQUIRED + * or + * RDS_FLAG_HB_PONG|RDS_FLAG_ACK_REQUIRED */ int -rds_send_pong(struct rds_conn_path *cp, __be16 dport) +rds_send_probe(struct rds_conn_path *cp, __be16 sport, + __be16 dport, u8 h_flags) { struct rds_message *rm; unsigned long flags; @@ -1166,9 +1198,18 @@ rds_send_pong(struct rds_conn_path *cp, __be16 dport) rm->m_inc.i_conn = cp->cp_conn; rm->m_inc.i_conn_path = cp; - rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport, + rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, cp->cp_next_tx_seq); + rm->m_inc.i_hdr.h_flags |= h_flags; cp->cp_next_tx_seq++; + + if (RDS_HS_PROBE(sport, dport) && cp->cp_conn->c_trans->t_mp_capable) { + u16 npaths = RDS_MPATH_WORKERS; + + rds_message_add_extension(&rm->m_inc.i_hdr, + RDS_EXTHDR_NPATHS, &npaths, + sizeof(npaths)); + } spin_unlock_irqrestore(&cp->cp_lock, flags); rds_stats_inc(s_send_queued); @@ -1185,3 +1226,25 @@ out: rds_message_put(rm); return ret; } + +int +rds_send_pong(struct rds_conn_path *cp, __be16 dport) +{ + return rds_send_probe(cp, 0, dport, 0); +} + +void +rds_send_ping(struct rds_connection *conn) +{ + unsigned long flags; + struct rds_conn_path *cp = &conn->c_path[0]; + + spin_lock_irqsave(&cp->cp_lock, flags); + if (conn->c_ping_triggered) { + spin_unlock_irqrestore(&cp->cp_lock, flags); + return; + } + conn->c_ping_triggered = 1; + spin_unlock_irqrestore(&cp->cp_lock, flags); + rds_send_probe(&conn->c_path[0], RDS_FLAG_PROBE_PORT, 0, 0); +} diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 0a683cfc4f23..fcddacc92e01 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -38,7 +38,6 @@ #include #include -#include "rds_single_path.h" #include "rds.h" #include "tcp.h" @@ -358,6 +357,7 @@ struct rds_transport rds_tcp_transport = { .t_name = "tcp", .t_type = RDS_TRANS_TCP, .t_prefer_loopback = 1, + .t_mp_capable = 1, }; static int rds_tcp_netid; diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index c916715fbe61..05f61c533ed3 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -34,7 +34,6 @@ #include #include -#include "rds_single_path.h" #include "rds.h" #include "tcp.h" @@ -82,6 +81,12 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) struct rds_connection *conn = cp->cp_conn; struct rds_tcp_connection *tc = cp->cp_transport_data; + /* for multipath rds,we only trigger the connection after + * the handshake probe has determined the number of paths. + */ + if (cp->cp_index > 0 && cp->cp_conn->c_npaths < 2) + return -EAGAIN; + mutex_lock(&tc->t_conn_path_lock); if (rds_conn_path_up(cp)) { diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 73040e319e4b..e0b23fb5b8d5 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -35,7 +35,6 @@ #include #include -#include "rds_single_path.h" #include "rds.h" #include "tcp.h" @@ -71,6 +70,52 @@ bail: return ret; } +/* rds_tcp_accept_one_path(): if accepting on cp_index > 0, make sure the + * client's ipaddr < server's ipaddr. Otherwise, close the accepted + * socket and force a reconneect from smaller -> larger ip addr. The reason + * we special case cp_index 0 is to allow the rds probe ping itself to itself + * get through efficiently. + * Since reconnects are only initiated from the node with the numerically + * smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side + * by moving them to CONNECTING in this function. + */ +struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) +{ + int i; + bool peer_is_smaller = (conn->c_faddr < conn->c_laddr); + int npaths = conn->c_npaths; + + if (npaths <= 1) { + struct rds_conn_path *cp = &conn->c_path[0]; + int ret; + + ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, + RDS_CONN_CONNECTING); + if (!ret) + rds_conn_path_transition(cp, RDS_CONN_ERROR, + RDS_CONN_CONNECTING); + return cp->cp_transport_data; + } + + /* for mprds, paths with cp_index > 0 MUST be initiated by the peer + * with the smaller address. + */ + if (!peer_is_smaller) + return NULL; + + for (i = 1; i < npaths; i++) { + struct rds_conn_path *cp = &conn->c_path[i]; + + if (rds_conn_path_transition(cp, RDS_CONN_DOWN, + RDS_CONN_CONNECTING) || + rds_conn_path_transition(cp, RDS_CONN_ERROR, + RDS_CONN_CONNECTING)) { + return cp->cp_transport_data; + } + } + return NULL; +} + int rds_tcp_accept_one(struct socket *sock) { struct socket *new_sock = NULL; @@ -120,12 +165,14 @@ int rds_tcp_accept_one(struct socket *sock) * If the client reboots, this conn will need to be cleaned up. * rds_tcp_state_change() will do that cleanup */ - rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data; - cp = &conn->c_path[0]; - rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING); + rs_tcp = rds_tcp_accept_one_path(conn); + if (!rs_tcp) + goto rst_nsk; mutex_lock(&rs_tcp->t_conn_path_lock); - conn_state = rds_conn_state(conn); - if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP) + cp = rs_tcp->t_cpath; + conn_state = rds_conn_path_state(cp); + if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP && + conn_state != RDS_CONN_ERROR) goto rst_nsk; if (rs_tcp->t_sock) { /* Need to resolve a duelling SYN between peers. @@ -135,11 +182,11 @@ int rds_tcp_accept_one(struct socket *sock) * c_transport_data. */ if (ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr) || - !conn->c_path[0].cp_outgoing) { + !cp->cp_outgoing) { goto rst_nsk; } else { rds_tcp_reset_callbacks(new_sock, cp); - conn->c_path[0].cp_outgoing = 0; + cp->cp_outgoing = 0; /* rds_connect_path_complete() marks RDS_CONN_UP */ rds_connect_path_complete(cp, RDS_CONN_RESETTING); } diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 57e0f5826406..89d09b481f47 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -81,7 +81,8 @@ static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len) int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off) { - struct rds_tcp_connection *tc = conn->c_transport_data; + struct rds_conn_path *cp = rm->m_inc.i_conn_path; + struct rds_tcp_connection *tc = cp->cp_transport_data; int done = 0; int ret = 0; int more; @@ -150,10 +151,17 @@ out: rds_tcp_stats_inc(s_tcp_sndbuf_full); ret = 0; } else { - printk(KERN_WARNING "RDS/tcp: send to %pI4 " - "returned %d, disconnecting and reconnecting\n", - &conn->c_faddr, ret); - rds_conn_drop(conn); + /* No need to disconnect/reconnect if path_drop + * has already been triggered, because, e.g., of + * an incoming RST. + */ + if (rds_conn_path_up(cp)) { + pr_warn("RDS/tcp: send to %pI4 on cp [%d]" + "returned %d, " + "disconnecting and reconnecting\n", + &conn->c_faddr, cp->cp_index, ret); + rds_conn_path_drop(cp); + } } } if (done == 0) diff --git a/net/rds/threads.c b/net/rds/threads.c index bc97d67f29cc..e42df11bf30a 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -156,6 +156,8 @@ void rds_connect_worker(struct work_struct *work) struct rds_connection *conn = cp->cp_conn; int ret; + if (cp->cp_index > 1 && cp->cp_conn->c_laddr > cp->cp_conn->c_faddr) + return; clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING); if (ret) { -- cgit v1.2.3 From 7acef60455c4814a52afb8629d166a3b4dfa0ebb Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 14 Jul 2016 15:47:01 +0100 Subject: rxrpc: checking for IS_ERR() instead of NULL The rxrpc_lookup_peer() function returns NULL on error, it never returns error pointers. Fixes: 8496af50eb38 ('rxrpc: Use RCU to access a peer's service connection tree') Signed-off-by: Dan Carpenter Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/conn_service.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index 7cbd612be0d7..fd9027ccba8f 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -163,7 +163,7 @@ struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *local, if (!peer) { peer = rxrpc_lookup_peer(local, srx, GFP_NOIO); - if (IS_ERR(peer)) + if (!peer) goto enomem; } -- cgit v1.2.3 From 555c8a8623a3a87b3c990ba30b7fd2e5914e41d2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 14 Jul 2016 18:08:05 +0200 Subject: bpf: avoid stack copy and use skb ctx for event output This work addresses a couple of issues bpf_skb_event_output() helper currently has: i) We need two copies instead of just a single one for the skb data when it should be part of a sample. The data can be non-linear and thus needs to be extracted via bpf_skb_load_bytes() helper first, and then copied once again into the ring buffer slot. ii) Since bpf_skb_load_bytes() currently needs to be used first, the helper needs to see a constant size on the passed stack buffer to make sure BPF verifier can do sanity checks on it during verification time. Thus, just passing skb->len (or any other non-constant value) wouldn't work, but changing bpf_skb_load_bytes() is also not the proper solution, since the two copies are generally still needed. iii) bpf_skb_load_bytes() is just for rather small buffers like headers, since they need to sit on the limited BPF stack anyway. Instead of working around in bpf_skb_load_bytes(), this work improves the bpf_skb_event_output() helper to address all 3 at once. We can make use of the passed in skb context that we have in the helper anyway, and use some of the reserved flag bits as a length argument. The helper will use the new __output_custom() facility from perf side with bpf_skb_copy() as callback helper to walk and extract the data. It will pass the data for setup to bpf_event_output(), which generates and pushes the raw record with an additional frag part. The linear data used in the first frag of the record serves as programmatically defined meta data passed along with the appended sample. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 10c4a2f9e8bb..22e3992c8b48 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2025,6 +2025,47 @@ bool bpf_helper_changes_skb_data(void *func) return false; } +static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, + unsigned long len) +{ + void *ptr = skb_header_pointer(skb, 0, len, dst_buff); + + if (unlikely(!ptr)) + return len; + if (ptr != dst_buff) + memcpy(dst_buff, ptr, len); + + return 0; +} + +static u64 bpf_skb_event_output(u64 r1, u64 r2, u64 flags, u64 r4, + u64 meta_size) +{ + struct sk_buff *skb = (struct sk_buff *)(long) r1; + struct bpf_map *map = (struct bpf_map *)(long) r2; + u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; + void *meta = (void *)(long) r4; + + if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) + return -EINVAL; + if (unlikely(skb_size > skb->len)) + return -EFAULT; + + return bpf_event_output(map, flags, meta, meta_size, skb, skb_size, + bpf_skb_copy); +} + +static const struct bpf_func_proto bpf_skb_event_output_proto = { + .func = bpf_skb_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_STACK, + .arg5_type = ARG_CONST_STACK_SIZE, +}; + static unsigned short bpf_tunnel_key_af(u64 flags) { return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; @@ -2357,7 +2398,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) case BPF_FUNC_get_hash_recalc: return &bpf_get_hash_recalc_proto; case BPF_FUNC_perf_event_output: - return bpf_get_event_output_proto(); + return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; #ifdef CONFIG_SOCK_CGROUP_DATA -- cgit v1.2.3 From c380d37e97e783e36a924279fbd2f6837508546a Mon Sep 17 00:00:00 2001 From: Richard Sailer Date: Sat, 16 Jul 2016 04:04:34 +0200 Subject: tcp_timer.c: Add kernel-doc function descriptions This adds kernel-doc style descriptions for 6 functions and fixes 1 typo. Signed-off-by: Richard Sailer Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 81 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 64 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index debdd8b33e69..d84930b2dd95 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -24,6 +24,13 @@ int sysctl_tcp_thin_linear_timeouts __read_mostly; +/** + * tcp_write_err() - close socket and save error info + * @sk: The socket the error has appeared on. + * + * Returns: Nothing (void) + */ + static void tcp_write_err(struct sock *sk) { sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; @@ -33,16 +40,21 @@ static void tcp_write_err(struct sock *sk) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT); } -/* Do not allow orphaned sockets to eat all our resources. - * This is direct violation of TCP specs, but it is required - * to prevent DoS attacks. It is called when a retransmission timeout - * or zero probe timeout occurs on orphaned socket. +/** + * tcp_out_of_resources() - Close socket if out of resources + * @sk: pointer to current socket + * @do_reset: send a last packet with reset flag * - * Criteria is still not confirmed experimentally and may change. - * We kill the socket, if: - * 1. If number of orphaned sockets exceeds an administratively configured - * limit. - * 2. If we have strong memory pressure. + * Do not allow orphaned sockets to eat all our resources. + * This is direct violation of TCP specs, but it is required + * to prevent DoS attacks. It is called when a retransmission timeout + * or zero probe timeout occurs on orphaned socket. + * + * Criteria is still not confirmed experimentally and may change. + * We kill the socket, if: + * 1. If number of orphaned sockets exceeds an administratively configured + * limit. + * 2. If we have strong memory pressure. */ static int tcp_out_of_resources(struct sock *sk, bool do_reset) { @@ -74,7 +86,11 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) return 0; } -/* Calculate maximal number or retries on an orphaned socket. */ +/** + * tcp_orphan_retries() - Returns maximal number of retries on an orphaned socket + * @sk: Pointer to the current socket. + * @alive: bool, socket alive state + */ static int tcp_orphan_retries(struct sock *sk, bool alive) { int retries = sock_net(sk)->ipv4.sysctl_tcp_orphan_retries; /* May be zero. */ @@ -115,10 +131,22 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) } } -/* This function calculates a "timeout" which is equivalent to the timeout of a - * TCP connection after "boundary" unsuccessful, exponentially backed-off + +/** + * retransmits_timed_out() - returns true if this connection has timed out + * @sk: The current socket + * @boundary: max number of retransmissions + * @timeout: A custom timeout value. + * If set to 0 the default timeout is calculated and used. + * Using TCP_RTO_MIN and the number of unsuccessful retransmits. + * @syn_set: true if the SYN Bit was set. + * + * The default "timeout" value this function can calculate and use + * is equivalent to the timeout of a TCP Connection + * after "boundary" unsuccessful, exponentially backed-off * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if * syn_set flag is set. + * */ static bool retransmits_timed_out(struct sock *sk, unsigned int boundary, @@ -257,6 +285,16 @@ out: sk_mem_reclaim(sk); } + +/** + * tcp_delack_timer() - The TCP delayed ACK timeout handler + * @data: Pointer to the current socket. (gets casted to struct sock *) + * + * This function gets (indirectly) called when the kernel timer for a TCP packet + * of this socket expires. Calls tcp_delack_timer_handler() to do the actual work. + * + * Returns: Nothing (void) + */ static void tcp_delack_timer(unsigned long data) { struct sock *sk = (struct sock *)data; @@ -350,10 +388,18 @@ static void tcp_fastopen_synack_timer(struct sock *sk) TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); } -/* - * The TCP retransmit timer. - */ +/** + * tcp_retransmit_timer() - The TCP retransmit timeout handler + * @sk: Pointer to the current socket. + * + * This function gets called when the kernel timer for a TCP packet + * of this socket expires. + * + * It handles retransmission, timer adjustment and other necesarry measures. + * + * Returns: Nothing (void) + */ void tcp_retransmit_timer(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -494,7 +540,8 @@ out_reset_timer: out:; } -/* Called with BH disabled */ +/* Called with bottom-half processing disabled. + Called by tcp_write_timer() */ void tcp_write_timer_handler(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -539,7 +586,7 @@ static void tcp_write_timer(unsigned long data) if (!sock_owned_by_user(sk)) { tcp_write_timer_handler(sk); } else { - /* deleguate our work to tcp_release_cb() */ + /* delegate our work to tcp_release_cb() */ if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) sock_hold(sk); } -- cgit v1.2.3 From 46c0772d85306f2edec03c8fa40a6efa6af915bc Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 14 Jul 2016 06:09:59 +0300 Subject: net: bridge: minor style adjustments in br_handle_frame_finish Trivial style changes in br_handle_frame_finish. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_input.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index a7817e6f306f..0b6d32619468 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -131,11 +131,11 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, /* note: already called with rcu_read_lock */ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - const unsigned char *dest = eth_hdr(skb)->h_dest; struct net_bridge_port *p = br_port_get_rcu(skb->dev); - struct net_bridge *br; - struct net_bridge_fdb_entry *dst; + const unsigned char *dest = eth_hdr(skb)->h_dest; + struct net_bridge_fdb_entry *dst = NULL; struct net_bridge_mdb_entry *mdst; + struct net_bridge *br; struct sk_buff *skb2; bool unicast = true; u16 vid = 0; @@ -166,8 +166,6 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (br->dev->flags & IFF_PROMISC) skb2 = skb; - dst = NULL; - if (IS_ENABLED(CONFIG_INET) && skb->protocol == htons(ETH_P_ARP)) br_do_proxy_arp(skb, br, vid, p); @@ -185,13 +183,12 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb skb = NULL; if (!skb2) goto out; - } else + } else { skb2 = skb; - + } unicast = false; br->dev->stats.multicast++; - } else if ((dst = __br_fdb_get(br, dest, vid)) && - dst->is_local) { + } else if ((dst = __br_fdb_get(br, dest, vid)) && dst->is_local) { skb2 = skb; /* Do not forward the packet since it's local. */ skb = NULL; @@ -201,8 +198,9 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (dst) { dst->used = jiffies; br_forward(dst->dst, skb, skb2); - } else + } else { br_flood_forward(br, skb, skb2, unicast); + } } if (skb2) -- cgit v1.2.3 From e151aab9b5b3fae96b0fcd6cbe3a7f952d6cb8f8 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 14 Jul 2016 06:10:00 +0300 Subject: net: bridge: rearrange flood vs unicast receive paths This patch removes one conditional from the unicast path by using the fact that skb is NULL only when the packet is multicast or is local. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_input.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 0b6d32619468..c20c5be6fc22 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -134,10 +134,10 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb struct net_bridge_port *p = br_port_get_rcu(skb->dev); const unsigned char *dest = eth_hdr(skb)->h_dest; struct net_bridge_fdb_entry *dst = NULL; + bool mcast_hit = false, unicast = true; struct net_bridge_mdb_entry *mdst; struct net_bridge *br; struct sk_buff *skb2; - bool unicast = true; u16 vid = 0; if (!p || p->state == BR_STATE_DISABLED) @@ -177,30 +177,29 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && br_multicast_querier_exists(br, eth_hdr(skb))) { if ((mdst && mdst->mglist) || - br_multicast_is_router(br)) + br_multicast_is_router(br)) { skb2 = skb; - br_multicast_forward(mdst, skb, skb2); - skb = NULL; - if (!skb2) - goto out; + br->dev->stats.multicast++; + } + mcast_hit = true; } else { skb2 = skb; + br->dev->stats.multicast++; } unicast = false; - br->dev->stats.multicast++; } else if ((dst = __br_fdb_get(br, dest, vid)) && dst->is_local) { - skb2 = skb; /* Do not forward the packet since it's local. */ - skb = NULL; + return br_pass_frame_up(skb); } - if (skb) { - if (dst) { - dst->used = jiffies; - br_forward(dst->dst, skb, skb2); - } else { + if (dst) { + dst->used = jiffies; + br_forward(dst->dst, skb, skb2); + } else { + if (!mcast_hit) br_flood_forward(br, skb, skb2, unicast); - } + else + br_multicast_forward(mdst, skb, skb2); } if (skb2) -- cgit v1.2.3 From b35c5f632b630183396a2ea2e2247ff8bbf2c94f Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 14 Jul 2016 06:10:01 +0300 Subject: net: bridge: drop skb2/skb0 variables and use a local_rcv boolean Currently if the packet is going to be received locally we set skb0 or sometimes called skb2 variables to the original skb. This can get confusing and also we can avoid one conditional on the fast path by simply using a boolean and passing it around. Thanks to Roopa for the name suggestion. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_forward.c | 35 ++++++++++++++++++----------------- net/bridge/br_input.c | 25 ++++++++++--------------- net/bridge/br_private.h | 10 +++++----- 3 files changed, 33 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index d610644368b9..204f99304a8a 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -138,17 +138,18 @@ void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) EXPORT_SYMBOL_GPL(br_deliver); /* called with rcu_read_lock */ -void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, struct sk_buff *skb0) +void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, + bool local_rcv) { if (to && should_deliver(to, skb)) { - if (skb0) + if (local_rcv) deliver_clone(to, skb, __br_forward); else __br_forward(to, skb); return; } - if (!skb0) + if (!local_rcv) kfree_skb(skb); } @@ -193,10 +194,9 @@ out: /* called under bridge lock */ static void br_flood(struct net_bridge *br, struct sk_buff *skb, - struct sk_buff *skb0, void (*__packet_hook)(const struct net_bridge_port *p, struct sk_buff *skb), - bool unicast) + bool local_rcv, bool unicast) { u8 igmp_type = br_multicast_igmp_type(skb); struct net_bridge_port *prev; @@ -227,14 +227,14 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb, if (!prev) goto out; - if (skb0) + if (local_rcv) deliver_clone(prev, skb, __packet_hook); else __packet_hook(prev, skb); return; out: - if (!skb0) + if (!local_rcv) kfree_skb(skb); } @@ -242,23 +242,24 @@ out: /* called with rcu_read_lock */ void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb, bool unicast) { - br_flood(br, skb, NULL, __br_deliver, unicast); + br_flood(br, skb, __br_deliver, false, unicast); } /* called under bridge lock */ void br_flood_forward(struct net_bridge *br, struct sk_buff *skb, - struct sk_buff *skb2, bool unicast) + bool local_rcv, bool unicast) { - br_flood(br, skb, skb2, __br_forward, unicast); + br_flood(br, skb, __br_forward, local_rcv, unicast); } #ifdef CONFIG_BRIDGE_IGMP_SNOOPING /* called with rcu_read_lock */ static void br_multicast_flood(struct net_bridge_mdb_entry *mdst, - struct sk_buff *skb, struct sk_buff *skb0, + struct sk_buff *skb, void (*__packet_hook)( const struct net_bridge_port *p, - struct sk_buff *skb)) + struct sk_buff *skb), + bool local_rcv) { struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev; u8 igmp_type = br_multicast_igmp_type(skb); @@ -295,14 +296,14 @@ static void br_multicast_flood(struct net_bridge_mdb_entry *mdst, if (!prev) goto out; - if (skb0) + if (local_rcv) deliver_clone(prev, skb, __packet_hook); else __packet_hook(prev, skb); return; out: - if (!skb0) + if (!local_rcv) kfree_skb(skb); } @@ -310,13 +311,13 @@ out: void br_multicast_deliver(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb) { - br_multicast_flood(mdst, skb, NULL, __br_deliver); + br_multicast_flood(mdst, skb, __br_deliver, false); } /* called with rcu_read_lock */ void br_multicast_forward(struct net_bridge_mdb_entry *mdst, - struct sk_buff *skb, struct sk_buff *skb2) + struct sk_buff *skb, bool local_rcv) { - br_multicast_flood(mdst, skb, skb2, __br_forward); + br_multicast_flood(mdst, skb, __br_forward, local_rcv); } #endif diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index c20c5be6fc22..dd8885def11b 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -131,13 +131,12 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, /* note: already called with rcu_read_lock */ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { + bool local_rcv = false, mcast_hit = false, unicast = true; struct net_bridge_port *p = br_port_get_rcu(skb->dev); const unsigned char *dest = eth_hdr(skb)->h_dest; struct net_bridge_fdb_entry *dst = NULL; - bool mcast_hit = false, unicast = true; struct net_bridge_mdb_entry *mdst; struct net_bridge *br; - struct sk_buff *skb2; u16 vid = 0; if (!p || p->state == BR_STATE_DISABLED) @@ -160,17 +159,13 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb BR_INPUT_SKB_CB(skb)->brdev = br->dev; - /* The packet skb2 goes to the local host (NULL to skip). */ - skb2 = NULL; - - if (br->dev->flags & IFF_PROMISC) - skb2 = skb; + local_rcv = !!(br->dev->flags & IFF_PROMISC); if (IS_ENABLED(CONFIG_INET) && skb->protocol == htons(ETH_P_ARP)) br_do_proxy_arp(skb, br, vid, p); if (is_broadcast_ether_addr(dest)) { - skb2 = skb; + local_rcv = true; unicast = false; } else if (is_multicast_ether_addr(dest)) { mdst = br_mdb_get(br, skb, vid); @@ -178,12 +173,12 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb br_multicast_querier_exists(br, eth_hdr(skb))) { if ((mdst && mdst->mglist) || br_multicast_is_router(br)) { - skb2 = skb; + local_rcv = true; br->dev->stats.multicast++; } mcast_hit = true; } else { - skb2 = skb; + local_rcv = true; br->dev->stats.multicast++; } unicast = false; @@ -194,16 +189,16 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (dst) { dst->used = jiffies; - br_forward(dst->dst, skb, skb2); + br_forward(dst->dst, skb, local_rcv); } else { if (!mcast_hit) - br_flood_forward(br, skb, skb2, unicast); + br_flood_forward(br, skb, local_rcv, unicast); else - br_multicast_forward(mdst, skb, skb2); + br_multicast_forward(mdst, skb, local_rcv); } - if (skb2) - return br_pass_frame_up(skb2); + if (local_rcv) + return br_pass_frame_up(skb); out: return 0; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 40f200947ddc..4d6cdf459e57 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -507,12 +507,12 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, /* br_forward.c */ void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb); int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb); -void br_forward(const struct net_bridge_port *to, - struct sk_buff *skb, struct sk_buff *skb0); +void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, + bool local_rcv); int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb); void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb, bool unicast); void br_flood_forward(struct net_bridge *br, struct sk_buff *skb, - struct sk_buff *skb2, bool unicast); + bool local_rcv, bool unicast); /* br_if.c */ void br_port_carrier_check(struct net_bridge_port *p); @@ -563,7 +563,7 @@ void br_multicast_dev_del(struct net_bridge *br); void br_multicast_deliver(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb); void br_multicast_forward(struct net_bridge_mdb_entry *mdst, - struct sk_buff *skb, struct sk_buff *skb2); + struct sk_buff *skb, bool local_rcv); int br_multicast_set_router(struct net_bridge *br, unsigned long val); int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val); int br_multicast_toggle(struct net_bridge *br, unsigned long val); @@ -698,7 +698,7 @@ static inline void br_multicast_deliver(struct net_bridge_mdb_entry *mdst, static inline void br_multicast_forward(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb, - struct sk_buff *skb2) + bool local_rcv) { } static inline bool br_multicast_is_router(struct net_bridge *br) -- cgit v1.2.3 From 37b090e6be2dc98ccb55bb663931546282abf2e8 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 14 Jul 2016 06:10:02 +0300 Subject: net: bridge: remove _deliver functions and consolidate forward code Before this patch we had two flavors of most forwarding functions - _forward and _deliver, the difference being that the latter are used when the packets are locally originated. Instead of all this function pointer passing and code duplication, we can just pass a boolean noting that the packet was locally originated and use that to perform the necessary checks in __br_forward. This gives a minor performance improvement but more importantly consolidates the forwarding paths. Also add a kernel doc comment to explain the exported br_forward()'s arguments. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_device.c | 22 ++-- net/bridge/br_forward.c | 184 +++++++++++-------------------- net/bridge/br_input.c | 6 +- net/bridge/br_private.h | 27 ++--- net/bridge/netfilter/nft_reject_bridge.c | 8 +- 5 files changed, 94 insertions(+), 153 deletions(-) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 8eecd0ec22f2..09f26940aba5 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -61,11 +61,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid)) goto out; - if (is_broadcast_ether_addr(dest)) - br_flood_deliver(br, skb, false); - else if (is_multicast_ether_addr(dest)) { + if (is_broadcast_ether_addr(dest)) { + br_flood(br, skb, false, false, true); + } else if (is_multicast_ether_addr(dest)) { if (unlikely(netpoll_tx_running(dev))) { - br_flood_deliver(br, skb, false); + br_flood(br, skb, false, false, true); goto out; } if (br_multicast_rcv(br, NULL, skb, vid)) { @@ -76,14 +76,14 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) mdst = br_mdb_get(br, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && br_multicast_querier_exists(br, eth_hdr(skb))) - br_multicast_deliver(mdst, skb); + br_multicast_flood(mdst, skb, false, true); else - br_flood_deliver(br, skb, false); - } else if ((dst = __br_fdb_get(br, dest, vid)) != NULL) - br_deliver(dst->dst, skb); - else - br_flood_deliver(br, skb, true); - + br_flood(br, skb, false, false, true); + } else if ((dst = __br_fdb_get(br, dest, vid)) != NULL) { + br_forward(dst->dst, skb, false, true); + } else { + br_flood(br, skb, true, false, true); + } out: rcu_read_unlock(); return NETDEV_TX_OK; diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 204f99304a8a..63a83d8d7da3 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -21,11 +21,6 @@ #include #include "br_private.h" -static int deliver_clone(const struct net_bridge_port *prev, - struct sk_buff *skb, - void (*__packet_hook)(const struct net_bridge_port *p, - struct sk_buff *skb)); - /* Don't forward packets to originating port or forwarding disabled */ static inline int should_deliver(const struct net_bridge_port *p, const struct sk_buff *skb) @@ -75,106 +70,92 @@ int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(br_forward_finish); -static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) +static void __br_forward(const struct net_bridge_port *to, + struct sk_buff *skb, bool local_orig) { struct net_bridge_vlan_group *vg; + struct net_device *indev; + struct net *net; + int br_hook; vg = nbp_vlan_group_rcu(to); skb = br_handle_vlan(to->br, vg, skb); if (!skb) return; + indev = skb->dev; skb->dev = to->dev; - - if (unlikely(netpoll_tx_running(to->br->dev))) { - if (!is_skb_forwardable(skb->dev, skb)) + if (!local_orig) { + if (skb_warn_if_lro(skb)) { kfree_skb(skb); - else { - skb_push(skb, ETH_HLEN); - br_netpoll_send_skb(to, skb); + return; } - return; + br_hook = NF_BR_FORWARD; + skb_forward_csum(skb); + net = dev_net(indev); + } else { + if (unlikely(netpoll_tx_running(to->br->dev))) { + if (!is_skb_forwardable(skb->dev, skb)) { + kfree_skb(skb); + } else { + skb_push(skb, ETH_HLEN); + br_netpoll_send_skb(to, skb); + } + return; + } + br_hook = NF_BR_LOCAL_OUT; + net = dev_net(skb->dev); + indev = NULL; } - NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, - dev_net(skb->dev), NULL, skb,NULL, skb->dev, + NF_HOOK(NFPROTO_BRIDGE, br_hook, + net, NULL, skb, indev, skb->dev, br_forward_finish); } -static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb) +static int deliver_clone(const struct net_bridge_port *prev, + struct sk_buff *skb, bool local_orig) { - struct net_bridge_vlan_group *vg; - struct net_device *indev; - - if (skb_warn_if_lro(skb)) { - kfree_skb(skb); - return; - } - - vg = nbp_vlan_group_rcu(to); - skb = br_handle_vlan(to->br, vg, skb); - if (!skb) - return; - - indev = skb->dev; - skb->dev = to->dev; - skb_forward_csum(skb); - - NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD, - dev_net(indev), NULL, skb, indev, skb->dev, - br_forward_finish); -} + struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev; -/* called with rcu_read_lock */ -void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) -{ - if (to && should_deliver(to, skb)) { - __br_deliver(to, skb); - return; + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) { + dev->stats.tx_dropped++; + return -ENOMEM; } - kfree_skb(skb); + __br_forward(prev, skb, local_orig); + return 0; } -EXPORT_SYMBOL_GPL(br_deliver); -/* called with rcu_read_lock */ -void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, - bool local_rcv) +/** + * br_forward - forward a packet to a specific port + * @to: destination port + * @skb: packet being forwarded + * @local_rcv: packet will be received locally after forwarding + * @local_orig: packet is locally originated + * + * Should be called with rcu_read_lock. + */ +void br_forward(const struct net_bridge_port *to, + struct sk_buff *skb, bool local_rcv, bool local_orig) { if (to && should_deliver(to, skb)) { if (local_rcv) - deliver_clone(to, skb, __br_forward); + deliver_clone(to, skb, local_orig); else - __br_forward(to, skb); + __br_forward(to, skb, local_orig); return; } if (!local_rcv) kfree_skb(skb); } - -static int deliver_clone(const struct net_bridge_port *prev, - struct sk_buff *skb, - void (*__packet_hook)(const struct net_bridge_port *p, - struct sk_buff *skb)) -{ - struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev; - - skb = skb_clone(skb, GFP_ATOMIC); - if (!skb) { - dev->stats.tx_dropped++; - return -ENOMEM; - } - - __packet_hook(prev, skb); - return 0; -} +EXPORT_SYMBOL_GPL(br_forward); static struct net_bridge_port *maybe_deliver( struct net_bridge_port *prev, struct net_bridge_port *p, - struct sk_buff *skb, - void (*__packet_hook)(const struct net_bridge_port *p, - struct sk_buff *skb)) + struct sk_buff *skb, bool local_orig) { int err; @@ -184,7 +165,7 @@ static struct net_bridge_port *maybe_deliver( if (!prev) goto out; - err = deliver_clone(prev, skb, __packet_hook); + err = deliver_clone(prev, skb, local_orig); if (err) return ERR_PTR(err); @@ -192,18 +173,14 @@ out: return p; } -/* called under bridge lock */ -static void br_flood(struct net_bridge *br, struct sk_buff *skb, - void (*__packet_hook)(const struct net_bridge_port *p, - struct sk_buff *skb), - bool local_rcv, bool unicast) +/* called under rcu_read_lock */ +void br_flood(struct net_bridge *br, struct sk_buff *skb, + bool unicast, bool local_rcv, bool local_orig) { u8 igmp_type = br_multicast_igmp_type(skb); - struct net_bridge_port *prev; + struct net_bridge_port *prev = NULL; struct net_bridge_port *p; - prev = NULL; - list_for_each_entry_rcu(p, &br->port_list, list) { /* Do not flood unicast traffic to ports that turn it off */ if (unicast && !(p->flags & BR_FLOOD)) @@ -216,7 +193,7 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb, BR_INPUT_SKB_CB(skb)->proxyarp_replied) continue; - prev = maybe_deliver(prev, p, skb, __packet_hook); + prev = maybe_deliver(prev, p, skb, local_orig); if (IS_ERR(prev)) goto out; if (prev == p) @@ -228,9 +205,9 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb, goto out; if (local_rcv) - deliver_clone(prev, skb, __packet_hook); + deliver_clone(prev, skb, local_orig); else - __packet_hook(prev, skb); + __br_forward(prev, skb, local_orig); return; out: @@ -238,28 +215,11 @@ out: kfree_skb(skb); } - -/* called with rcu_read_lock */ -void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb, bool unicast) -{ - br_flood(br, skb, __br_deliver, false, unicast); -} - -/* called under bridge lock */ -void br_flood_forward(struct net_bridge *br, struct sk_buff *skb, - bool local_rcv, bool unicast) -{ - br_flood(br, skb, __br_forward, local_rcv, unicast); -} - #ifdef CONFIG_BRIDGE_IGMP_SNOOPING /* called with rcu_read_lock */ -static void br_multicast_flood(struct net_bridge_mdb_entry *mdst, - struct sk_buff *skb, - void (*__packet_hook)( - const struct net_bridge_port *p, - struct sk_buff *skb), - bool local_rcv) +void br_multicast_flood(struct net_bridge_mdb_entry *mdst, + struct sk_buff *skb, + bool local_rcv, bool local_orig) { struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev; u8 igmp_type = br_multicast_igmp_type(skb); @@ -280,7 +240,7 @@ static void br_multicast_flood(struct net_bridge_mdb_entry *mdst, port = (unsigned long)lport > (unsigned long)rport ? lport : rport; - prev = maybe_deliver(prev, port, skb, __packet_hook); + prev = maybe_deliver(prev, port, skb, local_orig); if (IS_ERR(prev)) goto out; if (prev == port) @@ -297,27 +257,13 @@ static void br_multicast_flood(struct net_bridge_mdb_entry *mdst, goto out; if (local_rcv) - deliver_clone(prev, skb, __packet_hook); + deliver_clone(prev, skb, local_orig); else - __packet_hook(prev, skb); + __br_forward(prev, skb, local_orig); return; out: if (!local_rcv) kfree_skb(skb); } - -/* called with rcu_read_lock */ -void br_multicast_deliver(struct net_bridge_mdb_entry *mdst, - struct sk_buff *skb) -{ - br_multicast_flood(mdst, skb, __br_deliver, false); -} - -/* called with rcu_read_lock */ -void br_multicast_forward(struct net_bridge_mdb_entry *mdst, - struct sk_buff *skb, bool local_rcv) -{ - br_multicast_flood(mdst, skb, __br_forward, local_rcv); -} #endif diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index dd8885def11b..8b08eec763a5 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -189,12 +189,12 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (dst) { dst->used = jiffies; - br_forward(dst->dst, skb, local_rcv); + br_forward(dst->dst, skb, local_rcv, false); } else { if (!mcast_hit) - br_flood_forward(br, skb, local_rcv, unicast); + br_flood(br, skb, unicast, local_rcv, false); else - br_multicast_forward(mdst, skb, local_rcv); + br_multicast_flood(mdst, skb, local_rcv, false); } if (local_rcv) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 4d6cdf459e57..b3088264f844 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -505,14 +505,12 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid); /* br_forward.c */ -void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb); int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb); void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, - bool local_rcv); + bool local_rcv, bool local_orig); int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb); -void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb, bool unicast); -void br_flood_forward(struct net_bridge *br, struct sk_buff *skb, - bool local_rcv, bool unicast); +void br_flood(struct net_bridge *br, struct sk_buff *skb, + bool unicast, bool local_rcv, bool local_orig); /* br_if.c */ void br_port_carrier_check(struct net_bridge_port *p); @@ -560,10 +558,8 @@ void br_multicast_init(struct net_bridge *br); void br_multicast_open(struct net_bridge *br); void br_multicast_stop(struct net_bridge *br); void br_multicast_dev_del(struct net_bridge *br); -void br_multicast_deliver(struct net_bridge_mdb_entry *mdst, - struct sk_buff *skb); -void br_multicast_forward(struct net_bridge_mdb_entry *mdst, - struct sk_buff *skb, bool local_rcv); +void br_multicast_flood(struct net_bridge_mdb_entry *mdst, + struct sk_buff *skb, bool local_rcv, bool local_orig); int br_multicast_set_router(struct net_bridge *br, unsigned long val); int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val); int br_multicast_toggle(struct net_bridge *br, unsigned long val); @@ -691,28 +687,27 @@ static inline void br_multicast_dev_del(struct net_bridge *br) { } -static inline void br_multicast_deliver(struct net_bridge_mdb_entry *mdst, - struct sk_buff *skb) +static inline void br_multicast_flood(struct net_bridge_mdb_entry *mdst, + struct sk_buff *skb, + bool local_rcv, bool local_orig) { } -static inline void br_multicast_forward(struct net_bridge_mdb_entry *mdst, - struct sk_buff *skb, - bool local_rcv) -{ -} static inline bool br_multicast_is_router(struct net_bridge *br) { return 0; } + static inline bool br_multicast_querier_exists(struct net_bridge *br, struct ethhdr *eth) { return false; } + static inline void br_mdb_init(void) { } + static inline void br_mdb_uninit(void) { } diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index 77f7e7a9ebe1..0b77ffbc27d6 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -72,7 +72,7 @@ static void nft_reject_br_send_v4_tcp_reset(struct net *net, nft_reject_br_push_etherhdr(oldskb, nskb); - br_deliver(br_port_get_rcu(dev), nskb); + br_forward(br_port_get_rcu(dev), nskb, false, true); } static void nft_reject_br_send_v4_unreach(struct net *net, @@ -140,7 +140,7 @@ static void nft_reject_br_send_v4_unreach(struct net *net, nft_reject_br_push_etherhdr(oldskb, nskb); - br_deliver(br_port_get_rcu(dev), nskb); + br_forward(br_port_get_rcu(dev), nskb, false, true); } static void nft_reject_br_send_v6_tcp_reset(struct net *net, @@ -174,7 +174,7 @@ static void nft_reject_br_send_v6_tcp_reset(struct net *net, nft_reject_br_push_etherhdr(oldskb, nskb); - br_deliver(br_port_get_rcu(dev), nskb); + br_forward(br_port_get_rcu(dev), nskb, false, true); } static bool reject6_br_csum_ok(struct sk_buff *skb, int hook) @@ -255,7 +255,7 @@ static void nft_reject_br_send_v6_unreach(struct net *net, nft_reject_br_push_etherhdr(oldskb, nskb); - br_deliver(br_port_get_rcu(dev), nskb); + br_forward(br_port_get_rcu(dev), nskb, false, true); } static void nft_reject_bridge_eval(const struct nft_expr *expr, -- cgit v1.2.3 From 43b9e127406079d187794a5140a2411fbc6df2df Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 14 Jul 2016 19:28:27 +0300 Subject: net: ipmr/ip6mr: add support for keeping an entry age In preparation for hardware offloading of ipmr/ip6mr we need an interface that allows to check (and later update) the age of entries. Relying on stats alone can show activity but not actual age of the entry, furthermore when there're tens of thousands of entries a lot of the hardware implementations only support "hit" bits which are cleared on read to denote that the entry was active and shouldn't be aged out, these can then be naturally translated into age timestamp and will be compatible with the software forwarding age. Using a lastuse entry doesn't affect performance because the members in that cache line are written to along with the age. Since all new users are encouraged to use ipmr via netlink, this is exported via the RTA_EXPIRES attribute. Also do a minor local variable declaration style adjustment - arrange them longest to shortest. Signed-off-by: Nikolay Aleksandrov CC: Roopa Prabhu CC: Shrijeet Mukherjee CC: Satish Ashok CC: Donald Sharp CC: David S. Miller CC: Alexey Kuznetsov CC: James Morris CC: Hideaki YOSHIFUJI CC: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 13 +++++++++---- net/ipv6/ip6mr.c | 13 +++++++++---- 2 files changed, 18 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 5ad48ec77710..e0d76f5f0113 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1150,6 +1150,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, c->mfc_origin = mfc->mfcc_origin.s_addr; c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr; c->mfc_parent = mfc->mfcc_parent; + c->mfc_un.res.lastuse = jiffies; ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls); if (!mrtsock) c->mfc_flags |= MFC_STATIC; @@ -1792,6 +1793,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, vif = cache->mfc_parent; cache->mfc_un.res.pkt++; cache->mfc_un.res.bytes += skb->len; + cache->mfc_un.res.lastuse = jiffies; if (cache->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) { struct mfc_cache *cache_proxy; @@ -2071,10 +2073,10 @@ drop: static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) { - int ct; - struct rtnexthop *nhp; - struct nlattr *mp_attr; struct rta_mfc_stats mfcs; + struct nlattr *mp_attr; + struct rtnexthop *nhp; + int ct; /* If cache is unresolved, don't try to parse IIF and OIF */ if (c->mfc_parent >= MAXVIFS) @@ -2106,7 +2108,10 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, mfcs.mfcs_packets = c->mfc_un.res.pkt; mfcs.mfcs_bytes = c->mfc_un.res.bytes; mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; - if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) < 0) + if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || + nla_put_u64_64bit(skb, RTA_EXPIRES, + jiffies_to_clock_t(c->mfc_un.res.lastuse), + RTA_PAD)) return -EMSGSIZE; rtm->rtm_type = RTN_MULTICAST; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index c7ca0f5d1a3b..7adce139d92a 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1500,6 +1500,7 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, c->mf6c_origin = mfc->mf6cc_origin.sin6_addr; c->mf6c_mcastgrp = mfc->mf6cc_mcastgrp.sin6_addr; c->mf6c_parent = mfc->mf6cc_parent; + c->mfc_un.res.lastuse = jiffies; ip6mr_update_thresholds(mrt, c, ttls); if (!mrtsock) c->mfc_flags |= MFC_STATIC; @@ -2092,6 +2093,7 @@ static void ip6_mr_forward(struct net *net, struct mr6_table *mrt, vif = cache->mf6c_parent; cache->mfc_un.res.pkt++; cache->mfc_un.res.bytes += skb->len; + cache->mfc_un.res.lastuse = jiffies; if (ipv6_addr_any(&cache->mf6c_origin) && true_vifi >= 0) { struct mfc6_cache *cache_proxy; @@ -2234,10 +2236,10 @@ int ip6_mr_input(struct sk_buff *skb) static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, struct mfc6_cache *c, struct rtmsg *rtm) { - int ct; - struct rtnexthop *nhp; - struct nlattr *mp_attr; struct rta_mfc_stats mfcs; + struct nlattr *mp_attr; + struct rtnexthop *nhp; + int ct; /* If cache is unresolved, don't try to parse IIF and OIF */ if (c->mf6c_parent >= MAXMIFS) @@ -2270,7 +2272,10 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, mfcs.mfcs_packets = c->mfc_un.res.pkt; mfcs.mfcs_bytes = c->mfc_un.res.bytes; mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; - if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) < 0) + if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || + nla_put_u64_64bit(skb, RTA_EXPIRES, + jiffies_to_clock_t(c->mfc_un.res.lastuse), + RTA_PAD)) return -EMSGSIZE; rtm->rtm_type = RTN_MULTICAST; -- cgit v1.2.3 From e5b13f3444dfe4f014343e83aa7948b83ef58168 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Fri, 15 Jul 2016 16:38:19 -0300 Subject: sctp: recvmsg should be able to run even if sock is in closing state Commit d46e416c11c8 missed to update some other places which checked for the socket being TCP-style AND Established state, as Closing state has some overlapping with the previous understanding of Established. Without this fix, one of the effects is that some already queued rx messages may not be readable anymore depending on how the association teared down, and sending may also not be possible if peer initiated the shutdown. Also merge two if() blocks into one condition on sctp_sendmsg(). Cc: Xin Long Fixes: d46e416c11c8 ("sctp: sctp should change socket state when shutdown is received") Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 52fdd540a9ef..d2681cb1dd30 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -202,7 +202,7 @@ struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id) * could be a TCP-style listening socket or a socket which * hasn't yet called connect() to establish an association. */ - if (!sctp_sstate(sk, ESTABLISHED)) + if (!sctp_sstate(sk, ESTABLISHED) && !sctp_sstate(sk, CLOSING)) return NULL; /* Get the first and the only association from the list. */ @@ -1068,7 +1068,7 @@ static int __sctp_connect(struct sock *sk, * is already connected. * It cannot be done even on a TCP-style listening socket. */ - if (sctp_sstate(sk, ESTABLISHED) || + if (sctp_sstate(sk, ESTABLISHED) || sctp_sstate(sk, CLOSING) || (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))) { err = -EISCONN; goto out_free; @@ -1705,18 +1705,19 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) if (msg_name) { /* Look for a matching association on the endpoint. */ asoc = sctp_endpoint_lookup_assoc(ep, &to, &transport); - if (!asoc) { - /* If we could not find a matching association on the - * endpoint, make sure that it is not a TCP-style - * socket that already has an association or there is - * no peeled-off association on another socket. - */ - if ((sctp_style(sk, TCP) && - sctp_sstate(sk, ESTABLISHED)) || - sctp_endpoint_is_peeled_off(ep, &to)) { - err = -EADDRNOTAVAIL; - goto out_unlock; - } + + /* If we could not find a matching association on the + * endpoint, make sure that it is not a TCP-style + * socket that already has an association or there is + * no peeled-off association on another socket. + */ + if (!asoc && + ((sctp_style(sk, TCP) && + (sctp_sstate(sk, ESTABLISHED) || + sctp_sstate(sk, CLOSING))) || + sctp_endpoint_is_peeled_off(ep, &to))) { + err = -EADDRNOTAVAIL; + goto out_unlock; } } else { asoc = sctp_id2assoc(sk, associd); @@ -2077,7 +2078,8 @@ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, lock_sock(sk); - if (sctp_style(sk, TCP) && !sctp_sstate(sk, ESTABLISHED)) { + if (sctp_style(sk, TCP) && !sctp_sstate(sk, ESTABLISHED) && + !sctp_sstate(sk, CLOSING)) { err = -ENOTCONN; goto out; } -- cgit v1.2.3 From c5c4e45c4b79acb23f07e43dac1348e67b4ddf91 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Fri, 15 Jul 2016 16:40:02 -0300 Subject: sctp: fix GSO for IPv6 commit 90017accff61 ("sctp: Add GSO support") didn't register SCTP GSO offloading for IPv6 and yet didn't put any restrictions on generating GSO packets while in IPv6, which causes all IPv6 GSO'ed packets to be silently dropped. The fix is to properly register the offload this time. Fixes: 90017accff61 ("sctp: Add GSO support") Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/offload.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/offload.c b/net/sctp/offload.c index a37887b373a7..7e869d0cca69 100644 --- a/net/sctp/offload.c +++ b/net/sctp/offload.c @@ -92,7 +92,28 @@ static const struct net_offload sctp_offload = { }, }; +static const struct net_offload sctp6_offload = { + .callbacks = { + .gso_segment = sctp_gso_segment, + }, +}; + int __init sctp_offload_init(void) { - return inet_add_offload(&sctp_offload, IPPROTO_SCTP); + int ret; + + ret = inet_add_offload(&sctp_offload, IPPROTO_SCTP); + if (ret) + goto out; + + ret = inet6_add_offload(&sctp6_offload, IPPROTO_SCTP); + if (ret) + goto ipv4; + + return ret; + +ipv4: + inet_del_offload(&sctp_offload, IPPROTO_SCTP); +out: + return ret; } -- cgit v1.2.3 From 23bc6ab0a0912146fd674a0becc758c3162baabc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?= Date: Thu, 14 Jul 2016 10:50:23 +0200 Subject: Bluetooth: Fix l2cap_sock_setsockopt() with optname BT_RCVMTU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we retrieve imtu value from userspace we should use 16 bit pointer cast instead of 32 as it's defined that way in headers. Fixes setsockopt calls on big-endian platforms. Signed-off-by: Amadeusz Sławiński Signed-off-by: Marcel Holtmann Cc: stable@vger.kernel.org --- net/bluetooth/l2cap_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 388ee8b59145..1842141baedb 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -927,7 +927,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (get_user(opt, (u32 __user *) optval)) { + if (get_user(opt, (u16 __user *) optval)) { err = -EFAULT; break; } -- cgit v1.2.3 From 5177a83827cd0b8cf6ce0391b00dd4417352d2f1 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sun, 17 Jul 2016 19:55:16 +0200 Subject: Bluetooth: Add debugfs fields for hardware and firmware info Some Bluetooth controllers allow for reading hardware and firmware related vendor specific infos. If they are available, then they can be exposed via debugfs now. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_core.c | 24 ++++++++++++++++++++++++ net/bluetooth/hci_debugfs.c | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 98f6c3770736..ddf8432fe8fb 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3163,6 +3163,8 @@ void hci_unregister_dev(struct hci_dev *hdev) device_del(&hdev->dev); debugfs_remove_recursive(hdev->debugfs); + kfree_const(hdev->hw_info); + kfree_const(hdev->fw_info); destroy_workqueue(hdev->workqueue); destroy_workqueue(hdev->req_workqueue); @@ -3266,6 +3268,28 @@ int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb) } EXPORT_SYMBOL(hci_recv_diag); +void hci_set_hw_info(struct hci_dev *hdev, const char *fmt, ...) +{ + va_list vargs; + + va_start(vargs, fmt); + kfree_const(hdev->hw_info); + hdev->hw_info = kvasprintf_const(GFP_KERNEL, fmt, vargs); + va_end(vargs); +} +EXPORT_SYMBOL(hci_set_hw_info); + +void hci_set_fw_info(struct hci_dev *hdev, const char *fmt, ...) +{ + va_list vargs; + + va_start(vargs, fmt); + kfree_const(hdev->fw_info); + hdev->fw_info = kvasprintf_const(GFP_KERNEL, fmt, vargs); + va_end(vargs); +} +EXPORT_SYMBOL(hci_set_fw_info); + /* ---- Interface to upper protocols ---- */ int hci_register_cb(struct hci_cb *cb) diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 7db4220941cc..63df63ebfb24 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -76,6 +76,30 @@ static const struct file_operations __name ## _fops = { \ .llseek = default_llseek, \ } \ +#define DEFINE_INFO_ATTRIBUTE(__name, __field) \ +static int __name ## _show(struct seq_file *f, void *ptr) \ +{ \ + struct hci_dev *hdev = f->private; \ + \ + hci_dev_lock(hdev); \ + seq_printf(f, "%s\n", hdev->__field ? : ""); \ + hci_dev_unlock(hdev); \ + \ + return 0; \ +} \ + \ +static int __name ## _open(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, __name ## _show, inode->i_private); \ +} \ + \ +static const struct file_operations __name ## _fops = { \ + .open = __name ## _open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +} \ + static int features_show(struct seq_file *f, void *ptr) { struct hci_dev *hdev = f->private; @@ -349,6 +373,9 @@ static const struct file_operations sc_only_mode_fops = { .llseek = default_llseek, }; +DEFINE_INFO_ATTRIBUTE(hardware_info, hw_info); +DEFINE_INFO_ATTRIBUTE(firmware_info, fw_info); + void hci_debugfs_create_common(struct hci_dev *hdev) { debugfs_create_file("features", 0444, hdev->debugfs, hdev, @@ -382,6 +409,14 @@ void hci_debugfs_create_common(struct hci_dev *hdev) if (lmp_sc_capable(hdev) || lmp_le_capable(hdev)) debugfs_create_file("sc_only_mode", 0444, hdev->debugfs, hdev, &sc_only_mode_fops); + + if (hdev->hw_info) + debugfs_create_file("hardware_info", 0444, hdev->debugfs, + hdev, &hardware_info_fops); + + if (hdev->fw_info) + debugfs_create_file("firmware_info", 0444, hdev->debugfs, + hdev, &firmware_info_fops); } static int inquiry_cache_show(struct seq_file *f, void *p) -- cgit v1.2.3 From f4dc77713f8016d2e8a3295e1c9c53a21f296def Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 14 Jul 2016 17:51:26 +0200 Subject: netfilter: x_tables: speed up jump target validation The dummy ruleset I used to test the original validation change was broken, most rules were unreachable and were not tested by mark_source_chains(). In some cases rulesets that used to load in a few seconds now require several minutes. sample ruleset that shows the behaviour: echo "*filter" for i in $(seq 0 100000);do printf ":chain_%06x - [0:0]\n" $i done for i in $(seq 0 100000);do printf -- "-A INPUT -j chain_%06x\n" $i printf -- "-A INPUT -j chain_%06x\n" $i printf -- "-A INPUT -j chain_%06x\n" $i done echo COMMIT [ pipe result into iptables-restore ] This ruleset will be about 74mbyte in size, with ~500k searches though all 500k[1] rule entries. iptables-restore will take forever (gave up after 10 minutes) Instead of always searching the entire blob for a match, fill an array with the start offsets of every single ipt_entry struct, then do a binary search to check if the jump target is present or not. After this change ruleset restore times get again close to what one gets when reverting 36472341017529e (~3 seconds on my workstation). [1] every user-defined rule gets an implicit RETURN, so we get 300k jumps + 100k userchains + 100k returns -> 500k rule entries Fixes: 36472341017529e ("netfilter: x_tables: validate targets of jumps") Reported-by: Jeff Wu Tested-by: Jeff Wu Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 47 ++++++++++++++++++++------------------ net/ipv4/netfilter/ip_tables.c | 45 ++++++++++++++++++++----------------- net/ipv6/netfilter/ip6_tables.c | 45 ++++++++++++++++++++----------------- net/netfilter/x_tables.c | 50 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 123 insertions(+), 64 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index c8dd9e26b185..b31df597fd37 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -299,23 +299,12 @@ static inline bool unconditional(const struct arpt_entry *e) memcmp(&e->arp, &uncond, sizeof(uncond)) == 0; } -static bool find_jump_target(const struct xt_table_info *t, - const struct arpt_entry *target) -{ - struct arpt_entry *iter; - - xt_entry_foreach(iter, t->entries, t->size) { - if (iter == target) - return true; - } - return false; -} - /* Figures out from what hook each rule can be called: returns 0 if * there are loops. Puts hook bitmask in comefrom. */ static int mark_source_chains(const struct xt_table_info *newinfo, - unsigned int valid_hooks, void *entry0) + unsigned int valid_hooks, void *entry0, + unsigned int *offsets) { unsigned int hook; @@ -388,10 +377,11 @@ static int mark_source_chains(const struct xt_table_info *newinfo, XT_STANDARD_TARGET) == 0 && newpos >= 0) { /* This a jump; chase it. */ + if (!xt_find_jump_offset(offsets, newpos, + newinfo->number)) + return 0; e = (struct arpt_entry *) (entry0 + newpos); - if (!find_jump_target(newinfo, e)) - return 0; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; @@ -543,6 +533,7 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, const struct arpt_replace *repl) { struct arpt_entry *iter; + unsigned int *offsets; unsigned int i; int ret = 0; @@ -555,6 +546,9 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, newinfo->underflow[i] = 0xFFFFFFFF; } + offsets = xt_alloc_entry_offsets(newinfo->number); + if (!offsets) + return -ENOMEM; i = 0; /* Walk through entries, checking offsets. */ @@ -565,17 +559,20 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, repl->underflow, repl->valid_hooks); if (ret != 0) - break; + goto out_free; + if (i < repl->num_entries) + offsets[i] = (void *)iter - entry0; ++i; if (strcmp(arpt_get_target(iter)->u.user.name, XT_ERROR_TARGET) == 0) ++newinfo->stacksize; } if (ret != 0) - return ret; + goto out_free; + ret = -EINVAL; if (i != repl->num_entries) - return -EINVAL; + goto out_free; /* Check hooks all assigned */ for (i = 0; i < NF_ARP_NUMHOOKS; i++) { @@ -583,13 +580,16 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, if (!(repl->valid_hooks & (1 << i))) continue; if (newinfo->hook_entry[i] == 0xFFFFFFFF) - return -EINVAL; + goto out_free; if (newinfo->underflow[i] == 0xFFFFFFFF) - return -EINVAL; + goto out_free; } - if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) - return -ELOOP; + if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { + ret = -ELOOP; + goto out_free; + } + kvfree(offsets); /* Finally, each sanity check must pass */ i = 0; @@ -609,6 +609,9 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, return ret; } + return ret; + out_free: + kvfree(offsets); return ret; } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index f0df66f54ce6..f993545a3373 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -373,23 +373,12 @@ ipt_do_table(struct sk_buff *skb, else return verdict; } -static bool find_jump_target(const struct xt_table_info *t, - const struct ipt_entry *target) -{ - struct ipt_entry *iter; - - xt_entry_foreach(iter, t->entries, t->size) { - if (iter == target) - return true; - } - return false; -} - /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int mark_source_chains(const struct xt_table_info *newinfo, - unsigned int valid_hooks, void *entry0) + unsigned int valid_hooks, void *entry0, + unsigned int *offsets) { unsigned int hook; @@ -458,10 +447,11 @@ mark_source_chains(const struct xt_table_info *newinfo, XT_STANDARD_TARGET) == 0 && newpos >= 0) { /* This a jump; chase it. */ + if (!xt_find_jump_offset(offsets, newpos, + newinfo->number)) + return 0; e = (struct ipt_entry *) (entry0 + newpos); - if (!find_jump_target(newinfo, e)) - return 0; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; @@ -694,6 +684,7 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, const struct ipt_replace *repl) { struct ipt_entry *iter; + unsigned int *offsets; unsigned int i; int ret = 0; @@ -706,6 +697,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, newinfo->underflow[i] = 0xFFFFFFFF; } + offsets = xt_alloc_entry_offsets(newinfo->number); + if (!offsets) + return -ENOMEM; i = 0; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter, entry0, newinfo->size) { @@ -715,15 +709,18 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, repl->underflow, repl->valid_hooks); if (ret != 0) - return ret; + goto out_free; + if (i < repl->num_entries) + offsets[i] = (void *)iter - entry0; ++i; if (strcmp(ipt_get_target(iter)->u.user.name, XT_ERROR_TARGET) == 0) ++newinfo->stacksize; } + ret = -EINVAL; if (i != repl->num_entries) - return -EINVAL; + goto out_free; /* Check hooks all assigned */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { @@ -731,13 +728,16 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, if (!(repl->valid_hooks & (1 << i))) continue; if (newinfo->hook_entry[i] == 0xFFFFFFFF) - return -EINVAL; + goto out_free; if (newinfo->underflow[i] == 0xFFFFFFFF) - return -EINVAL; + goto out_free; } - if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) - return -ELOOP; + if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { + ret = -ELOOP; + goto out_free; + } + kvfree(offsets); /* Finally, each sanity check must pass */ i = 0; @@ -757,6 +757,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, return ret; } + return ret; + out_free: + kvfree(offsets); return ret; } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 61ed95054efa..552fac2f390a 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -402,23 +402,12 @@ ip6t_do_table(struct sk_buff *skb, else return verdict; } -static bool find_jump_target(const struct xt_table_info *t, - const struct ip6t_entry *target) -{ - struct ip6t_entry *iter; - - xt_entry_foreach(iter, t->entries, t->size) { - if (iter == target) - return true; - } - return false; -} - /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int mark_source_chains(const struct xt_table_info *newinfo, - unsigned int valid_hooks, void *entry0) + unsigned int valid_hooks, void *entry0, + unsigned int *offsets) { unsigned int hook; @@ -487,10 +476,11 @@ mark_source_chains(const struct xt_table_info *newinfo, XT_STANDARD_TARGET) == 0 && newpos >= 0) { /* This a jump; chase it. */ + if (!xt_find_jump_offset(offsets, newpos, + newinfo->number)) + return 0; e = (struct ip6t_entry *) (entry0 + newpos); - if (!find_jump_target(newinfo, e)) - return 0; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; @@ -724,6 +714,7 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, const struct ip6t_replace *repl) { struct ip6t_entry *iter; + unsigned int *offsets; unsigned int i; int ret = 0; @@ -736,6 +727,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, newinfo->underflow[i] = 0xFFFFFFFF; } + offsets = xt_alloc_entry_offsets(newinfo->number); + if (!offsets) + return -ENOMEM; i = 0; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter, entry0, newinfo->size) { @@ -745,15 +739,18 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, repl->underflow, repl->valid_hooks); if (ret != 0) - return ret; + goto out_free; + if (i < repl->num_entries) + offsets[i] = (void *)iter - entry0; ++i; if (strcmp(ip6t_get_target(iter)->u.user.name, XT_ERROR_TARGET) == 0) ++newinfo->stacksize; } + ret = -EINVAL; if (i != repl->num_entries) - return -EINVAL; + goto out_free; /* Check hooks all assigned */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { @@ -761,13 +758,16 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, if (!(repl->valid_hooks & (1 << i))) continue; if (newinfo->hook_entry[i] == 0xFFFFFFFF) - return -EINVAL; + goto out_free; if (newinfo->underflow[i] == 0xFFFFFFFF) - return -EINVAL; + goto out_free; } - if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) - return -ELOOP; + if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { + ret = -ELOOP; + goto out_free; + } + kvfree(offsets); /* Finally, each sanity check must pass */ i = 0; @@ -787,6 +787,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, return ret; } + return ret; + out_free: + kvfree(offsets); return ret; } diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index fe0e2db632c7..e0aa7c1d0224 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -702,6 +702,56 @@ int xt_check_entry_offsets(const void *base, } EXPORT_SYMBOL(xt_check_entry_offsets); +/** + * xt_alloc_entry_offsets - allocate array to store rule head offsets + * + * @size: number of entries + * + * Return: NULL or kmalloc'd or vmalloc'd array + */ +unsigned int *xt_alloc_entry_offsets(unsigned int size) +{ + unsigned int *off; + + off = kcalloc(size, sizeof(unsigned int), GFP_KERNEL | __GFP_NOWARN); + + if (off) + return off; + + if (size < (SIZE_MAX / sizeof(unsigned int))) + off = vmalloc(size * sizeof(unsigned int)); + + return off; +} +EXPORT_SYMBOL(xt_alloc_entry_offsets); + +/** + * xt_find_jump_offset - check if target is a valid jump offset + * + * @offsets: array containing all valid rule start offsets of a rule blob + * @target: the jump target to search for + * @size: entries in @offset + */ +bool xt_find_jump_offset(const unsigned int *offsets, + unsigned int target, unsigned int size) +{ + int m, low = 0, hi = size; + + while (hi > low) { + m = (low + hi) / 2u; + + if (offsets[m] > target) + hi = m; + else if (offsets[m] < target) + low = m + 1; + else + return true; + } + + return false; +} +EXPORT_SYMBOL(xt_find_jump_offset); + int xt_check_target(struct xt_tgchk_param *par, unsigned int size, u_int8_t proto, bool inv_proto) { -- cgit v1.2.3 From 590025a27fe0603e855a054c4ad57d966bd8af07 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sat, 16 Jul 2016 14:27:21 +0800 Subject: netfilter: nft_ct: fix unpaired nf_connlabels_get/put call We only get nf_connlabels if the user add ct label set expr successfully, but we will also put nf_connlabels if the user delete ct lable get expr. This is mismathced, and will cause ct label expr cannot work properly. Also, if we init something fail, we should put nf_connlabels back. Otherwise, we may waste to alloc the memory that will never be used. Signed-off-by: Liping Zhang Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_ct.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 7ce8fd7ace78..d9e44ca34055 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -366,6 +366,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_ct *priv = nft_expr_priv(expr); + bool label_got = false; unsigned int len; int err; @@ -384,6 +385,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1); if (err) return err; + label_got = true; break; #endif default: @@ -393,17 +395,28 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, priv->sreg = nft_parse_register(tb[NFTA_CT_SREG]); err = nft_validate_register_load(priv->sreg, len); if (err < 0) - return err; + goto err1; err = nft_ct_l3proto_try_module_get(ctx->afi->family); if (err < 0) - return err; + goto err1; return 0; + +err1: + if (label_got) + nf_connlabels_put(ctx->net); + return err; +} + +static void nft_ct_get_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + nft_ct_l3proto_module_put(ctx->afi->family); } -static void nft_ct_destroy(const struct nft_ctx *ctx, - const struct nft_expr *expr) +static void nft_ct_set_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) { struct nft_ct *priv = nft_expr_priv(expr); @@ -475,7 +488,7 @@ static const struct nft_expr_ops nft_ct_get_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), .eval = nft_ct_get_eval, .init = nft_ct_get_init, - .destroy = nft_ct_destroy, + .destroy = nft_ct_get_destroy, .dump = nft_ct_get_dump, }; @@ -484,7 +497,7 @@ static const struct nft_expr_ops nft_ct_set_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), .eval = nft_ct_set_eval, .init = nft_ct_set_init, - .destroy = nft_ct_destroy, + .destroy = nft_ct_set_destroy, .dump = nft_ct_set_dump, }; -- cgit v1.2.3 From 359ebda25aa06fe3a1d028f7e338a849165e661b Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Mon, 18 Jul 2016 14:49:33 +0300 Subject: net/ipv4: Introduce IPSKB_FRAG_SEGS bit to inet_skb_parm.flags This flag indicates whether fragmentation of segments is allowed. Formerly this policy was hardcoded according to IPSKB_FORWARDED (set by either ip_forward or ipmr_forward). Cc: Hannes Frederic Sowa Cc: Florian Westphal Signed-off-by: Shmulik Ladkani Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/ip_forward.c | 2 +- net/ipv4/ip_output.c | 6 ++++-- net/ipv4/ipmr.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 9f0a7b96646f..8b4ffd216839 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -117,7 +117,7 @@ int ip_forward(struct sk_buff *skb) if (opt->is_strictroute && rt->rt_uses_gateway) goto sr_failed; - IPCB(skb)->flags |= IPSKB_FORWARDED; + IPCB(skb)->flags |= IPSKB_FORWARDED | IPSKB_FRAG_SEGS; mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); if (ip_exceeds_mtu(skb, mtu)) { IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e23f141c9ba5..dde37fb340bf 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -223,8 +223,10 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, struct sk_buff *segs; int ret = 0; - /* common case: locally created skb or seglen is <= mtu */ - if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || + /* common case: fragmentation of segments is not allowed, + * or seglen is <= mtu + */ + if (((IPCB(skb)->flags & IPSKB_FRAG_SEGS) == 0) || skb_gso_validate_mtu(skb, mtu)) return ip_finish_output2(net, sk, skb); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index e0d76f5f0113..eec234161b89 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1749,7 +1749,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, vif->dev->stats.tx_bytes += skb->len; } - IPCB(skb)->flags |= IPSKB_FORWARDED; + IPCB(skb)->flags |= IPSKB_FORWARDED | IPSKB_FRAG_SEGS; /* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally * not only before forwarding, but after forwarding on all output -- cgit v1.2.3 From b8247f095eddfbfdba0fcecd1e3525a6cdb4b585 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Mon, 18 Jul 2016 14:49:34 +0300 Subject: net: ip_finish_output_gso: If skb_gso_network_seglen exceeds MTU, allow segmentation for local udp tunneled skbs Given: - tap0 and vxlan0 are bridged - vxlan0 stacked on eth0, eth0 having small mtu (e.g. 1400) Assume GSO skbs arriving from tap0 having a gso_size as determined by user-provided virtio_net_hdr (e.g. 1460 corresponding to VM mtu of 1500). After encapsulation these skbs have skb_gso_network_seglen that exceed eth0's ip_skb_dst_mtu. These skbs are accidentally passed to ip_finish_output2 AS IS. Alas, each final segment (segmented either by validate_xmit_skb or by hardware UFO) would be larger than eth0 mtu. As a result, those above-mtu segments get dropped on certain networks. This behavior is not aligned with the NON-GSO case: Assume a non-gso 1500-sized IP packet arrives from tap0. After encapsulation, the vxlan datagram is fragmented normally at the ip_finish_output-->ip_fragment code path. The expected behavior for the GSO case would be segmenting the "gso-oversized" skb first, then fragmenting each segment according to dst mtu, and finally passing the resulting fragments to ip_finish_output2. 'ip_finish_output_gso' already supports this "Slowpath" behavior, according to the IPSKB_FRAG_SEGS flag, which is only set during ipv4 forwarding (not set in the bridged case). In order to support the bridged case, we'll mark skbs arriving from an ingress interface that get udp-encaspulated as "allowed to be fragmented", causing their network_seglen to be validated by 'ip_finish_output_gso' (and fragment if needed). Note the TUNNEL_DONT_FRAGMENT tun_flag is still honoured (both in the gso and non-gso cases), which serves users wishing to forbid fragmentation at the udp tunnel endpoint. Cc: Hannes Frederic Sowa Cc: Florian Westphal Signed-off-by: Shmulik Ladkani Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel_core.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index afd6b5968caf..9d847c302551 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -63,6 +63,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, int pkt_len = skb->len - skb_inner_network_offset(skb); struct net *net = dev_net(rt->dst.dev); struct net_device *dev = skb->dev; + int skb_iif = skb->skb_iif; struct iphdr *iph; int err; @@ -72,6 +73,14 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, skb_dst_set(skb, &rt->dst); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + if (skb_iif && proto == IPPROTO_UDP) { + /* Arrived from an ingress interface and got udp encapuslated. + * The encapsulated network segment length may exceed dst mtu. + * Allow IP Fragmentation of segments. + */ + IPCB(skb)->flags |= IPSKB_FRAG_SEGS; + } + /* Push down and install the IP header. */ skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); -- cgit v1.2.3 From 34a79f63bbe49c888f95e75dd759685a238556b6 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 18 Jul 2016 20:45:38 -0400 Subject: net: dsa: support switchdev ageing time attr Add a new function for DSA drivers to handle the switchdev SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME attribute. The ageing time is passed as milliseconds. Also because we can have multiple logical bridges on top of a physical switch and ageing time are switch-wide, call the driver function with the fastest ageing time in use on the chip instead of the requested one. Signed-off-by: Vivien Didelot Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/slave.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 7236eb26dc97..fc9196745225 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -333,6 +333,44 @@ static int dsa_slave_vlan_filtering(struct net_device *dev, return 0; } +static int dsa_fastest_ageing_time(struct dsa_switch *ds, + unsigned int ageing_time) +{ + int i; + + for (i = 0; i < DSA_MAX_PORTS; ++i) { + struct dsa_port *dp = &ds->ports[i]; + + if (dp && dp->ageing_time && dp->ageing_time < ageing_time) + ageing_time = dp->ageing_time; + } + + return ageing_time; +} + +static int dsa_slave_ageing_time(struct net_device *dev, + const struct switchdev_attr *attr, + struct switchdev_trans *trans) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + unsigned long ageing_jiffies = clock_t_to_jiffies(attr->u.ageing_time); + unsigned int ageing_time = jiffies_to_msecs(ageing_jiffies); + + /* bridge skips -EOPNOTSUPP, so skip the prepare phase */ + if (switchdev_trans_ph_prepare(trans)) + return 0; + + /* Keep the fastest ageing time in case of multiple bridges */ + ds->ports[p->port].ageing_time = ageing_time; + ageing_time = dsa_fastest_ageing_time(ds, ageing_time); + + if (ds->drv->set_ageing_time) + return ds->drv->set_ageing_time(ds, ageing_time); + + return 0; +} + static int dsa_slave_port_attr_set(struct net_device *dev, const struct switchdev_attr *attr, struct switchdev_trans *trans) @@ -346,6 +384,9 @@ static int dsa_slave_port_attr_set(struct net_device *dev, case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: ret = dsa_slave_vlan_filtering(dev, attr, trans); break; + case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME: + ret = dsa_slave_ageing_time(dev, attr, trans); + break; default: ret = -EOPNOTSUPP; break; -- cgit v1.2.3 From 2d283bdd079c0ad4da020bbc9e9c2a4280823098 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 19 Jul 2016 11:54:16 +1000 Subject: net/ncsi: Resource management NCSI spec (DSP0222) defines several objects: package, channel, mode, filter, version and statistics etc. This introduces the data structs to represent those objects and implement functions to manage them. Also, this introduces CONFIG_NET_NCSI for the newly implemented NCSI stack. * The user (e.g. netdev driver) dereference NCSI device by "struct ncsi_dev", which is embedded to "struct ncsi_dev_priv". The later one is used by NCSI stack internally. * Every NCSI device can have multiple packages simultaneously, up to 8 packages. It's represented by "struct ncsi_package" and identified by 3-bits ID. * Every NCSI package can have multiple channels, up to 32. It's represented by "struct ncsi_channel" and identified by 5-bits ID. * Every NCSI channel has version, statistics, various modes and filters. They are represented by "struct ncsi_channel_version", "struct ncsi_channel_stats", "struct ncsi_channel_mode" and "struct ncsi_channel_filter" separately. * Apart from AEN (Asynchronous Event Notification), the NCSI stack works in terms of command and response. This introduces "struct ncsi_req" to represent a complete NCSI transaction made of NCSI request and response. link: https://www.dmtf.org/sites/default/files/standards/documents/DSP0222_1.1.0.pdf Signed-off-by: Gavin Shan Acked-by: Joel Stanley Signed-off-by: David S. Miller --- net/Kconfig | 1 + net/Makefile | 1 + net/ncsi/Kconfig | 12 ++ net/ncsi/Makefile | 4 + net/ncsi/internal.h | 256 +++++++++++++++++++++++++++++ net/ncsi/ncsi-manage.c | 436 +++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 710 insertions(+) create mode 100644 net/ncsi/Kconfig create mode 100644 net/ncsi/Makefile create mode 100644 net/ncsi/internal.h create mode 100644 net/ncsi/ncsi-manage.c (limited to 'net') diff --git a/net/Kconfig b/net/Kconfig index ff40562a782c..c2cdbce629bd 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -237,6 +237,7 @@ source "net/hsr/Kconfig" source "net/switchdev/Kconfig" source "net/l3mdev/Kconfig" source "net/qrtr/Kconfig" +source "net/ncsi/Kconfig" config RPS bool diff --git a/net/Makefile b/net/Makefile index bdd14553a774..9bd20bb86cc6 100644 --- a/net/Makefile +++ b/net/Makefile @@ -79,3 +79,4 @@ ifneq ($(CONFIG_NET_L3_MASTER_DEV),) obj-y += l3mdev/ endif obj-$(CONFIG_QRTR) += qrtr/ +obj-$(CONFIG_NET_NCSI) += ncsi/ diff --git a/net/ncsi/Kconfig b/net/ncsi/Kconfig new file mode 100644 index 000000000000..08a8a6031fd7 --- /dev/null +++ b/net/ncsi/Kconfig @@ -0,0 +1,12 @@ +# +# Configuration for NCSI support +# + +config NET_NCSI + bool "NCSI interface support" + depends on INET + ---help--- + This module provides NCSI (Network Controller Sideband Interface) + support. Enable this only if your system connects to a network + device via NCSI and the ethernet driver you're using supports + the protocol explicitly. diff --git a/net/ncsi/Makefile b/net/ncsi/Makefile new file mode 100644 index 000000000000..07b5625155d7 --- /dev/null +++ b/net/ncsi/Makefile @@ -0,0 +1,4 @@ +# +# Makefile for NCSI API +# +obj-$(CONFIG_NET_NCSI) += ncsi-manage.o diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h new file mode 100644 index 000000000000..89028e1f83cd --- /dev/null +++ b/net/ncsi/internal.h @@ -0,0 +1,256 @@ +/* + * Copyright Gavin Shan, IBM Corporation 2016. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef __NCSI_INTERNAL_H__ +#define __NCSI_INTERNAL_H__ + +enum { + NCSI_CAP_BASE = 0, + NCSI_CAP_GENERIC = 0, + NCSI_CAP_BC, + NCSI_CAP_MC, + NCSI_CAP_BUFFER, + NCSI_CAP_AEN, + NCSI_CAP_VLAN, + NCSI_CAP_MAX +}; + +enum { + NCSI_CAP_GENERIC_HWA = 0x01, /* HW arbitration */ + NCSI_CAP_GENERIC_HDS = 0x02, /* HNC driver status change */ + NCSI_CAP_GENERIC_FC = 0x04, /* HNC to MC flow control */ + NCSI_CAP_GENERIC_FC1 = 0x08, /* MC to HNC flow control */ + NCSI_CAP_GENERIC_MC = 0x10, /* Global MC filtering */ + NCSI_CAP_GENERIC_HWA_UNKNOWN = 0x00, /* Unknown HW arbitration */ + NCSI_CAP_GENERIC_HWA_SUPPORT = 0x20, /* Supported HW arbitration */ + NCSI_CAP_GENERIC_HWA_NOT_SUPPORT = 0x40, /* No HW arbitration */ + NCSI_CAP_GENERIC_HWA_RESERVED = 0x60, /* Reserved HW arbitration */ + NCSI_CAP_GENERIC_HWA_MASK = 0x60, /* Mask for HW arbitration */ + NCSI_CAP_GENERIC_MASK = 0x7f, + NCSI_CAP_BC_ARP = 0x01, /* ARP packet filtering */ + NCSI_CAP_BC_DHCPC = 0x02, /* DHCP client filtering */ + NCSI_CAP_BC_DHCPS = 0x04, /* DHCP server filtering */ + NCSI_CAP_BC_NETBIOS = 0x08, /* NetBIOS packet filtering */ + NCSI_CAP_BC_MASK = 0x0f, + NCSI_CAP_MC_IPV6_NEIGHBOR = 0x01, /* IPv6 neighbor filtering */ + NCSI_CAP_MC_IPV6_ROUTER = 0x02, /* IPv6 router filering */ + NCSI_CAP_MC_DHCPV6_RELAY = 0x04, /* DHCPv6 relay / server MC */ + NCSI_CAP_MC_DHCPV6_WELL_KNOWN = 0x08, /* DHCPv6 well-known MC */ + NCSI_CAP_MC_IPV6_MLD = 0x10, /* IPv6 MLD filtering */ + NCSI_CAP_MC_IPV6_NEIGHBOR_S = 0x20, /* IPv6 neighbour filtering */ + NCSI_CAP_MC_MASK = 0x3f, + NCSI_CAP_AEN_LSC = 0x01, /* Link status change */ + NCSI_CAP_AEN_CR = 0x02, /* Configuration required */ + NCSI_CAP_AEN_HDS = 0x04, /* HNC driver status */ + NCSI_CAP_AEN_MASK = 0x07, + NCSI_CAP_VLAN_ONLY = 0x01, /* Filter VLAN packet only */ + NCSI_CAP_VLAN_NO = 0x02, /* Filter VLAN and non-VLAN */ + NCSI_CAP_VLAN_ANY = 0x04, /* Filter Any-and-non-VLAN */ + NCSI_CAP_VLAN_MASK = 0x07 +}; + +enum { + NCSI_MODE_BASE = 0, + NCSI_MODE_ENABLE = 0, + NCSI_MODE_TX_ENABLE, + NCSI_MODE_LINK, + NCSI_MODE_VLAN, + NCSI_MODE_BC, + NCSI_MODE_MC, + NCSI_MODE_AEN, + NCSI_MODE_FC, + NCSI_MODE_MAX +}; + +enum { + NCSI_FILTER_BASE = 0, + NCSI_FILTER_VLAN = 0, + NCSI_FILTER_UC, + NCSI_FILTER_MC, + NCSI_FILTER_MIXED, + NCSI_FILTER_MAX +}; + +struct ncsi_channel_version { + u32 version; /* Supported BCD encoded NCSI version */ + u32 alpha2; /* Supported BCD encoded NCSI version */ + u8 fw_name[12]; /* Firware name string */ + u32 fw_version; /* Firmware version */ + u16 pci_ids[4]; /* PCI identification */ + u32 mf_id; /* Manufacture ID */ +}; + +struct ncsi_channel_cap { + u32 index; /* Index of channel capabilities */ + u32 cap; /* NCSI channel capability */ +}; + +struct ncsi_channel_mode { + u32 index; /* Index of channel modes */ + u32 enable; /* Enabled or disabled */ + u32 size; /* Valid entries in ncm_data[] */ + u32 data[8]; /* Data entries */ +}; + +struct ncsi_channel_filter { + u32 index; /* Index of channel filters */ + u32 total; /* Total entries in the filter table */ + u64 bitmap; /* Bitmap of valid entries */ + u32 data[]; /* Data for the valid entries */ +}; + +struct ncsi_channel_stats { + u32 hnc_cnt_hi; /* Counter cleared */ + u32 hnc_cnt_lo; /* Counter cleared */ + u32 hnc_rx_bytes; /* Rx bytes */ + u32 hnc_tx_bytes; /* Tx bytes */ + u32 hnc_rx_uc_pkts; /* Rx UC packets */ + u32 hnc_rx_mc_pkts; /* Rx MC packets */ + u32 hnc_rx_bc_pkts; /* Rx BC packets */ + u32 hnc_tx_uc_pkts; /* Tx UC packets */ + u32 hnc_tx_mc_pkts; /* Tx MC packets */ + u32 hnc_tx_bc_pkts; /* Tx BC packets */ + u32 hnc_fcs_err; /* FCS errors */ + u32 hnc_align_err; /* Alignment errors */ + u32 hnc_false_carrier; /* False carrier detection */ + u32 hnc_runt_pkts; /* Rx runt packets */ + u32 hnc_jabber_pkts; /* Rx jabber packets */ + u32 hnc_rx_pause_xon; /* Rx pause XON frames */ + u32 hnc_rx_pause_xoff; /* Rx XOFF frames */ + u32 hnc_tx_pause_xon; /* Tx XON frames */ + u32 hnc_tx_pause_xoff; /* Tx XOFF frames */ + u32 hnc_tx_s_collision; /* Single collision frames */ + u32 hnc_tx_m_collision; /* Multiple collision frames */ + u32 hnc_l_collision; /* Late collision frames */ + u32 hnc_e_collision; /* Excessive collision frames */ + u32 hnc_rx_ctl_frames; /* Rx control frames */ + u32 hnc_rx_64_frames; /* Rx 64-bytes frames */ + u32 hnc_rx_127_frames; /* Rx 65-127 bytes frames */ + u32 hnc_rx_255_frames; /* Rx 128-255 bytes frames */ + u32 hnc_rx_511_frames; /* Rx 256-511 bytes frames */ + u32 hnc_rx_1023_frames; /* Rx 512-1023 bytes frames */ + u32 hnc_rx_1522_frames; /* Rx 1024-1522 bytes frames */ + u32 hnc_rx_9022_frames; /* Rx 1523-9022 bytes frames */ + u32 hnc_tx_64_frames; /* Tx 64-bytes frames */ + u32 hnc_tx_127_frames; /* Tx 65-127 bytes frames */ + u32 hnc_tx_255_frames; /* Tx 128-255 bytes frames */ + u32 hnc_tx_511_frames; /* Tx 256-511 bytes frames */ + u32 hnc_tx_1023_frames; /* Tx 512-1023 bytes frames */ + u32 hnc_tx_1522_frames; /* Tx 1024-1522 bytes frames */ + u32 hnc_tx_9022_frames; /* Tx 1523-9022 bytes frames */ + u32 hnc_rx_valid_bytes; /* Rx valid bytes */ + u32 hnc_rx_runt_pkts; /* Rx error runt packets */ + u32 hnc_rx_jabber_pkts; /* Rx error jabber packets */ + u32 ncsi_rx_cmds; /* Rx NCSI commands */ + u32 ncsi_dropped_cmds; /* Dropped commands */ + u32 ncsi_cmd_type_errs; /* Command type errors */ + u32 ncsi_cmd_csum_errs; /* Command checksum errors */ + u32 ncsi_rx_pkts; /* Rx NCSI packets */ + u32 ncsi_tx_pkts; /* Tx NCSI packets */ + u32 ncsi_tx_aen_pkts; /* Tx AEN packets */ + u32 pt_tx_pkts; /* Tx packets */ + u32 pt_tx_dropped; /* Tx dropped packets */ + u32 pt_tx_channel_err; /* Tx channel errors */ + u32 pt_tx_us_err; /* Tx undersize errors */ + u32 pt_rx_pkts; /* Rx packets */ + u32 pt_rx_dropped; /* Rx dropped packets */ + u32 pt_rx_channel_err; /* Rx channel errors */ + u32 pt_rx_us_err; /* Rx undersize errors */ + u32 pt_rx_os_err; /* Rx oversize errors */ +}; + +struct ncsi_dev_priv; +struct ncsi_package; + +#define NCSI_PACKAGE_SHIFT 5 +#define NCSI_PACKAGE_INDEX(c) (((c) >> NCSI_PACKAGE_SHIFT) & 0x7) +#define NCSI_CHANNEL_INDEX(c) ((c) & ((1 << NCSI_PACKAGE_SHIFT) - 1)) +#define NCSI_TO_CHANNEL(p, c) (((p) << NCSI_PACKAGE_SHIFT) | (c)) + +struct ncsi_channel { + unsigned char id; + int state; +#define NCSI_CHANNEL_INACTIVE 1 +#define NCSI_CHANNEL_ACTIVE 2 + spinlock_t lock; /* Protect filters etc */ + struct ncsi_package *package; + struct ncsi_channel_version version; + struct ncsi_channel_cap caps[NCSI_CAP_MAX]; + struct ncsi_channel_mode modes[NCSI_MODE_MAX]; + struct ncsi_channel_filter *filters[NCSI_FILTER_MAX]; + struct ncsi_channel_stats stats; + struct list_head node; +}; + +struct ncsi_package { + unsigned char id; /* NCSI 3-bits package ID */ + unsigned char uuid[16]; /* UUID */ + struct ncsi_dev_priv *ndp; /* NCSI device */ + spinlock_t lock; /* Protect the package */ + unsigned int channel_num; /* Number of channels */ + struct list_head channels; /* List of chanels */ + struct list_head node; /* Form list of packages */ +}; + +struct ncsi_request { + unsigned char id; /* Request ID - 0 to 255 */ + bool used; /* Request that has been assigned */ + bool driven; /* Drive state machine */ + struct ncsi_dev_priv *ndp; /* Associated NCSI device */ + struct sk_buff *cmd; /* Associated NCSI command packet */ + struct sk_buff *rsp; /* Associated NCSI response packet */ + struct timer_list timer; /* Timer on waiting for response */ + bool enabled; /* Time has been enabled or not */ +}; + +struct ncsi_dev_priv { + struct ncsi_dev ndev; /* Associated NCSI device */ + unsigned int flags; /* NCSI device flags */ + spinlock_t lock; /* Protect the NCSI device */ + unsigned int package_num; /* Number of packages */ + struct list_head packages; /* List of packages */ + struct ncsi_request requests[256]; /* Request table */ + unsigned int request_id; /* Last used request ID */ + struct list_head node; /* Form NCSI device list */ +}; + +extern struct list_head ncsi_dev_list; +extern spinlock_t ncsi_dev_lock; + +#define TO_NCSI_DEV_PRIV(nd) \ + container_of(nd, struct ncsi_dev_priv, ndev) +#define NCSI_FOR_EACH_DEV(ndp) \ + list_for_each_entry_rcu(ndp, &ncsi_dev_list, node) +#define NCSI_FOR_EACH_PACKAGE(ndp, np) \ + list_for_each_entry_rcu(np, &ndp->packages, node) +#define NCSI_FOR_EACH_CHANNEL(np, nc) \ + list_for_each_entry_rcu(nc, &np->channels, node) + +/* Resources */ +int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data); +int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data); +int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index); +struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np, + unsigned char id); +struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, + unsigned char id); +struct ncsi_package *ncsi_find_package(struct ncsi_dev_priv *ndp, + unsigned char id); +struct ncsi_package *ncsi_add_package(struct ncsi_dev_priv *ndp, + unsigned char id); +void ncsi_remove_package(struct ncsi_package *np); +void ncsi_find_package_and_channel(struct ncsi_dev_priv *ndp, + unsigned char id, + struct ncsi_package **np, + struct ncsi_channel **nc); +struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven); +void ncsi_free_request(struct ncsi_request *nr); +struct ncsi_dev *ncsi_find_dev(struct net_device *dev); + +#endif /* __NCSI_INTERNAL_H__ */ diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c new file mode 100644 index 000000000000..0e28ed8f2703 --- /dev/null +++ b/net/ncsi/ncsi-manage.c @@ -0,0 +1,436 @@ +/* + * Copyright Gavin Shan, IBM Corporation 2016. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "internal.h" + +LIST_HEAD(ncsi_dev_list); +DEFINE_SPINLOCK(ncsi_dev_lock); + +static inline int ncsi_filter_size(int table) +{ + int sizes[] = { 2, 6, 6, 6 }; + + BUILD_BUG_ON(ARRAY_SIZE(sizes) != NCSI_FILTER_MAX); + if (table < NCSI_FILTER_BASE || table >= NCSI_FILTER_MAX) + return -EINVAL; + + return sizes[table]; +} + +int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data) +{ + struct ncsi_channel_filter *ncf; + void *bitmap; + int index, size; + unsigned long flags; + + ncf = nc->filters[table]; + if (!ncf) + return -ENXIO; + + size = ncsi_filter_size(table); + if (size < 0) + return size; + + spin_lock_irqsave(&nc->lock, flags); + bitmap = (void *)&ncf->bitmap; + index = -1; + while ((index = find_next_bit(bitmap, ncf->total, index + 1)) + < ncf->total) { + if (!memcmp(ncf->data + size * index, data, size)) { + spin_unlock_irqrestore(&nc->lock, flags); + return index; + } + } + spin_unlock_irqrestore(&nc->lock, flags); + + return -ENOENT; +} + +int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data) +{ + struct ncsi_channel_filter *ncf; + int index, size; + void *bitmap; + unsigned long flags; + + size = ncsi_filter_size(table); + if (size < 0) + return size; + + index = ncsi_find_filter(nc, table, data); + if (index >= 0) + return index; + + ncf = nc->filters[table]; + if (!ncf) + return -ENODEV; + + spin_lock_irqsave(&nc->lock, flags); + bitmap = (void *)&ncf->bitmap; + do { + index = find_next_zero_bit(bitmap, ncf->total, 0); + if (index >= ncf->total) { + spin_unlock_irqrestore(&nc->lock, flags); + return -ENOSPC; + } + } while (test_and_set_bit(index, bitmap)); + + memcpy(ncf->data + size * index, data, size); + spin_unlock_irqrestore(&nc->lock, flags); + + return index; +} + +int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index) +{ + struct ncsi_channel_filter *ncf; + int size; + void *bitmap; + unsigned long flags; + + size = ncsi_filter_size(table); + if (size < 0) + return size; + + ncf = nc->filters[table]; + if (!ncf || index >= ncf->total) + return -ENODEV; + + spin_lock_irqsave(&nc->lock, flags); + bitmap = (void *)&ncf->bitmap; + if (test_and_clear_bit(index, bitmap)) + memset(ncf->data + size * index, 0, size); + spin_unlock_irqrestore(&nc->lock, flags); + + return 0; +} + +struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np, + unsigned char id) +{ + struct ncsi_channel *nc; + + NCSI_FOR_EACH_CHANNEL(np, nc) { + if (nc->id == id) + return nc; + } + + return NULL; +} + +struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, unsigned char id) +{ + struct ncsi_channel *nc, *tmp; + int index; + unsigned long flags; + + nc = kzalloc(sizeof(*nc), GFP_ATOMIC); + if (!nc) + return NULL; + + nc->id = id; + nc->package = np; + nc->state = NCSI_CHANNEL_INACTIVE; + spin_lock_init(&nc->lock); + for (index = 0; index < NCSI_CAP_MAX; index++) + nc->caps[index].index = index; + for (index = 0; index < NCSI_MODE_MAX; index++) + nc->modes[index].index = index; + + spin_lock_irqsave(&np->lock, flags); + tmp = ncsi_find_channel(np, id); + if (tmp) { + spin_unlock_irqrestore(&np->lock, flags); + kfree(nc); + return tmp; + } + + list_add_tail_rcu(&nc->node, &np->channels); + np->channel_num++; + spin_unlock_irqrestore(&np->lock, flags); + + return nc; +} + +static void ncsi_remove_channel(struct ncsi_channel *nc) +{ + struct ncsi_package *np = nc->package; + struct ncsi_channel_filter *ncf; + unsigned long flags; + int i; + + /* Release filters */ + spin_lock_irqsave(&nc->lock, flags); + for (i = 0; i < NCSI_FILTER_MAX; i++) { + ncf = nc->filters[i]; + if (!ncf) + continue; + + nc->filters[i] = NULL; + kfree(ncf); + } + + nc->state = NCSI_CHANNEL_INACTIVE; + spin_unlock_irqrestore(&nc->lock, flags); + + /* Remove and free channel */ + spin_lock_irqsave(&np->lock, flags); + list_del_rcu(&nc->node); + np->channel_num--; + spin_unlock_irqrestore(&np->lock, flags); + + kfree(nc); +} + +struct ncsi_package *ncsi_find_package(struct ncsi_dev_priv *ndp, + unsigned char id) +{ + struct ncsi_package *np; + + NCSI_FOR_EACH_PACKAGE(ndp, np) { + if (np->id == id) + return np; + } + + return NULL; +} + +struct ncsi_package *ncsi_add_package(struct ncsi_dev_priv *ndp, + unsigned char id) +{ + struct ncsi_package *np, *tmp; + unsigned long flags; + + np = kzalloc(sizeof(*np), GFP_ATOMIC); + if (!np) + return NULL; + + np->id = id; + np->ndp = ndp; + spin_lock_init(&np->lock); + INIT_LIST_HEAD(&np->channels); + + spin_lock_irqsave(&ndp->lock, flags); + tmp = ncsi_find_package(ndp, id); + if (tmp) { + spin_unlock_irqrestore(&ndp->lock, flags); + kfree(np); + return tmp; + } + + list_add_tail_rcu(&np->node, &ndp->packages); + ndp->package_num++; + spin_unlock_irqrestore(&ndp->lock, flags); + + return np; +} + +void ncsi_remove_package(struct ncsi_package *np) +{ + struct ncsi_dev_priv *ndp = np->ndp; + struct ncsi_channel *nc, *tmp; + unsigned long flags; + + /* Release all child channels */ + list_for_each_entry_safe(nc, tmp, &np->channels, node) + ncsi_remove_channel(nc); + + /* Remove and free package */ + spin_lock_irqsave(&ndp->lock, flags); + list_del_rcu(&np->node); + ndp->package_num--; + spin_unlock_irqrestore(&ndp->lock, flags); + + kfree(np); +} + +void ncsi_find_package_and_channel(struct ncsi_dev_priv *ndp, + unsigned char id, + struct ncsi_package **np, + struct ncsi_channel **nc) +{ + struct ncsi_package *p; + struct ncsi_channel *c; + + p = ncsi_find_package(ndp, NCSI_PACKAGE_INDEX(id)); + c = p ? ncsi_find_channel(p, NCSI_CHANNEL_INDEX(id)) : NULL; + + if (np) + *np = p; + if (nc) + *nc = c; +} + +/* For two consecutive NCSI commands, the packet IDs shouldn't + * be same. Otherwise, the bogus response might be replied. So + * the available IDs are allocated in round-robin fashion. + */ +struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven) +{ + struct ncsi_request *nr = NULL; + int i, limit = ARRAY_SIZE(ndp->requests); + unsigned long flags; + + /* Check if there is one available request until the ceiling */ + spin_lock_irqsave(&ndp->lock, flags); + for (i = ndp->request_id; !nr && i < limit; i++) { + if (ndp->requests[i].used) + continue; + + nr = &ndp->requests[i]; + nr->used = true; + nr->driven = driven; + if (++ndp->request_id >= limit) + ndp->request_id = 0; + } + + /* Fail back to check from the starting cursor */ + for (i = 0; !nr && i < ndp->request_id; i++) { + if (ndp->requests[i].used) + continue; + + nr = &ndp->requests[i]; + nr->used = true; + nr->driven = driven; + if (++ndp->request_id >= limit) + ndp->request_id = 0; + } + spin_unlock_irqrestore(&ndp->lock, flags); + + return nr; +} + +void ncsi_free_request(struct ncsi_request *nr) +{ + struct ncsi_dev_priv *ndp = nr->ndp; + struct sk_buff *cmd, *rsp; + unsigned long flags; + + if (nr->enabled) { + nr->enabled = false; + del_timer_sync(&nr->timer); + } + + spin_lock_irqsave(&ndp->lock, flags); + cmd = nr->cmd; + rsp = nr->rsp; + nr->cmd = NULL; + nr->rsp = NULL; + nr->used = false; + spin_unlock_irqrestore(&ndp->lock, flags); + + /* Release command and response */ + consume_skb(cmd); + consume_skb(rsp); +} + +struct ncsi_dev *ncsi_find_dev(struct net_device *dev) +{ + struct ncsi_dev_priv *ndp; + + NCSI_FOR_EACH_DEV(ndp) { + if (ndp->ndev.dev == dev) + return &ndp->ndev; + } + + return NULL; +} + +static void ncsi_request_timeout(unsigned long data) +{ + struct ncsi_request *nr = (struct ncsi_request *)data; + struct ncsi_dev_priv *ndp = nr->ndp; + unsigned long flags; + + /* If the request already had associated response, + * let the response handler to release it. + */ + spin_lock_irqsave(&ndp->lock, flags); + nr->enabled = false; + if (nr->rsp || !nr->cmd) { + spin_unlock_irqrestore(&ndp->lock, flags); + return; + } + spin_unlock_irqrestore(&ndp->lock, flags); + + /* Release the request */ + ncsi_free_request(nr); +} + +struct ncsi_dev *ncsi_register_dev(struct net_device *dev, + void (*handler)(struct ncsi_dev *ndev)) +{ + struct ncsi_dev_priv *ndp; + struct ncsi_dev *nd; + unsigned long flags; + int i; + + /* Check if the device has been registered or not */ + nd = ncsi_find_dev(dev); + if (nd) + return nd; + + /* Create NCSI device */ + ndp = kzalloc(sizeof(*ndp), GFP_ATOMIC); + if (!ndp) + return NULL; + + nd = &ndp->ndev; + nd->state = ncsi_dev_state_registered; + nd->dev = dev; + nd->handler = handler; + + /* Initialize private NCSI device */ + spin_lock_init(&ndp->lock); + INIT_LIST_HEAD(&ndp->packages); + ndp->request_id = 0; + for (i = 0; i < ARRAY_SIZE(ndp->requests); i++) { + ndp->requests[i].id = i; + ndp->requests[i].ndp = ndp; + setup_timer(&ndp->requests[i].timer, + ncsi_request_timeout, + (unsigned long)&ndp->requests[i]); + } + + spin_lock_irqsave(&ncsi_dev_lock, flags); + list_add_tail_rcu(&ndp->node, &ncsi_dev_list); + spin_unlock_irqrestore(&ncsi_dev_lock, flags); + + return nd; +} +EXPORT_SYMBOL_GPL(ncsi_register_dev); + +void ncsi_unregister_dev(struct ncsi_dev *nd) +{ + struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); + struct ncsi_package *np, *tmp; + unsigned long flags; + + list_for_each_entry_safe(np, tmp, &ndp->packages, node) + ncsi_remove_package(np); + + spin_lock_irqsave(&ncsi_dev_lock, flags); + list_del_rcu(&ndp->node); + spin_unlock_irqrestore(&ncsi_dev_lock, flags); + + kfree(ndp); +} +EXPORT_SYMBOL_GPL(ncsi_unregister_dev); -- cgit v1.2.3 From 6389eaa7fa9c3ee6c7d39f6087b86660d17236ac Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 19 Jul 2016 11:54:17 +1000 Subject: net/ncsi: NCSI command packet handler The NCSI command packets are sent from MC (Management Controller) to remote end. They are used for multiple purposes: probe existing NCSI package/channel, retrieve NCSI channel's capability, configure NCSI channel etc. This defines struct to represent NCSI command packets and introduces function ncsi_xmit_cmd(), which will be used to transmit NCSI command packet according to the request. The request is represented by struct ncsi_cmd_arg. Signed-off-by: Gavin Shan Acked-by: Joel Stanley Signed-off-by: David S. Miller --- net/ncsi/Makefile | 2 +- net/ncsi/internal.h | 19 +++ net/ncsi/ncsi-cmd.c | 367 ++++++++++++++++++++++++++++++++++++++++++++++++++++ net/ncsi/ncsi-pkt.h | 171 ++++++++++++++++++++++++ 4 files changed, 558 insertions(+), 1 deletion(-) create mode 100644 net/ncsi/ncsi-cmd.c create mode 100644 net/ncsi/ncsi-pkt.h (limited to 'net') diff --git a/net/ncsi/Makefile b/net/ncsi/Makefile index 07b5625155d7..abc404631ca8 100644 --- a/net/ncsi/Makefile +++ b/net/ncsi/Makefile @@ -1,4 +1,4 @@ # # Makefile for NCSI API # -obj-$(CONFIG_NET_NCSI) += ncsi-manage.o +obj-$(CONFIG_NET_NCSI) += ncsi-cmd.o ncsi-manage.o diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 89028e1f83cd..3d81697a97d0 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -220,6 +220,21 @@ struct ncsi_dev_priv { struct list_head node; /* Form NCSI device list */ }; +struct ncsi_cmd_arg { + struct ncsi_dev_priv *ndp; /* Associated NCSI device */ + unsigned char type; /* Command in the NCSI packet */ + unsigned char id; /* Request ID (sequence number) */ + unsigned char package; /* Destination package ID */ + unsigned char channel; /* Detination channel ID or 0x1f */ + unsigned short payload; /* Command packet payload length */ + bool driven; /* Drive the state machine? */ + union { + unsigned char bytes[16]; /* Command packet specific data */ + unsigned short words[8]; + unsigned int dwords[4]; + }; +}; + extern struct list_head ncsi_dev_list; extern spinlock_t ncsi_dev_lock; @@ -253,4 +268,8 @@ struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven); void ncsi_free_request(struct ncsi_request *nr); struct ncsi_dev *ncsi_find_dev(struct net_device *dev); +/* Packet handlers */ +u32 ncsi_calculate_checksum(unsigned char *data, int len); +int ncsi_xmit_cmd(struct ncsi_cmd_arg *nca); + #endif /* __NCSI_INTERNAL_H__ */ diff --git a/net/ncsi/ncsi-cmd.c b/net/ncsi/ncsi-cmd.c new file mode 100644 index 000000000000..21057a8ceeac --- /dev/null +++ b/net/ncsi/ncsi-cmd.c @@ -0,0 +1,367 @@ +/* + * Copyright Gavin Shan, IBM Corporation 2016. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "internal.h" +#include "ncsi-pkt.h" + +u32 ncsi_calculate_checksum(unsigned char *data, int len) +{ + u32 checksum = 0; + int i; + + for (i = 0; i < len; i += 2) + checksum += (((u32)data[i] << 8) | data[i + 1]); + + checksum = (~checksum + 1); + return checksum; +} + +/* This function should be called after the data area has been + * populated completely. + */ +static void ncsi_cmd_build_header(struct ncsi_pkt_hdr *h, + struct ncsi_cmd_arg *nca) +{ + u32 checksum; + __be32 *pchecksum; + + h->mc_id = 0; + h->revision = NCSI_PKT_REVISION; + h->reserved = 0; + h->id = nca->id; + h->type = nca->type; + h->channel = NCSI_TO_CHANNEL(nca->package, + nca->channel); + h->length = htons(nca->payload); + h->reserved1[0] = 0; + h->reserved1[1] = 0; + + /* Fill with calculated checksum */ + checksum = ncsi_calculate_checksum((unsigned char *)h, + sizeof(*h) + nca->payload); + pchecksum = (__be32 *)((void *)h + sizeof(struct ncsi_pkt_hdr) + + nca->payload); + *pchecksum = htonl(checksum); +} + +static int ncsi_cmd_handler_default(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_pkt *cmd; + + cmd = (struct ncsi_cmd_pkt *)skb_put(skb, sizeof(*cmd)); + memset(cmd, 0, sizeof(*cmd)); + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + +static int ncsi_cmd_handler_sp(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_sp_pkt *cmd; + + cmd = (struct ncsi_cmd_sp_pkt *)skb_put(skb, sizeof(*cmd)); + memset(cmd, 0, sizeof(*cmd)); + cmd->hw_arbitration = nca->bytes[0]; + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + +static int ncsi_cmd_handler_dc(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_dc_pkt *cmd; + + cmd = (struct ncsi_cmd_dc_pkt *)skb_put(skb, sizeof(*cmd)); + memset(cmd, 0, sizeof(*cmd)); + cmd->ald = nca->bytes[0]; + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + +static int ncsi_cmd_handler_rc(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_rc_pkt *cmd; + + cmd = (struct ncsi_cmd_rc_pkt *)skb_put(skb, sizeof(*cmd)); + memset(cmd, 0, sizeof(*cmd)); + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + +static int ncsi_cmd_handler_ae(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_ae_pkt *cmd; + + cmd = (struct ncsi_cmd_ae_pkt *)skb_put(skb, sizeof(*cmd)); + memset(cmd, 0, sizeof(*cmd)); + cmd->mc_id = nca->bytes[0]; + cmd->mode = htonl(nca->dwords[1]); + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + +static int ncsi_cmd_handler_sl(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_sl_pkt *cmd; + + cmd = (struct ncsi_cmd_sl_pkt *)skb_put(skb, sizeof(*cmd)); + memset(cmd, 0, sizeof(*cmd)); + cmd->mode = htonl(nca->dwords[0]); + cmd->oem_mode = htonl(nca->dwords[1]); + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + +static int ncsi_cmd_handler_svf(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_svf_pkt *cmd; + + cmd = (struct ncsi_cmd_svf_pkt *)skb_put(skb, sizeof(*cmd)); + memset(cmd, 0, sizeof(*cmd)); + cmd->vlan = htons(nca->words[0]); + cmd->index = nca->bytes[2]; + cmd->enable = nca->bytes[3]; + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + +static int ncsi_cmd_handler_ev(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_ev_pkt *cmd; + + cmd = (struct ncsi_cmd_ev_pkt *)skb_put(skb, sizeof(*cmd)); + memset(cmd, 0, sizeof(*cmd)); + cmd->mode = nca->bytes[0]; + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + +static int ncsi_cmd_handler_sma(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_sma_pkt *cmd; + int i; + + cmd = (struct ncsi_cmd_sma_pkt *)skb_put(skb, sizeof(*cmd)); + memset(cmd, 0, sizeof(*cmd)); + for (i = 0; i < 6; i++) + cmd->mac[i] = nca->bytes[i]; + cmd->index = nca->bytes[6]; + cmd->at_e = nca->bytes[7]; + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + +static int ncsi_cmd_handler_ebf(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_ebf_pkt *cmd; + + cmd = (struct ncsi_cmd_ebf_pkt *)skb_put(skb, sizeof(*cmd)); + memset(cmd, 0, sizeof(*cmd)); + cmd->mode = htonl(nca->dwords[0]); + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + +static int ncsi_cmd_handler_egmf(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_egmf_pkt *cmd; + + cmd = (struct ncsi_cmd_egmf_pkt *)skb_put(skb, sizeof(*cmd)); + memset(cmd, 0, sizeof(*cmd)); + cmd->mode = htonl(nca->dwords[0]); + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + +static int ncsi_cmd_handler_snfc(struct sk_buff *skb, + struct ncsi_cmd_arg *nca) +{ + struct ncsi_cmd_snfc_pkt *cmd; + + cmd = (struct ncsi_cmd_snfc_pkt *)skb_put(skb, sizeof(*cmd)); + memset(cmd, 0, sizeof(*cmd)); + cmd->mode = nca->bytes[0]; + ncsi_cmd_build_header(&cmd->cmd.common, nca); + + return 0; +} + +static struct ncsi_cmd_handler { + unsigned char type; + int payload; + int (*handler)(struct sk_buff *skb, + struct ncsi_cmd_arg *nca); +} ncsi_cmd_handlers[] = { + { NCSI_PKT_CMD_CIS, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_SP, 4, ncsi_cmd_handler_sp }, + { NCSI_PKT_CMD_DP, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_EC, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_DC, 4, ncsi_cmd_handler_dc }, + { NCSI_PKT_CMD_RC, 4, ncsi_cmd_handler_rc }, + { NCSI_PKT_CMD_ECNT, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_DCNT, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_AE, 8, ncsi_cmd_handler_ae }, + { NCSI_PKT_CMD_SL, 8, ncsi_cmd_handler_sl }, + { NCSI_PKT_CMD_GLS, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_SVF, 4, ncsi_cmd_handler_svf }, + { NCSI_PKT_CMD_EV, 4, ncsi_cmd_handler_ev }, + { NCSI_PKT_CMD_DV, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_SMA, 8, ncsi_cmd_handler_sma }, + { NCSI_PKT_CMD_EBF, 4, ncsi_cmd_handler_ebf }, + { NCSI_PKT_CMD_DBF, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_EGMF, 4, ncsi_cmd_handler_egmf }, + { NCSI_PKT_CMD_DGMF, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_SNFC, 4, ncsi_cmd_handler_snfc }, + { NCSI_PKT_CMD_GVI, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_GC, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_GP, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_GCPS, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_GNS, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_GNPTS, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_GPS, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_OEM, 0, NULL }, + { NCSI_PKT_CMD_PLDM, 0, NULL }, + { NCSI_PKT_CMD_GPUUID, 0, ncsi_cmd_handler_default } +}; + +static struct ncsi_request *ncsi_alloc_command(struct ncsi_cmd_arg *nca) +{ + struct ncsi_dev_priv *ndp = nca->ndp; + struct ncsi_dev *nd = &ndp->ndev; + struct net_device *dev = nd->dev; + int hlen = LL_RESERVED_SPACE(dev); + int tlen = dev->needed_tailroom; + int len = hlen + tlen; + struct sk_buff *skb; + struct ncsi_request *nr; + + nr = ncsi_alloc_request(ndp, nca->driven); + if (!nr) + return NULL; + + /* NCSI command packet has 16-bytes header, payload, 4 bytes checksum. + * The packet needs padding if its payload is less than 26 bytes to + * meet 64 bytes minimal ethernet frame length. + */ + len += sizeof(struct ncsi_cmd_pkt_hdr) + 4; + if (nca->payload < 26) + len += 26; + else + len += nca->payload; + + /* Allocate skb */ + skb = alloc_skb(len, GFP_ATOMIC); + if (!skb) { + ncsi_free_request(nr); + return NULL; + } + + nr->cmd = skb; + skb_reserve(skb, hlen); + skb_reset_network_header(skb); + + skb->dev = dev; + skb->protocol = htons(ETH_P_NCSI); + + return nr; +} + +int ncsi_xmit_cmd(struct ncsi_cmd_arg *nca) +{ + struct ncsi_request *nr; + struct ethhdr *eh; + struct ncsi_cmd_handler *nch = NULL; + int i, ret; + + /* Search for the handler */ + for (i = 0; i < ARRAY_SIZE(ncsi_cmd_handlers); i++) { + if (ncsi_cmd_handlers[i].type == nca->type) { + if (ncsi_cmd_handlers[i].handler) + nch = &ncsi_cmd_handlers[i]; + else + nch = NULL; + + break; + } + } + + if (!nch) { + netdev_err(nca->ndp->ndev.dev, + "Cannot send packet with type 0x%02x\n", nca->type); + return -ENOENT; + } + + /* Get packet payload length and allocate the request */ + nca->payload = nch->payload; + nr = ncsi_alloc_command(nca); + if (!nr) + return -ENOMEM; + + /* Prepare the packet */ + nca->id = nr->id; + ret = nch->handler(nr->cmd, nca); + if (ret) { + ncsi_free_request(nr); + return ret; + } + + /* Fill the ethernet header */ + eh = (struct ethhdr *)skb_push(nr->cmd, sizeof(*eh)); + eh->h_proto = htons(ETH_P_NCSI); + eth_broadcast_addr(eh->h_dest); + eth_broadcast_addr(eh->h_source); + + /* Start the timer for the request that might not have + * corresponding response. Given NCSI is an internal + * connection a 1 second delay should be sufficient. + */ + nr->enabled = true; + mod_timer(&nr->timer, jiffies + 1 * HZ); + + /* Send NCSI packet */ + skb_get(nr->cmd); + ret = dev_queue_xmit(nr->cmd); + if (ret < 0) { + ncsi_free_request(nr); + return ret; + } + + return 0; +} diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h new file mode 100644 index 000000000000..548145863c49 --- /dev/null +++ b/net/ncsi/ncsi-pkt.h @@ -0,0 +1,171 @@ +/* + * Copyright Gavin Shan, IBM Corporation 2016. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef __NCSI_PKT_H__ +#define __NCSI_PKT_H__ + +struct ncsi_pkt_hdr { + unsigned char mc_id; /* Management controller ID */ + unsigned char revision; /* NCSI version - 0x01 */ + unsigned char reserved; /* Reserved */ + unsigned char id; /* Packet sequence number */ + unsigned char type; /* Packet type */ + unsigned char channel; /* Network controller ID */ + __be16 length; /* Payload length */ + __be32 reserved1[2]; /* Reserved */ +}; + +struct ncsi_cmd_pkt_hdr { + struct ncsi_pkt_hdr common; /* Common NCSI packet header */ +}; + +/* NCSI common command packet */ +struct ncsi_cmd_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + __be32 checksum; /* Checksum */ + unsigned char pad[26]; +}; + +/* Select Package */ +struct ncsi_cmd_sp_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + unsigned char reserved[3]; /* Reserved */ + unsigned char hw_arbitration; /* HW arbitration */ + __be32 checksum; /* Checksum */ + unsigned char pad[22]; +}; + +/* Disable Channel */ +struct ncsi_cmd_dc_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + unsigned char reserved[3]; /* Reserved */ + unsigned char ald; /* Allow link down */ + __be32 checksum; /* Checksum */ + unsigned char pad[22]; +}; + +/* Reset Channel */ +struct ncsi_cmd_rc_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + __be32 reserved; /* Reserved */ + __be32 checksum; /* Checksum */ + unsigned char pad[22]; +}; + +/* AEN Enable */ +struct ncsi_cmd_ae_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + unsigned char reserved[3]; /* Reserved */ + unsigned char mc_id; /* MC ID */ + __be32 mode; /* AEN working mode */ + __be32 checksum; /* Checksum */ + unsigned char pad[18]; +}; + +/* Set Link */ +struct ncsi_cmd_sl_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + __be32 mode; /* Link working mode */ + __be32 oem_mode; /* OEM link mode */ + __be32 checksum; /* Checksum */ + unsigned char pad[18]; +}; + +/* Set VLAN Filter */ +struct ncsi_cmd_svf_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + __be16 reserved; /* Reserved */ + __be16 vlan; /* VLAN ID */ + __be16 reserved1; /* Reserved */ + unsigned char index; /* VLAN table index */ + unsigned char enable; /* Enable or disable */ + __be32 checksum; /* Checksum */ + unsigned char pad[14]; +}; + +/* Enable VLAN */ +struct ncsi_cmd_ev_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + unsigned char reserved[3]; /* Reserved */ + unsigned char mode; /* VLAN filter mode */ + __be32 checksum; /* Checksum */ + unsigned char pad[22]; +}; + +/* Set MAC Address */ +struct ncsi_cmd_sma_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + unsigned char mac[6]; /* MAC address */ + unsigned char index; /* MAC table index */ + unsigned char at_e; /* Addr type and operation */ + __be32 checksum; /* Checksum */ + unsigned char pad[18]; +}; + +/* Enable Broadcast Filter */ +struct ncsi_cmd_ebf_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + __be32 mode; /* Filter mode */ + __be32 checksum; /* Checksum */ + unsigned char pad[22]; +}; + +/* Enable Global Multicast Filter */ +struct ncsi_cmd_egmf_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + __be32 mode; /* Global MC mode */ + __be32 checksum; /* Checksum */ + unsigned char pad[22]; +}; + +/* Set NCSI Flow Control */ +struct ncsi_cmd_snfc_pkt { + struct ncsi_cmd_pkt_hdr cmd; /* Command header */ + unsigned char reserved[3]; /* Reserved */ + unsigned char mode; /* Flow control mode */ + __be32 checksum; /* Checksum */ + unsigned char pad[22]; +}; + +/* NCSI packet revision */ +#define NCSI_PKT_REVISION 0x01 + +/* NCSI packet commands */ +#define NCSI_PKT_CMD_CIS 0x00 /* Clear Initial State */ +#define NCSI_PKT_CMD_SP 0x01 /* Select Package */ +#define NCSI_PKT_CMD_DP 0x02 /* Deselect Package */ +#define NCSI_PKT_CMD_EC 0x03 /* Enable Channel */ +#define NCSI_PKT_CMD_DC 0x04 /* Disable Channel */ +#define NCSI_PKT_CMD_RC 0x05 /* Reset Channel */ +#define NCSI_PKT_CMD_ECNT 0x06 /* Enable Channel Network Tx */ +#define NCSI_PKT_CMD_DCNT 0x07 /* Disable Channel Network Tx */ +#define NCSI_PKT_CMD_AE 0x08 /* AEN Enable */ +#define NCSI_PKT_CMD_SL 0x09 /* Set Link */ +#define NCSI_PKT_CMD_GLS 0x0a /* Get Link */ +#define NCSI_PKT_CMD_SVF 0x0b /* Set VLAN Filter */ +#define NCSI_PKT_CMD_EV 0x0c /* Enable VLAN */ +#define NCSI_PKT_CMD_DV 0x0d /* Disable VLAN */ +#define NCSI_PKT_CMD_SMA 0x0e /* Set MAC address */ +#define NCSI_PKT_CMD_EBF 0x10 /* Enable Broadcast Filter */ +#define NCSI_PKT_CMD_DBF 0x11 /* Disable Broadcast Filter */ +#define NCSI_PKT_CMD_EGMF 0x12 /* Enable Global Multicast Filter */ +#define NCSI_PKT_CMD_DGMF 0x13 /* Disable Global Multicast Filter */ +#define NCSI_PKT_CMD_SNFC 0x14 /* Set NCSI Flow Control */ +#define NCSI_PKT_CMD_GVI 0x15 /* Get Version ID */ +#define NCSI_PKT_CMD_GC 0x16 /* Get Capabilities */ +#define NCSI_PKT_CMD_GP 0x17 /* Get Parameters */ +#define NCSI_PKT_CMD_GCPS 0x18 /* Get Controller Packet Statistics */ +#define NCSI_PKT_CMD_GNS 0x19 /* Get NCSI Statistics */ +#define NCSI_PKT_CMD_GNPTS 0x1a /* Get NCSI Pass-throu Statistics */ +#define NCSI_PKT_CMD_GPS 0x1b /* Get package status */ +#define NCSI_PKT_CMD_OEM 0x50 /* OEM */ +#define NCSI_PKT_CMD_PLDM 0x51 /* PLDM request over NCSI over RBT */ +#define NCSI_PKT_CMD_GPUUID 0x52 /* Get package UUID */ + +#endif /* __NCSI_PKT_H__ */ -- cgit v1.2.3 From 138635cc27c9737f940c3aa80912ff7a61c825af Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 19 Jul 2016 11:54:18 +1000 Subject: net/ncsi: NCSI response packet handler The NCSI response packets are sent to MC (Management Controller) from the remote end. They are responses of NCSI command packets for multiple purposes: completion status of NCSI command packets, return NCSI channel's capability or configuration etc. This defines struct to represent NCSI response packets and introduces function ncsi_rcv_rsp() which will be used to receive NCSI response packets and parse them. Signed-off-by: Gavin Shan Acked-by: Joel Stanley Signed-off-by: David S. Miller --- net/ncsi/Makefile | 2 +- net/ncsi/internal.h | 2 + net/ncsi/ncsi-pkt.h | 208 +++++++++++ net/ncsi/ncsi-rsp.c | 1016 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 1227 insertions(+), 1 deletion(-) create mode 100644 net/ncsi/ncsi-rsp.c (limited to 'net') diff --git a/net/ncsi/Makefile b/net/ncsi/Makefile index abc404631ca8..4751819e855b 100644 --- a/net/ncsi/Makefile +++ b/net/ncsi/Makefile @@ -1,4 +1,4 @@ # # Makefile for NCSI API # -obj-$(CONFIG_NET_NCSI) += ncsi-cmd.o ncsi-manage.o +obj-$(CONFIG_NET_NCSI) += ncsi-cmd.o ncsi-rsp.o ncsi-manage.o diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 3d81697a97d0..bd000c9c8249 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -271,5 +271,7 @@ struct ncsi_dev *ncsi_find_dev(struct net_device *dev); /* Packet handlers */ u32 ncsi_calculate_checksum(unsigned char *data, int len); int ncsi_xmit_cmd(struct ncsi_cmd_arg *nca); +int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev); #endif /* __NCSI_INTERNAL_H__ */ diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h index 548145863c49..4bdefd988354 100644 --- a/net/ncsi/ncsi-pkt.h +++ b/net/ncsi/ncsi-pkt.h @@ -25,6 +25,12 @@ struct ncsi_cmd_pkt_hdr { struct ncsi_pkt_hdr common; /* Common NCSI packet header */ }; +struct ncsi_rsp_pkt_hdr { + struct ncsi_pkt_hdr common; /* Common NCSI packet header */ + __be16 code; /* Response code */ + __be16 reason; /* Response reason */ +}; + /* NCSI common command packet */ struct ncsi_cmd_pkt { struct ncsi_cmd_pkt_hdr cmd; /* Command header */ @@ -32,6 +38,12 @@ struct ncsi_cmd_pkt { unsigned char pad[26]; }; +struct ncsi_rsp_pkt { + struct ncsi_rsp_pkt_hdr rsp; /* Response header */ + __be32 checksum; /* Checksum */ + unsigned char pad[22]; +}; + /* Select Package */ struct ncsi_cmd_sp_pkt { struct ncsi_cmd_pkt_hdr cmd; /* Command header */ @@ -133,6 +145,157 @@ struct ncsi_cmd_snfc_pkt { unsigned char pad[22]; }; +/* Get Link Status */ +struct ncsi_rsp_gls_pkt { + struct ncsi_rsp_pkt_hdr rsp; /* Response header */ + __be32 status; /* Link status */ + __be32 other; /* Other indications */ + __be32 oem_status; /* OEM link status */ + __be32 checksum; + unsigned char pad[10]; +}; + +/* Get Version ID */ +struct ncsi_rsp_gvi_pkt { + struct ncsi_rsp_pkt_hdr rsp; /* Response header */ + __be32 ncsi_version; /* NCSI version */ + unsigned char reserved[3]; /* Reserved */ + unsigned char alpha2; /* NCSI version */ + unsigned char fw_name[12]; /* f/w name string */ + __be32 fw_version; /* f/w version */ + __be16 pci_ids[4]; /* PCI IDs */ + __be32 mf_id; /* Manufacture ID */ + __be32 checksum; +}; + +/* Get Capabilities */ +struct ncsi_rsp_gc_pkt { + struct ncsi_rsp_pkt_hdr rsp; /* Response header */ + __be32 cap; /* Capabilities */ + __be32 bc_cap; /* Broadcast cap */ + __be32 mc_cap; /* Multicast cap */ + __be32 buf_cap; /* Buffering cap */ + __be32 aen_cap; /* AEN cap */ + unsigned char vlan_cnt; /* VLAN filter count */ + unsigned char mixed_cnt; /* Mix filter count */ + unsigned char mc_cnt; /* MC filter count */ + unsigned char uc_cnt; /* UC filter count */ + unsigned char reserved[2]; /* Reserved */ + unsigned char vlan_mode; /* VLAN mode */ + unsigned char channel_cnt; /* Channel count */ + __be32 checksum; /* Checksum */ +}; + +/* Get Parameters */ +struct ncsi_rsp_gp_pkt { + struct ncsi_rsp_pkt_hdr rsp; /* Response header */ + unsigned char mac_cnt; /* Number of MAC addr */ + unsigned char reserved[2]; /* Reserved */ + unsigned char mac_enable; /* MAC addr enable flags */ + unsigned char vlan_cnt; /* VLAN tag count */ + unsigned char reserved1; /* Reserved */ + __be16 vlan_enable; /* VLAN tag enable flags */ + __be32 link_mode; /* Link setting */ + __be32 bc_mode; /* BC filter mode */ + __be32 valid_modes; /* Valid mode parameters */ + unsigned char vlan_mode; /* VLAN mode */ + unsigned char fc_mode; /* Flow control mode */ + unsigned char reserved2[2]; /* Reserved */ + __be32 aen_mode; /* AEN mode */ + unsigned char mac[6]; /* Supported MAC addr */ + __be16 vlan; /* Supported VLAN tags */ + __be32 checksum; /* Checksum */ +}; + +/* Get Controller Packet Statistics */ +struct ncsi_rsp_gcps_pkt { + struct ncsi_rsp_pkt_hdr rsp; /* Response header */ + __be32 cnt_hi; /* Counter cleared */ + __be32 cnt_lo; /* Counter cleared */ + __be32 rx_bytes; /* Rx bytes */ + __be32 tx_bytes; /* Tx bytes */ + __be32 rx_uc_pkts; /* Rx UC packets */ + __be32 rx_mc_pkts; /* Rx MC packets */ + __be32 rx_bc_pkts; /* Rx BC packets */ + __be32 tx_uc_pkts; /* Tx UC packets */ + __be32 tx_mc_pkts; /* Tx MC packets */ + __be32 tx_bc_pkts; /* Tx BC packets */ + __be32 fcs_err; /* FCS errors */ + __be32 align_err; /* Alignment errors */ + __be32 false_carrier; /* False carrier detection */ + __be32 runt_pkts; /* Rx runt packets */ + __be32 jabber_pkts; /* Rx jabber packets */ + __be32 rx_pause_xon; /* Rx pause XON frames */ + __be32 rx_pause_xoff; /* Rx XOFF frames */ + __be32 tx_pause_xon; /* Tx XON frames */ + __be32 tx_pause_xoff; /* Tx XOFF frames */ + __be32 tx_s_collision; /* Single collision frames */ + __be32 tx_m_collision; /* Multiple collision frames */ + __be32 l_collision; /* Late collision frames */ + __be32 e_collision; /* Excessive collision frames */ + __be32 rx_ctl_frames; /* Rx control frames */ + __be32 rx_64_frames; /* Rx 64-bytes frames */ + __be32 rx_127_frames; /* Rx 65-127 bytes frames */ + __be32 rx_255_frames; /* Rx 128-255 bytes frames */ + __be32 rx_511_frames; /* Rx 256-511 bytes frames */ + __be32 rx_1023_frames; /* Rx 512-1023 bytes frames */ + __be32 rx_1522_frames; /* Rx 1024-1522 bytes frames */ + __be32 rx_9022_frames; /* Rx 1523-9022 bytes frames */ + __be32 tx_64_frames; /* Tx 64-bytes frames */ + __be32 tx_127_frames; /* Tx 65-127 bytes frames */ + __be32 tx_255_frames; /* Tx 128-255 bytes frames */ + __be32 tx_511_frames; /* Tx 256-511 bytes frames */ + __be32 tx_1023_frames; /* Tx 512-1023 bytes frames */ + __be32 tx_1522_frames; /* Tx 1024-1522 bytes frames */ + __be32 tx_9022_frames; /* Tx 1523-9022 bytes frames */ + __be32 rx_valid_bytes; /* Rx valid bytes */ + __be32 rx_runt_pkts; /* Rx error runt packets */ + __be32 rx_jabber_pkts; /* Rx error jabber packets */ + __be32 checksum; /* Checksum */ +}; + +/* Get NCSI Statistics */ +struct ncsi_rsp_gns_pkt { + struct ncsi_rsp_pkt_hdr rsp; /* Response header */ + __be32 rx_cmds; /* Rx NCSI commands */ + __be32 dropped_cmds; /* Dropped commands */ + __be32 cmd_type_errs; /* Command type errors */ + __be32 cmd_csum_errs; /* Command checksum errors */ + __be32 rx_pkts; /* Rx NCSI packets */ + __be32 tx_pkts; /* Tx NCSI packets */ + __be32 tx_aen_pkts; /* Tx AEN packets */ + __be32 checksum; /* Checksum */ +}; + +/* Get NCSI Pass-through Statistics */ +struct ncsi_rsp_gnpts_pkt { + struct ncsi_rsp_pkt_hdr rsp; /* Response header */ + __be32 tx_pkts; /* Tx packets */ + __be32 tx_dropped; /* Tx dropped packets */ + __be32 tx_channel_err; /* Tx channel errors */ + __be32 tx_us_err; /* Tx undersize errors */ + __be32 rx_pkts; /* Rx packets */ + __be32 rx_dropped; /* Rx dropped packets */ + __be32 rx_channel_err; /* Rx channel errors */ + __be32 rx_us_err; /* Rx undersize errors */ + __be32 rx_os_err; /* Rx oversize errors */ + __be32 checksum; /* Checksum */ +}; + +/* Get package status */ +struct ncsi_rsp_gps_pkt { + struct ncsi_rsp_pkt_hdr rsp; /* Response header */ + __be32 status; /* Hardware arbitration status */ + __be32 checksum; +}; + +/* Get package UUID */ +struct ncsi_rsp_gpuuid_pkt { + struct ncsi_rsp_pkt_hdr rsp; /* Response header */ + unsigned char uuid[16]; /* UUID */ + __be32 checksum; +}; + /* NCSI packet revision */ #define NCSI_PKT_REVISION 0x01 @@ -168,4 +331,49 @@ struct ncsi_cmd_snfc_pkt { #define NCSI_PKT_CMD_PLDM 0x51 /* PLDM request over NCSI over RBT */ #define NCSI_PKT_CMD_GPUUID 0x52 /* Get package UUID */ +/* NCSI packet responses */ +#define NCSI_PKT_RSP_CIS (NCSI_PKT_CMD_CIS + 0x80) +#define NCSI_PKT_RSP_SP (NCSI_PKT_CMD_SP + 0x80) +#define NCSI_PKT_RSP_DP (NCSI_PKT_CMD_DP + 0x80) +#define NCSI_PKT_RSP_EC (NCSI_PKT_CMD_EC + 0x80) +#define NCSI_PKT_RSP_DC (NCSI_PKT_CMD_DC + 0x80) +#define NCSI_PKT_RSP_RC (NCSI_PKT_CMD_RC + 0x80) +#define NCSI_PKT_RSP_ECNT (NCSI_PKT_CMD_ECNT + 0x80) +#define NCSI_PKT_RSP_DCNT (NCSI_PKT_CMD_DCNT + 0x80) +#define NCSI_PKT_RSP_AE (NCSI_PKT_CMD_AE + 0x80) +#define NCSI_PKT_RSP_SL (NCSI_PKT_CMD_SL + 0x80) +#define NCSI_PKT_RSP_GLS (NCSI_PKT_CMD_GLS + 0x80) +#define NCSI_PKT_RSP_SVF (NCSI_PKT_CMD_SVF + 0x80) +#define NCSI_PKT_RSP_EV (NCSI_PKT_CMD_EV + 0x80) +#define NCSI_PKT_RSP_DV (NCSI_PKT_CMD_DV + 0x80) +#define NCSI_PKT_RSP_SMA (NCSI_PKT_CMD_SMA + 0x80) +#define NCSI_PKT_RSP_EBF (NCSI_PKT_CMD_EBF + 0x80) +#define NCSI_PKT_RSP_DBF (NCSI_PKT_CMD_DBF + 0x80) +#define NCSI_PKT_RSP_EGMF (NCSI_PKT_CMD_EGMF + 0x80) +#define NCSI_PKT_RSP_DGMF (NCSI_PKT_CMD_DGMF + 0x80) +#define NCSI_PKT_RSP_SNFC (NCSI_PKT_CMD_SNFC + 0x80) +#define NCSI_PKT_RSP_GVI (NCSI_PKT_CMD_GVI + 0x80) +#define NCSI_PKT_RSP_GC (NCSI_PKT_CMD_GC + 0x80) +#define NCSI_PKT_RSP_GP (NCSI_PKT_CMD_GP + 0x80) +#define NCSI_PKT_RSP_GCPS (NCSI_PKT_CMD_GCPS + 0x80) +#define NCSI_PKT_RSP_GNS (NCSI_PKT_CMD_GNS + 0x80) +#define NCSI_PKT_RSP_GNPTS (NCSI_PKT_CMD_GNPTS + 0x80) +#define NCSI_PKT_RSP_GPS (NCSI_PKT_CMD_GPS + 0x80) +#define NCSI_PKT_RSP_OEM (NCSI_PKT_CMD_OEM + 0x80) +#define NCSI_PKT_RSP_PLDM (NCSI_PKT_CMD_PLDM + 0x80) +#define NCSI_PKT_RSP_GPUUID (NCSI_PKT_CMD_GPUUID + 0x80) + +/* NCSI response code/reason */ +#define NCSI_PKT_RSP_C_COMPLETED 0x0000 /* Command Completed */ +#define NCSI_PKT_RSP_C_FAILED 0x0001 /* Command Failed */ +#define NCSI_PKT_RSP_C_UNAVAILABLE 0x0002 /* Command Unavailable */ +#define NCSI_PKT_RSP_C_UNSUPPORTED 0x0003 /* Command Unsupported */ +#define NCSI_PKT_RSP_R_NO_ERROR 0x0000 /* No Error */ +#define NCSI_PKT_RSP_R_INTERFACE 0x0001 /* Interface not ready */ +#define NCSI_PKT_RSP_R_PARAM 0x0002 /* Invalid Parameter */ +#define NCSI_PKT_RSP_R_CHANNEL 0x0003 /* Channel not Ready */ +#define NCSI_PKT_RSP_R_PACKAGE 0x0004 /* Package not Ready */ +#define NCSI_PKT_RSP_R_LENGTH 0x0005 /* Invalid payload length */ +#define NCSI_PKT_RSP_R_UNKNOWN 0x7fff /* Command type unsupported */ + #endif /* __NCSI_PKT_H__ */ diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c new file mode 100644 index 000000000000..6ec25cb2608c --- /dev/null +++ b/net/ncsi/ncsi-rsp.c @@ -0,0 +1,1016 @@ +/* + * Copyright Gavin Shan, IBM Corporation 2016. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "internal.h" +#include "ncsi-pkt.h" + +static int ncsi_validate_rsp_pkt(struct ncsi_request *nr, + unsigned short payload) +{ + struct ncsi_rsp_pkt_hdr *h; + u32 checksum; + __be32 *pchecksum; + + /* Check NCSI packet header. We don't need validate + * the packet type, which should have been checked + * before calling this function. + */ + h = (struct ncsi_rsp_pkt_hdr *)skb_network_header(nr->rsp); + if (h->common.revision != NCSI_PKT_REVISION) + return -EINVAL; + if (ntohs(h->common.length) != payload) + return -EINVAL; + + /* Check on code and reason */ + if (ntohs(h->code) != NCSI_PKT_RSP_C_COMPLETED || + ntohs(h->reason) != NCSI_PKT_RSP_R_NO_ERROR) + return -EINVAL; + + /* Validate checksum, which might be zeroes if the + * sender doesn't support checksum according to NCSI + * specification. + */ + pchecksum = (__be32 *)((void *)(h + 1) + payload - 4); + if (ntohl(*pchecksum) == 0) + return 0; + + checksum = ncsi_calculate_checksum((unsigned char *)h, + sizeof(*h) + payload - 4); + if (*pchecksum != htonl(checksum)) + return -EINVAL; + + return 0; +} + +static int ncsi_rsp_handler_cis(struct ncsi_request *nr) +{ + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_package *np; + struct ncsi_channel *nc; + unsigned char id; + + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, &np, &nc); + if (!nc) { + id = NCSI_CHANNEL_INDEX(rsp->rsp.common.channel); + nc = ncsi_add_channel(np, id); + } + + return nc ? 0 : -ENODEV; +} + +static int ncsi_rsp_handler_sp(struct ncsi_request *nr) +{ + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_package *np; + unsigned char id; + + /* Add the package if it's not existing. Otherwise, + * to change the state of its child channels. + */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + &np, NULL); + if (!np) { + id = NCSI_PACKAGE_INDEX(rsp->rsp.common.channel); + np = ncsi_add_package(ndp, id); + if (!np) + return -ENODEV; + } + + return 0; +} + +static int ncsi_rsp_handler_dp(struct ncsi_request *nr) +{ + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_package *np; + struct ncsi_channel *nc; + unsigned long flags; + + /* Find the package */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + &np, NULL); + if (!np) + return -ENODEV; + + /* Change state of all channels attached to the package */ + NCSI_FOR_EACH_CHANNEL(np, nc) { + spin_lock_irqsave(&nc->lock, flags); + nc->state = NCSI_CHANNEL_INACTIVE; + spin_unlock_irqrestore(&nc->lock, flags); + } + + return 0; +} + +static int ncsi_rsp_handler_ec(struct ncsi_request *nr) +{ + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + ncm = &nc->modes[NCSI_MODE_ENABLE]; + if (ncm->enable) + return -EBUSY; + + ncm->enable = 1; + return 0; +} + +static int ncsi_rsp_handler_dc(struct ncsi_request *nr) +{ + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + int ret; + + ret = ncsi_validate_rsp_pkt(nr, 4); + if (ret) + return ret; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + ncm = &nc->modes[NCSI_MODE_ENABLE]; + if (!ncm->enable) + return -EBUSY; + + ncm->enable = 0; + return 0; +} + +static int ncsi_rsp_handler_rc(struct ncsi_request *nr) +{ + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + unsigned long flags; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Update state for the specified channel */ + spin_lock_irqsave(&nc->lock, flags); + nc->state = NCSI_CHANNEL_INACTIVE; + spin_unlock_irqrestore(&nc->lock, flags); + + return 0; +} + +static int ncsi_rsp_handler_ecnt(struct ncsi_request *nr) +{ + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + ncm = &nc->modes[NCSI_MODE_TX_ENABLE]; + if (ncm->enable) + return -EBUSY; + + ncm->enable = 1; + return 0; +} + +static int ncsi_rsp_handler_dcnt(struct ncsi_request *nr) +{ + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + ncm = &nc->modes[NCSI_MODE_TX_ENABLE]; + if (!ncm->enable) + return -EBUSY; + + ncm->enable = 1; + return 0; +} + +static int ncsi_rsp_handler_ae(struct ncsi_request *nr) +{ + struct ncsi_cmd_ae_pkt *cmd; + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Check if the AEN has been enabled */ + ncm = &nc->modes[NCSI_MODE_AEN]; + if (ncm->enable) + return -EBUSY; + + /* Update to AEN configuration */ + cmd = (struct ncsi_cmd_ae_pkt *)skb_network_header(nr->cmd); + ncm->enable = 1; + ncm->data[0] = cmd->mc_id; + ncm->data[1] = ntohl(cmd->mode); + + return 0; +} + +static int ncsi_rsp_handler_sl(struct ncsi_request *nr) +{ + struct ncsi_cmd_sl_pkt *cmd; + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + cmd = (struct ncsi_cmd_sl_pkt *)skb_network_header(nr->cmd); + ncm = &nc->modes[NCSI_MODE_LINK]; + ncm->data[0] = ntohl(cmd->mode); + ncm->data[1] = ntohl(cmd->oem_mode); + + return 0; +} + +static int ncsi_rsp_handler_gls(struct ncsi_request *nr) +{ + struct ncsi_rsp_gls_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_gls_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + ncm = &nc->modes[NCSI_MODE_LINK]; + ncm->data[2] = ntohl(rsp->status); + ncm->data[3] = ntohl(rsp->other); + ncm->data[4] = ntohl(rsp->oem_status); + + return 0; +} + +static int ncsi_rsp_handler_svf(struct ncsi_request *nr) +{ + struct ncsi_cmd_svf_pkt *cmd; + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_filter *ncf; + unsigned short vlan; + int ret; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + cmd = (struct ncsi_cmd_svf_pkt *)skb_network_header(nr->cmd); + ncf = nc->filters[NCSI_FILTER_VLAN]; + if (!ncf) + return -ENOENT; + if (cmd->index >= ncf->total) + return -ERANGE; + + /* Add or remove the VLAN filter */ + if (!(cmd->enable & 0x1)) { + ret = ncsi_remove_filter(nc, NCSI_FILTER_VLAN, cmd->index); + } else { + vlan = ntohs(cmd->vlan); + ret = ncsi_add_filter(nc, NCSI_FILTER_VLAN, &vlan); + } + + return ret; +} + +static int ncsi_rsp_handler_ev(struct ncsi_request *nr) +{ + struct ncsi_cmd_ev_pkt *cmd; + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Check if VLAN mode has been enabled */ + ncm = &nc->modes[NCSI_MODE_VLAN]; + if (ncm->enable) + return -EBUSY; + + /* Update to VLAN mode */ + cmd = (struct ncsi_cmd_ev_pkt *)skb_network_header(nr->cmd); + ncm->enable = 1; + ncm->data[0] = ntohl(cmd->mode); + + return 0; +} + +static int ncsi_rsp_handler_dv(struct ncsi_request *nr) +{ + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Check if VLAN mode has been enabled */ + ncm = &nc->modes[NCSI_MODE_VLAN]; + if (!ncm->enable) + return -EBUSY; + + /* Update to VLAN mode */ + ncm->enable = 0; + return 0; +} + +static int ncsi_rsp_handler_sma(struct ncsi_request *nr) +{ + struct ncsi_cmd_sma_pkt *cmd; + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_filter *ncf; + void *bitmap; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* According to NCSI spec 1.01, the mixed filter table + * isn't supported yet. + */ + cmd = (struct ncsi_cmd_sma_pkt *)skb_network_header(nr->cmd); + switch (cmd->at_e >> 5) { + case 0x0: /* UC address */ + ncf = nc->filters[NCSI_FILTER_UC]; + break; + case 0x1: /* MC address */ + ncf = nc->filters[NCSI_FILTER_MC]; + break; + default: + return -EINVAL; + } + + /* Sanity check on the filter */ + if (!ncf) + return -ENOENT; + else if (cmd->index >= ncf->total) + return -ERANGE; + + bitmap = &ncf->bitmap; + if (cmd->at_e & 0x1) { + if (test_and_set_bit(cmd->index, bitmap)) + return -EBUSY; + memcpy(ncf->data + 6 * cmd->index, cmd->mac, 6); + } else { + if (!test_and_clear_bit(cmd->index, bitmap)) + return -EBUSY; + + memset(ncf->data + 6 * cmd->index, 0, 6); + } + + return 0; +} + +static int ncsi_rsp_handler_ebf(struct ncsi_request *nr) +{ + struct ncsi_cmd_ebf_pkt *cmd; + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + + /* Find the package and channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, NULL, &nc); + if (!nc) + return -ENODEV; + + /* Check if broadcast filter has been enabled */ + ncm = &nc->modes[NCSI_MODE_BC]; + if (ncm->enable) + return -EBUSY; + + /* Update to broadcast filter mode */ + cmd = (struct ncsi_cmd_ebf_pkt *)skb_network_header(nr->cmd); + ncm->enable = 1; + ncm->data[0] = ntohl(cmd->mode); + + return 0; +} + +static int ncsi_rsp_handler_dbf(struct ncsi_request *nr) +{ + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Check if broadcast filter isn't enabled */ + ncm = &nc->modes[NCSI_MODE_BC]; + if (!ncm->enable) + return -EBUSY; + + /* Update to broadcast filter mode */ + ncm->enable = 0; + ncm->data[0] = 0; + + return 0; +} + +static int ncsi_rsp_handler_egmf(struct ncsi_request *nr) +{ + struct ncsi_cmd_egmf_pkt *cmd; + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + + /* Find the channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Check if multicast filter has been enabled */ + ncm = &nc->modes[NCSI_MODE_MC]; + if (ncm->enable) + return -EBUSY; + + /* Update to multicast filter mode */ + cmd = (struct ncsi_cmd_egmf_pkt *)skb_network_header(nr->cmd); + ncm->enable = 1; + ncm->data[0] = ntohl(cmd->mode); + + return 0; +} + +static int ncsi_rsp_handler_dgmf(struct ncsi_request *nr) +{ + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Check if multicast filter has been enabled */ + ncm = &nc->modes[NCSI_MODE_MC]; + if (!ncm->enable) + return -EBUSY; + + /* Update to multicast filter mode */ + ncm->enable = 0; + ncm->data[0] = 0; + + return 0; +} + +static int ncsi_rsp_handler_snfc(struct ncsi_request *nr) +{ + struct ncsi_cmd_snfc_pkt *cmd; + struct ncsi_rsp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + + /* Find the channel */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Check if flow control has been enabled */ + ncm = &nc->modes[NCSI_MODE_FC]; + if (ncm->enable) + return -EBUSY; + + /* Update to flow control mode */ + cmd = (struct ncsi_cmd_snfc_pkt *)skb_network_header(nr->cmd); + ncm->enable = 1; + ncm->data[0] = cmd->mode; + + return 0; +} + +static int ncsi_rsp_handler_gvi(struct ncsi_request *nr) +{ + struct ncsi_rsp_gvi_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_version *ncv; + int i; + + /* Find the channel */ + rsp = (struct ncsi_rsp_gvi_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Update to channel's version info */ + ncv = &nc->version; + ncv->version = ntohl(rsp->ncsi_version); + ncv->alpha2 = rsp->alpha2; + memcpy(ncv->fw_name, rsp->fw_name, 12); + ncv->fw_version = ntohl(rsp->fw_version); + for (i = 0; i < ARRAY_SIZE(ncv->pci_ids); i++) + ncv->pci_ids[i] = ntohs(rsp->pci_ids[i]); + ncv->mf_id = ntohl(rsp->mf_id); + + return 0; +} + +static int ncsi_rsp_handler_gc(struct ncsi_request *nr) +{ + struct ncsi_rsp_gc_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_filter *ncf; + size_t size, entry_size; + int cnt, i; + + /* Find the channel */ + rsp = (struct ncsi_rsp_gc_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Update channel's capabilities */ + nc->caps[NCSI_CAP_GENERIC].cap = ntohl(rsp->cap) & + NCSI_CAP_GENERIC_MASK; + nc->caps[NCSI_CAP_BC].cap = ntohl(rsp->bc_cap) & + NCSI_CAP_BC_MASK; + nc->caps[NCSI_CAP_MC].cap = ntohl(rsp->mc_cap) & + NCSI_CAP_MC_MASK; + nc->caps[NCSI_CAP_BUFFER].cap = ntohl(rsp->buf_cap); + nc->caps[NCSI_CAP_AEN].cap = ntohl(rsp->aen_cap) & + NCSI_CAP_AEN_MASK; + nc->caps[NCSI_CAP_VLAN].cap = rsp->vlan_mode & + NCSI_CAP_VLAN_MASK; + + /* Build filters */ + for (i = 0; i < NCSI_FILTER_MAX; i++) { + switch (i) { + case NCSI_FILTER_VLAN: + cnt = rsp->vlan_cnt; + entry_size = 2; + break; + case NCSI_FILTER_MIXED: + cnt = rsp->mixed_cnt; + entry_size = 6; + break; + case NCSI_FILTER_MC: + cnt = rsp->mc_cnt; + entry_size = 6; + break; + case NCSI_FILTER_UC: + cnt = rsp->uc_cnt; + entry_size = 6; + break; + default: + continue; + } + + if (!cnt || nc->filters[i]) + continue; + + size = sizeof(*ncf) + cnt * entry_size; + ncf = kzalloc(size, GFP_ATOMIC); + if (!ncf) { + pr_warn("%s: Cannot alloc filter table (%d)\n", + __func__, i); + return -ENOMEM; + } + + ncf->index = i; + ncf->total = cnt; + ncf->bitmap = 0x0ul; + nc->filters[i] = ncf; + } + + return 0; +} + +static int ncsi_rsp_handler_gp(struct ncsi_request *nr) +{ + struct ncsi_rsp_gp_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + unsigned short enable, vlan; + unsigned char *pdata; + int table, i; + + /* Find the channel */ + rsp = (struct ncsi_rsp_gp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Modes with explicit enabled indications */ + if (ntohl(rsp->valid_modes) & 0x1) { /* BC filter mode */ + nc->modes[NCSI_MODE_BC].enable = 1; + nc->modes[NCSI_MODE_BC].data[0] = ntohl(rsp->bc_mode); + } + if (ntohl(rsp->valid_modes) & 0x2) /* Channel enabled */ + nc->modes[NCSI_MODE_ENABLE].enable = 1; + if (ntohl(rsp->valid_modes) & 0x4) /* Channel Tx enabled */ + nc->modes[NCSI_MODE_TX_ENABLE].enable = 1; + if (ntohl(rsp->valid_modes) & 0x8) /* MC filter mode */ + nc->modes[NCSI_MODE_MC].enable = 1; + + /* Modes without explicit enabled indications */ + nc->modes[NCSI_MODE_LINK].enable = 1; + nc->modes[NCSI_MODE_LINK].data[0] = ntohl(rsp->link_mode); + nc->modes[NCSI_MODE_VLAN].enable = 1; + nc->modes[NCSI_MODE_VLAN].data[0] = rsp->vlan_mode; + nc->modes[NCSI_MODE_FC].enable = 1; + nc->modes[NCSI_MODE_FC].data[0] = rsp->fc_mode; + nc->modes[NCSI_MODE_AEN].enable = 1; + nc->modes[NCSI_MODE_AEN].data[0] = ntohl(rsp->aen_mode); + + /* MAC addresses filter table */ + pdata = (unsigned char *)rsp + 48; + enable = rsp->mac_enable; + for (i = 0; i < rsp->mac_cnt; i++, pdata += 6) { + if (i >= (nc->filters[NCSI_FILTER_UC]->total + + nc->filters[NCSI_FILTER_MC]->total)) + table = NCSI_FILTER_MIXED; + else if (i >= nc->filters[NCSI_FILTER_UC]->total) + table = NCSI_FILTER_MC; + else + table = NCSI_FILTER_UC; + + if (!(enable & (0x1 << i))) + continue; + + if (ncsi_find_filter(nc, table, pdata) >= 0) + continue; + + ncsi_add_filter(nc, table, pdata); + } + + /* VLAN filter table */ + enable = ntohs(rsp->vlan_enable); + for (i = 0; i < rsp->vlan_cnt; i++, pdata += 2) { + if (!(enable & (0x1 << i))) + continue; + + vlan = ntohs(*(__be16 *)pdata); + if (ncsi_find_filter(nc, NCSI_FILTER_VLAN, &vlan) >= 0) + continue; + + ncsi_add_filter(nc, NCSI_FILTER_VLAN, &vlan); + } + + return 0; +} + +static int ncsi_rsp_handler_gcps(struct ncsi_request *nr) +{ + struct ncsi_rsp_gcps_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_stats *ncs; + + /* Find the channel */ + rsp = (struct ncsi_rsp_gcps_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Update HNC's statistics */ + ncs = &nc->stats; + ncs->hnc_cnt_hi = ntohl(rsp->cnt_hi); + ncs->hnc_cnt_lo = ntohl(rsp->cnt_lo); + ncs->hnc_rx_bytes = ntohl(rsp->rx_bytes); + ncs->hnc_tx_bytes = ntohl(rsp->tx_bytes); + ncs->hnc_rx_uc_pkts = ntohl(rsp->rx_uc_pkts); + ncs->hnc_rx_mc_pkts = ntohl(rsp->rx_mc_pkts); + ncs->hnc_rx_bc_pkts = ntohl(rsp->rx_bc_pkts); + ncs->hnc_tx_uc_pkts = ntohl(rsp->tx_uc_pkts); + ncs->hnc_tx_mc_pkts = ntohl(rsp->tx_mc_pkts); + ncs->hnc_tx_bc_pkts = ntohl(rsp->tx_bc_pkts); + ncs->hnc_fcs_err = ntohl(rsp->fcs_err); + ncs->hnc_align_err = ntohl(rsp->align_err); + ncs->hnc_false_carrier = ntohl(rsp->false_carrier); + ncs->hnc_runt_pkts = ntohl(rsp->runt_pkts); + ncs->hnc_jabber_pkts = ntohl(rsp->jabber_pkts); + ncs->hnc_rx_pause_xon = ntohl(rsp->rx_pause_xon); + ncs->hnc_rx_pause_xoff = ntohl(rsp->rx_pause_xoff); + ncs->hnc_tx_pause_xon = ntohl(rsp->tx_pause_xon); + ncs->hnc_tx_pause_xoff = ntohl(rsp->tx_pause_xoff); + ncs->hnc_tx_s_collision = ntohl(rsp->tx_s_collision); + ncs->hnc_tx_m_collision = ntohl(rsp->tx_m_collision); + ncs->hnc_l_collision = ntohl(rsp->l_collision); + ncs->hnc_e_collision = ntohl(rsp->e_collision); + ncs->hnc_rx_ctl_frames = ntohl(rsp->rx_ctl_frames); + ncs->hnc_rx_64_frames = ntohl(rsp->rx_64_frames); + ncs->hnc_rx_127_frames = ntohl(rsp->rx_127_frames); + ncs->hnc_rx_255_frames = ntohl(rsp->rx_255_frames); + ncs->hnc_rx_511_frames = ntohl(rsp->rx_511_frames); + ncs->hnc_rx_1023_frames = ntohl(rsp->rx_1023_frames); + ncs->hnc_rx_1522_frames = ntohl(rsp->rx_1522_frames); + ncs->hnc_rx_9022_frames = ntohl(rsp->rx_9022_frames); + ncs->hnc_tx_64_frames = ntohl(rsp->tx_64_frames); + ncs->hnc_tx_127_frames = ntohl(rsp->tx_127_frames); + ncs->hnc_tx_255_frames = ntohl(rsp->tx_255_frames); + ncs->hnc_tx_511_frames = ntohl(rsp->tx_511_frames); + ncs->hnc_tx_1023_frames = ntohl(rsp->tx_1023_frames); + ncs->hnc_tx_1522_frames = ntohl(rsp->tx_1522_frames); + ncs->hnc_tx_9022_frames = ntohl(rsp->tx_9022_frames); + ncs->hnc_rx_valid_bytes = ntohl(rsp->rx_valid_bytes); + ncs->hnc_rx_runt_pkts = ntohl(rsp->rx_runt_pkts); + ncs->hnc_rx_jabber_pkts = ntohl(rsp->rx_jabber_pkts); + + return 0; +} + +static int ncsi_rsp_handler_gns(struct ncsi_request *nr) +{ + struct ncsi_rsp_gns_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_stats *ncs; + + /* Find the channel */ + rsp = (struct ncsi_rsp_gns_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Update HNC's statistics */ + ncs = &nc->stats; + ncs->ncsi_rx_cmds = ntohl(rsp->rx_cmds); + ncs->ncsi_dropped_cmds = ntohl(rsp->dropped_cmds); + ncs->ncsi_cmd_type_errs = ntohl(rsp->cmd_type_errs); + ncs->ncsi_cmd_csum_errs = ntohl(rsp->cmd_csum_errs); + ncs->ncsi_rx_pkts = ntohl(rsp->rx_pkts); + ncs->ncsi_tx_pkts = ntohl(rsp->tx_pkts); + ncs->ncsi_tx_aen_pkts = ntohl(rsp->tx_aen_pkts); + + return 0; +} + +static int ncsi_rsp_handler_gnpts(struct ncsi_request *nr) +{ + struct ncsi_rsp_gnpts_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_channel *nc; + struct ncsi_channel_stats *ncs; + + /* Find the channel */ + rsp = (struct ncsi_rsp_gnpts_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + NULL, &nc); + if (!nc) + return -ENODEV; + + /* Update HNC's statistics */ + ncs = &nc->stats; + ncs->pt_tx_pkts = ntohl(rsp->tx_pkts); + ncs->pt_tx_dropped = ntohl(rsp->tx_dropped); + ncs->pt_tx_channel_err = ntohl(rsp->tx_channel_err); + ncs->pt_tx_us_err = ntohl(rsp->tx_us_err); + ncs->pt_rx_pkts = ntohl(rsp->rx_pkts); + ncs->pt_rx_dropped = ntohl(rsp->rx_dropped); + ncs->pt_rx_channel_err = ntohl(rsp->rx_channel_err); + ncs->pt_rx_us_err = ntohl(rsp->rx_us_err); + ncs->pt_rx_os_err = ntohl(rsp->rx_os_err); + + return 0; +} + +static int ncsi_rsp_handler_gps(struct ncsi_request *nr) +{ + struct ncsi_rsp_gps_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_package *np; + + /* Find the package */ + rsp = (struct ncsi_rsp_gps_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + &np, NULL); + if (!np) + return -ENODEV; + + return 0; +} + +static int ncsi_rsp_handler_gpuuid(struct ncsi_request *nr) +{ + struct ncsi_rsp_gpuuid_pkt *rsp; + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_package *np; + + /* Find the package */ + rsp = (struct ncsi_rsp_gpuuid_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + &np, NULL); + if (!np) + return -ENODEV; + + memcpy(np->uuid, rsp->uuid, sizeof(rsp->uuid)); + + return 0; +} + +static struct ncsi_rsp_handler { + unsigned char type; + int payload; + int (*handler)(struct ncsi_request *nr); +} ncsi_rsp_handlers[] = { + { NCSI_PKT_RSP_CIS, 4, ncsi_rsp_handler_cis }, + { NCSI_PKT_RSP_SP, 4, ncsi_rsp_handler_sp }, + { NCSI_PKT_RSP_DP, 4, ncsi_rsp_handler_dp }, + { NCSI_PKT_RSP_EC, 4, ncsi_rsp_handler_ec }, + { NCSI_PKT_RSP_DC, 4, ncsi_rsp_handler_dc }, + { NCSI_PKT_RSP_RC, 4, ncsi_rsp_handler_rc }, + { NCSI_PKT_RSP_ECNT, 4, ncsi_rsp_handler_ecnt }, + { NCSI_PKT_RSP_DCNT, 4, ncsi_rsp_handler_dcnt }, + { NCSI_PKT_RSP_AE, 4, ncsi_rsp_handler_ae }, + { NCSI_PKT_RSP_SL, 4, ncsi_rsp_handler_sl }, + { NCSI_PKT_RSP_GLS, 16, ncsi_rsp_handler_gls }, + { NCSI_PKT_RSP_SVF, 4, ncsi_rsp_handler_svf }, + { NCSI_PKT_RSP_EV, 4, ncsi_rsp_handler_ev }, + { NCSI_PKT_RSP_DV, 4, ncsi_rsp_handler_dv }, + { NCSI_PKT_RSP_SMA, 4, ncsi_rsp_handler_sma }, + { NCSI_PKT_RSP_EBF, 4, ncsi_rsp_handler_ebf }, + { NCSI_PKT_RSP_DBF, 4, ncsi_rsp_handler_dbf }, + { NCSI_PKT_RSP_EGMF, 4, ncsi_rsp_handler_egmf }, + { NCSI_PKT_RSP_DGMF, 4, ncsi_rsp_handler_dgmf }, + { NCSI_PKT_RSP_SNFC, 4, ncsi_rsp_handler_snfc }, + { NCSI_PKT_RSP_GVI, 36, ncsi_rsp_handler_gvi }, + { NCSI_PKT_RSP_GC, 32, ncsi_rsp_handler_gc }, + { NCSI_PKT_RSP_GP, -1, ncsi_rsp_handler_gp }, + { NCSI_PKT_RSP_GCPS, 172, ncsi_rsp_handler_gcps }, + { NCSI_PKT_RSP_GNS, 172, ncsi_rsp_handler_gns }, + { NCSI_PKT_RSP_GNPTS, 172, ncsi_rsp_handler_gnpts }, + { NCSI_PKT_RSP_GPS, 8, ncsi_rsp_handler_gps }, + { NCSI_PKT_RSP_OEM, 0, NULL }, + { NCSI_PKT_RSP_PLDM, 0, NULL }, + { NCSI_PKT_RSP_GPUUID, 20, ncsi_rsp_handler_gpuuid } +}; + +int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct ncsi_rsp_handler *nrh = NULL; + struct ncsi_dev *nd; + struct ncsi_dev_priv *ndp; + struct ncsi_request *nr; + struct ncsi_pkt_hdr *hdr; + unsigned long flags; + int payload, i, ret; + + /* Find the NCSI device */ + nd = ncsi_find_dev(dev); + ndp = nd ? TO_NCSI_DEV_PRIV(nd) : NULL; + if (!ndp) + return -ENODEV; + + /* Find the handler */ + hdr = (struct ncsi_pkt_hdr *)skb_network_header(skb); + for (i = 0; i < ARRAY_SIZE(ncsi_rsp_handlers); i++) { + if (ncsi_rsp_handlers[i].type == hdr->type) { + if (ncsi_rsp_handlers[i].handler) + nrh = &ncsi_rsp_handlers[i]; + else + nrh = NULL; + + break; + } + } + + if (!nrh) { + netdev_err(nd->dev, "Received unrecognized packet (0x%x)\n", + hdr->type); + return -ENOENT; + } + + /* Associate with the request */ + spin_lock_irqsave(&ndp->lock, flags); + nr = &ndp->requests[hdr->id]; + if (!nr->used) { + spin_unlock_irqrestore(&ndp->lock, flags); + return -ENODEV; + } + + nr->rsp = skb; + if (!nr->enabled) { + spin_unlock_irqrestore(&ndp->lock, flags); + ret = -ENOENT; + goto out; + } + + /* Validate the packet */ + spin_unlock_irqrestore(&ndp->lock, flags); + payload = nrh->payload; + if (payload < 0) + payload = ntohs(hdr->length); + ret = ncsi_validate_rsp_pkt(nr, payload); + if (ret) + goto out; + + /* Process the packet */ + ret = nrh->handler(nr); +out: + ncsi_free_request(nr); + return ret; +} -- cgit v1.2.3 From e6f44ed6d04d3185dcd8e8e98af8742d87bdffcc Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 19 Jul 2016 11:54:19 +1000 Subject: net/ncsi: Package and channel management This manages NCSI packages and channels: * The available packages and channels are enumerated in the first time of calling ncsi_start_dev(). The channels' capabilities are probed in the meanwhile. The NCSI network topology won't change until the NCSI device is destroyed. * There in a queue in every NCSI device. The element in the queue, channel, is waiting for configuration (bringup) or suspending (teardown). The channel's state (inactive/active) indicates the futher action (configuration or suspending) will be applied on the channel. Another channel's state (invisible) means the requested action is being applied. * The hardware arbitration will be enabled if all available packages and channels support it. All available channels try to provide service when hardware arbitration is enabled. Otherwise, one channel is selected as the active one at once. * When channel is in active state, meaning it's providing service, a timer started to retrieve the channe's link status. If the channel's link status fails to be updated in the determined period, the channel is going to be reconfigured. It's the error handling implementation as defined in NCSI spec. Signed-off-by: Gavin Shan Acked-by: Joel Stanley Signed-off-by: David S. Miller --- net/ncsi/internal.h | 50 ++++ net/ncsi/ncsi-manage.c | 763 +++++++++++++++++++++++++++++++++++++++++++++++++ net/ncsi/ncsi-rsp.c | 15 + 3 files changed, 828 insertions(+) (limited to 'net') diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index bd000c9c8249..38fc95a26f8f 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -178,6 +178,7 @@ struct ncsi_channel { int state; #define NCSI_CHANNEL_INACTIVE 1 #define NCSI_CHANNEL_ACTIVE 2 +#define NCSI_CHANNEL_INVISIBLE 3 spinlock_t lock; /* Protect filters etc */ struct ncsi_package *package; struct ncsi_channel_version version; @@ -185,7 +186,11 @@ struct ncsi_channel { struct ncsi_channel_mode modes[NCSI_MODE_MAX]; struct ncsi_channel_filter *filters[NCSI_FILTER_MAX]; struct ncsi_channel_stats stats; + struct timer_list timer; /* Link monitor timer */ + bool enabled; /* Timer is enabled */ + unsigned int timeout; /* Times of timeout */ struct list_head node; + struct list_head link; }; struct ncsi_package { @@ -209,14 +214,56 @@ struct ncsi_request { bool enabled; /* Time has been enabled or not */ }; +enum { + ncsi_dev_state_major = 0xff00, + ncsi_dev_state_minor = 0x00ff, + ncsi_dev_state_probe_deselect = 0x0201, + ncsi_dev_state_probe_package, + ncsi_dev_state_probe_channel, + ncsi_dev_state_probe_cis, + ncsi_dev_state_probe_gvi, + ncsi_dev_state_probe_gc, + ncsi_dev_state_probe_gls, + ncsi_dev_state_probe_dp, + ncsi_dev_state_config_sp = 0x0301, + ncsi_dev_state_config_cis, + ncsi_dev_state_config_sma, + ncsi_dev_state_config_ebf, +#if IS_ENABLED(CONFIG_IPV6) + ncsi_dev_state_config_egmf, +#endif + ncsi_dev_state_config_ecnt, + ncsi_dev_state_config_ec, + ncsi_dev_state_config_ae, + ncsi_dev_state_config_gls, + ncsi_dev_state_config_done, + ncsi_dev_state_suspend_select = 0x0401, + ncsi_dev_state_suspend_dcnt, + ncsi_dev_state_suspend_dc, + ncsi_dev_state_suspend_deselect, + ncsi_dev_state_suspend_done +}; + struct ncsi_dev_priv { struct ncsi_dev ndev; /* Associated NCSI device */ unsigned int flags; /* NCSI device flags */ +#define NCSI_DEV_PROBED 1 /* Finalized NCSI topology */ +#define NCSI_DEV_HWA 2 /* Enabled HW arbitration */ +#define NCSI_DEV_RESHUFFLE 4 spinlock_t lock; /* Protect the NCSI device */ +#if IS_ENABLED(CONFIG_IPV6) + unsigned int inet6_addr_num; /* Number of IPv6 addresses */ +#endif unsigned int package_num; /* Number of packages */ struct list_head packages; /* List of packages */ struct ncsi_request requests[256]; /* Request table */ unsigned int request_id; /* Last used request ID */ + unsigned int pending_req_num; /* Number of pending requests */ + struct ncsi_package *active_package; /* Currently handled package */ + struct ncsi_channel *active_channel; /* Currently handled channel */ + struct list_head channel_queue; /* Config queue of channels */ + struct work_struct work; /* For channel management */ + struct packet_type ptype; /* NCSI packet Rx handler */ struct list_head node; /* Form NCSI device list */ }; @@ -251,6 +298,8 @@ extern spinlock_t ncsi_dev_lock; int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data); int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data); int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index); +void ncsi_start_channel_monitor(struct ncsi_channel *nc); +void ncsi_stop_channel_monitor(struct ncsi_channel *nc); struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np, unsigned char id); struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, @@ -267,6 +316,7 @@ void ncsi_find_package_and_channel(struct ncsi_dev_priv *ndp, struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven); void ncsi_free_request(struct ncsi_request *nr); struct ncsi_dev *ncsi_find_dev(struct net_device *dev); +int ncsi_process_next_channel(struct ncsi_dev_priv *ndp); /* Packet handlers */ u32 ncsi_calculate_checksum(unsigned char *data, int len); diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 0e28ed8f2703..d627a39ddcd0 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -17,8 +17,12 @@ #include #include #include +#include +#include +#include #include "internal.h" +#include "ncsi-pkt.h" LIST_HEAD(ncsi_dev_list); DEFINE_SPINLOCK(ncsi_dev_lock); @@ -123,6 +127,120 @@ int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index) return 0; } +static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down) +{ + struct ncsi_dev *nd = &ndp->ndev; + struct ncsi_package *np; + struct ncsi_channel *nc; + + nd->state = ncsi_dev_state_functional; + if (force_down) { + nd->link_up = 0; + goto report; + } + + nd->link_up = 0; + NCSI_FOR_EACH_PACKAGE(ndp, np) { + NCSI_FOR_EACH_CHANNEL(np, nc) { + if (!list_empty(&nc->link) || + nc->state != NCSI_CHANNEL_ACTIVE) + continue; + + if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) { + nd->link_up = 1; + goto report; + } + } + } + +report: + nd->handler(nd); +} + +static void ncsi_channel_monitor(unsigned long data) +{ + struct ncsi_channel *nc = (struct ncsi_channel *)data; + struct ncsi_package *np = nc->package; + struct ncsi_dev_priv *ndp = np->ndp; + struct ncsi_cmd_arg nca; + bool enabled; + unsigned int timeout; + unsigned long flags; + int ret; + + spin_lock_irqsave(&nc->lock, flags); + timeout = nc->timeout; + enabled = nc->enabled; + spin_unlock_irqrestore(&nc->lock, flags); + + if (!enabled || !list_empty(&nc->link)) + return; + if (nc->state != NCSI_CHANNEL_INACTIVE && + nc->state != NCSI_CHANNEL_ACTIVE) + return; + + if (!(timeout % 2)) { + nca.ndp = ndp; + nca.package = np->id; + nca.channel = nc->id; + nca.type = NCSI_PKT_CMD_GLS; + nca.driven = false; + ret = ncsi_xmit_cmd(&nca); + if (ret) { + netdev_err(ndp->ndev.dev, "Error %d sending GLS\n", + ret); + return; + } + } + + if (timeout + 1 >= 3) { + if (!(ndp->flags & NCSI_DEV_HWA) && + nc->state == NCSI_CHANNEL_ACTIVE) + ncsi_report_link(ndp, true); + + spin_lock_irqsave(&ndp->lock, flags); + xchg(&nc->state, NCSI_CHANNEL_INACTIVE); + list_add_tail_rcu(&nc->link, &ndp->channel_queue); + spin_unlock_irqrestore(&ndp->lock, flags); + ncsi_process_next_channel(ndp); + return; + } + + spin_lock_irqsave(&nc->lock, flags); + nc->timeout = timeout + 1; + nc->enabled = true; + spin_unlock_irqrestore(&nc->lock, flags); + mod_timer(&nc->timer, jiffies + HZ * (1 << (nc->timeout / 2))); +} + +void ncsi_start_channel_monitor(struct ncsi_channel *nc) +{ + unsigned long flags; + + spin_lock_irqsave(&nc->lock, flags); + WARN_ON_ONCE(nc->enabled); + nc->timeout = 0; + nc->enabled = true; + spin_unlock_irqrestore(&nc->lock, flags); + + mod_timer(&nc->timer, jiffies + HZ * (1 << (nc->timeout / 2))); +} + +void ncsi_stop_channel_monitor(struct ncsi_channel *nc) +{ + unsigned long flags; + + spin_lock_irqsave(&nc->lock, flags); + if (!nc->enabled) { + spin_unlock_irqrestore(&nc->lock, flags); + return; + } + nc->enabled = false; + spin_unlock_irqrestore(&nc->lock, flags); + + del_timer_sync(&nc->timer); +} + struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np, unsigned char id) { @@ -149,7 +267,10 @@ struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, unsigned char id) nc->id = id; nc->package = np; nc->state = NCSI_CHANNEL_INACTIVE; + nc->enabled = false; + setup_timer(&nc->timer, ncsi_channel_monitor, (unsigned long)nc); spin_lock_init(&nc->lock); + INIT_LIST_HEAD(&nc->link); for (index = 0; index < NCSI_CAP_MAX; index++) nc->caps[index].index = index; for (index = 0; index < NCSI_MODE_MAX; index++) @@ -190,6 +311,7 @@ static void ncsi_remove_channel(struct ncsi_channel *nc) nc->state = NCSI_CHANNEL_INACTIVE; spin_unlock_irqrestore(&nc->lock, flags); + ncsi_stop_channel_monitor(nc); /* Remove and free channel */ spin_lock_irqsave(&np->lock, flags); @@ -323,6 +445,7 @@ void ncsi_free_request(struct ncsi_request *nr) struct ncsi_dev_priv *ndp = nr->ndp; struct sk_buff *cmd, *rsp; unsigned long flags; + bool driven; if (nr->enabled) { nr->enabled = false; @@ -335,8 +458,12 @@ void ncsi_free_request(struct ncsi_request *nr) nr->cmd = NULL; nr->rsp = NULL; nr->used = false; + driven = nr->driven; spin_unlock_irqrestore(&ndp->lock, flags); + if (driven && cmd && --ndp->pending_req_num == 0) + schedule_work(&ndp->work); + /* Release command and response */ consume_skb(cmd); consume_skb(rsp); @@ -375,6 +502,587 @@ static void ncsi_request_timeout(unsigned long data) ncsi_free_request(nr); } +static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp) +{ + struct ncsi_dev *nd = &ndp->ndev; + struct ncsi_package *np = ndp->active_package; + struct ncsi_channel *nc = ndp->active_channel; + struct ncsi_cmd_arg nca; + int ret; + + nca.ndp = ndp; + nca.driven = true; + switch (nd->state) { + case ncsi_dev_state_suspend: + nd->state = ncsi_dev_state_suspend_select; + /* Fall through */ + case ncsi_dev_state_suspend_select: + case ncsi_dev_state_suspend_dcnt: + case ncsi_dev_state_suspend_dc: + case ncsi_dev_state_suspend_deselect: + ndp->pending_req_num = 1; + + np = ndp->active_package; + nc = ndp->active_channel; + nca.package = np->id; + if (nd->state == ncsi_dev_state_suspend_select) { + nca.type = NCSI_PKT_CMD_SP; + nca.channel = 0x1f; + if (ndp->flags & NCSI_DEV_HWA) + nca.bytes[0] = 0; + else + nca.bytes[0] = 1; + nd->state = ncsi_dev_state_suspend_dcnt; + } else if (nd->state == ncsi_dev_state_suspend_dcnt) { + nca.type = NCSI_PKT_CMD_DCNT; + nca.channel = nc->id; + nd->state = ncsi_dev_state_suspend_dc; + } else if (nd->state == ncsi_dev_state_suspend_dc) { + nca.type = NCSI_PKT_CMD_DC; + nca.channel = nc->id; + nca.bytes[0] = 1; + nd->state = ncsi_dev_state_suspend_deselect; + } else if (nd->state == ncsi_dev_state_suspend_deselect) { + nca.type = NCSI_PKT_CMD_DP; + nca.channel = 0x1f; + nd->state = ncsi_dev_state_suspend_done; + } + + ret = ncsi_xmit_cmd(&nca); + if (ret) { + nd->state = ncsi_dev_state_functional; + return; + } + + break; + case ncsi_dev_state_suspend_done: + xchg(&nc->state, NCSI_CHANNEL_INACTIVE); + ncsi_process_next_channel(ndp); + + break; + default: + netdev_warn(nd->dev, "Wrong NCSI state 0x%x in suspend\n", + nd->state); + } +} + +static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) +{ + struct ncsi_dev *nd = &ndp->ndev; + struct net_device *dev = nd->dev; + struct ncsi_package *np = ndp->active_package; + struct ncsi_channel *nc = ndp->active_channel; + struct ncsi_cmd_arg nca; + unsigned char index; + int ret; + + nca.ndp = ndp; + nca.driven = true; + switch (nd->state) { + case ncsi_dev_state_config: + case ncsi_dev_state_config_sp: + ndp->pending_req_num = 1; + + /* Select the specific package */ + nca.type = NCSI_PKT_CMD_SP; + if (ndp->flags & NCSI_DEV_HWA) + nca.bytes[0] = 0; + else + nca.bytes[0] = 1; + nca.package = np->id; + nca.channel = 0x1f; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + + nd->state = ncsi_dev_state_config_cis; + break; + case ncsi_dev_state_config_cis: + ndp->pending_req_num = 1; + + /* Clear initial state */ + nca.type = NCSI_PKT_CMD_CIS; + nca.package = np->id; + nca.channel = nc->id; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + + nd->state = ncsi_dev_state_config_sma; + break; + case ncsi_dev_state_config_sma: + case ncsi_dev_state_config_ebf: +#if IS_ENABLED(CONFIG_IPV6) + case ncsi_dev_state_config_egmf: +#endif + case ncsi_dev_state_config_ecnt: + case ncsi_dev_state_config_ec: + case ncsi_dev_state_config_ae: + case ncsi_dev_state_config_gls: + ndp->pending_req_num = 1; + + nca.package = np->id; + nca.channel = nc->id; + + /* Use first entry in unicast filter table. Note that + * the MAC filter table starts from entry 1 instead of + * 0. + */ + if (nd->state == ncsi_dev_state_config_sma) { + nca.type = NCSI_PKT_CMD_SMA; + for (index = 0; index < 6; index++) + nca.bytes[index] = dev->dev_addr[index]; + nca.bytes[6] = 0x1; + nca.bytes[7] = 0x1; + nd->state = ncsi_dev_state_config_ebf; + } else if (nd->state == ncsi_dev_state_config_ebf) { + nca.type = NCSI_PKT_CMD_EBF; + nca.dwords[0] = nc->caps[NCSI_CAP_BC].cap; + nd->state = ncsi_dev_state_config_ecnt; +#if IS_ENABLED(CONFIG_IPV6) + if (ndp->inet6_addr_num > 0 && + (nc->caps[NCSI_CAP_GENERIC].cap & + NCSI_CAP_GENERIC_MC)) + nd->state = ncsi_dev_state_config_egmf; + else + nd->state = ncsi_dev_state_config_ecnt; + } else if (nd->state == ncsi_dev_state_config_egmf) { + nca.type = NCSI_PKT_CMD_EGMF; + nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap; + nd->state = ncsi_dev_state_config_ecnt; +#endif /* CONFIG_IPV6 */ + } else if (nd->state == ncsi_dev_state_config_ecnt) { + nca.type = NCSI_PKT_CMD_ECNT; + nd->state = ncsi_dev_state_config_ec; + } else if (nd->state == ncsi_dev_state_config_ec) { + /* Enable AEN if it's supported */ + nca.type = NCSI_PKT_CMD_EC; + nd->state = ncsi_dev_state_config_ae; + if (!(nc->caps[NCSI_CAP_AEN].cap & NCSI_CAP_AEN_MASK)) + nd->state = ncsi_dev_state_config_gls; + } else if (nd->state == ncsi_dev_state_config_ae) { + nca.type = NCSI_PKT_CMD_AE; + nca.bytes[0] = 0; + nca.dwords[1] = nc->caps[NCSI_CAP_AEN].cap; + nd->state = ncsi_dev_state_config_gls; + } else if (nd->state == ncsi_dev_state_config_gls) { + nca.type = NCSI_PKT_CMD_GLS; + nd->state = ncsi_dev_state_config_done; + } + + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + break; + case ncsi_dev_state_config_done: + if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) + xchg(&nc->state, NCSI_CHANNEL_ACTIVE); + else + xchg(&nc->state, NCSI_CHANNEL_INACTIVE); + + ncsi_start_channel_monitor(nc); + ncsi_process_next_channel(ndp); + break; + default: + netdev_warn(dev, "Wrong NCSI state 0x%x in config\n", + nd->state); + } + + return; + +error: + ncsi_report_link(ndp, true); +} + +static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp) +{ + struct ncsi_package *np; + struct ncsi_channel *nc, *found; + struct ncsi_channel_mode *ncm; + unsigned long flags; + + /* The search is done once an inactive channel with up + * link is found. + */ + found = NULL; + NCSI_FOR_EACH_PACKAGE(ndp, np) { + NCSI_FOR_EACH_CHANNEL(np, nc) { + if (!list_empty(&nc->link) || + nc->state != NCSI_CHANNEL_INACTIVE) + continue; + + if (!found) + found = nc; + + ncm = &nc->modes[NCSI_MODE_LINK]; + if (ncm->data[2] & 0x1) { + found = nc; + goto out; + } + } + } + + if (!found) { + ncsi_report_link(ndp, true); + return -ENODEV; + } + +out: + spin_lock_irqsave(&ndp->lock, flags); + list_add_tail_rcu(&found->link, &ndp->channel_queue); + spin_unlock_irqrestore(&ndp->lock, flags); + + return ncsi_process_next_channel(ndp); +} + +static bool ncsi_check_hwa(struct ncsi_dev_priv *ndp) +{ + struct ncsi_package *np; + struct ncsi_channel *nc; + unsigned int cap; + + /* The hardware arbitration is disabled if any one channel + * doesn't support explicitly. + */ + NCSI_FOR_EACH_PACKAGE(ndp, np) { + NCSI_FOR_EACH_CHANNEL(np, nc) { + cap = nc->caps[NCSI_CAP_GENERIC].cap; + if (!(cap & NCSI_CAP_GENERIC_HWA) || + (cap & NCSI_CAP_GENERIC_HWA_MASK) != + NCSI_CAP_GENERIC_HWA_SUPPORT) { + ndp->flags &= ~NCSI_DEV_HWA; + return false; + } + } + } + + ndp->flags |= NCSI_DEV_HWA; + return true; +} + +static int ncsi_enable_hwa(struct ncsi_dev_priv *ndp) +{ + struct ncsi_package *np; + struct ncsi_channel *nc; + unsigned long flags; + + /* Move all available channels to processing queue */ + spin_lock_irqsave(&ndp->lock, flags); + NCSI_FOR_EACH_PACKAGE(ndp, np) { + NCSI_FOR_EACH_CHANNEL(np, nc) { + WARN_ON_ONCE(nc->state != NCSI_CHANNEL_INACTIVE || + !list_empty(&nc->link)); + ncsi_stop_channel_monitor(nc); + list_add_tail_rcu(&nc->link, &ndp->channel_queue); + } + } + spin_unlock_irqrestore(&ndp->lock, flags); + + /* We can have no channels in extremely case */ + if (list_empty(&ndp->channel_queue)) { + ncsi_report_link(ndp, false); + return -ENOENT; + } + + return ncsi_process_next_channel(ndp); +} + +static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) +{ + struct ncsi_dev *nd = &ndp->ndev; + struct ncsi_package *np; + struct ncsi_channel *nc; + struct ncsi_cmd_arg nca; + unsigned char index; + int ret; + + nca.ndp = ndp; + nca.driven = true; + switch (nd->state) { + case ncsi_dev_state_probe: + nd->state = ncsi_dev_state_probe_deselect; + /* Fall through */ + case ncsi_dev_state_probe_deselect: + ndp->pending_req_num = 8; + + /* Deselect all possible packages */ + nca.type = NCSI_PKT_CMD_DP; + nca.channel = 0x1f; + for (index = 0; index < 8; index++) { + nca.package = index; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + } + + nd->state = ncsi_dev_state_probe_package; + break; + case ncsi_dev_state_probe_package: + ndp->pending_req_num = 16; + + /* Select all possible packages */ + nca.type = NCSI_PKT_CMD_SP; + nca.bytes[0] = 1; + nca.channel = 0x1f; + for (index = 0; index < 8; index++) { + nca.package = index; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + } + + /* Disable all possible packages */ + nca.type = NCSI_PKT_CMD_DP; + for (index = 0; index < 8; index++) { + nca.package = index; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + } + + nd->state = ncsi_dev_state_probe_channel; + break; + case ncsi_dev_state_probe_channel: + if (!ndp->active_package) + ndp->active_package = list_first_or_null_rcu( + &ndp->packages, struct ncsi_package, node); + else if (list_is_last(&ndp->active_package->node, + &ndp->packages)) + ndp->active_package = NULL; + else + ndp->active_package = list_next_entry( + ndp->active_package, node); + + /* All available packages and channels are enumerated. The + * enumeration happens for once when the NCSI interface is + * started. So we need continue to start the interface after + * the enumeration. + * + * We have to choose an active channel before configuring it. + * Note that we possibly don't have active channel in extreme + * situation. + */ + if (!ndp->active_package) { + ndp->flags |= NCSI_DEV_PROBED; + if (ncsi_check_hwa(ndp)) + ncsi_enable_hwa(ndp); + else + ncsi_choose_active_channel(ndp); + return; + } + + /* Select the active package */ + ndp->pending_req_num = 1; + nca.type = NCSI_PKT_CMD_SP; + nca.bytes[0] = 1; + nca.package = ndp->active_package->id; + nca.channel = 0x1f; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + + nd->state = ncsi_dev_state_probe_cis; + break; + case ncsi_dev_state_probe_cis: + ndp->pending_req_num = 32; + + /* Clear initial state */ + nca.type = NCSI_PKT_CMD_CIS; + nca.package = ndp->active_package->id; + for (index = 0; index < 0x20; index++) { + nca.channel = index; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + } + + nd->state = ncsi_dev_state_probe_gvi; + break; + case ncsi_dev_state_probe_gvi: + case ncsi_dev_state_probe_gc: + case ncsi_dev_state_probe_gls: + np = ndp->active_package; + ndp->pending_req_num = np->channel_num; + + /* Retrieve version, capability or link status */ + if (nd->state == ncsi_dev_state_probe_gvi) + nca.type = NCSI_PKT_CMD_GVI; + else if (nd->state == ncsi_dev_state_probe_gc) + nca.type = NCSI_PKT_CMD_GC; + else + nca.type = NCSI_PKT_CMD_GLS; + + nca.package = np->id; + NCSI_FOR_EACH_CHANNEL(np, nc) { + nca.channel = nc->id; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + } + + if (nd->state == ncsi_dev_state_probe_gvi) + nd->state = ncsi_dev_state_probe_gc; + else if (nd->state == ncsi_dev_state_probe_gc) + nd->state = ncsi_dev_state_probe_gls; + else + nd->state = ncsi_dev_state_probe_dp; + break; + case ncsi_dev_state_probe_dp: + ndp->pending_req_num = 1; + + /* Deselect the active package */ + nca.type = NCSI_PKT_CMD_DP; + nca.package = ndp->active_package->id; + nca.channel = 0x1f; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + + /* Scan channels in next package */ + nd->state = ncsi_dev_state_probe_channel; + break; + default: + netdev_warn(nd->dev, "Wrong NCSI state 0x%0x in enumeration\n", + nd->state); + } + + return; +error: + ncsi_report_link(ndp, true); +} + +static void ncsi_dev_work(struct work_struct *work) +{ + struct ncsi_dev_priv *ndp = container_of(work, + struct ncsi_dev_priv, work); + struct ncsi_dev *nd = &ndp->ndev; + + switch (nd->state & ncsi_dev_state_major) { + case ncsi_dev_state_probe: + ncsi_probe_channel(ndp); + break; + case ncsi_dev_state_suspend: + ncsi_suspend_channel(ndp); + break; + case ncsi_dev_state_config: + ncsi_configure_channel(ndp); + break; + default: + netdev_warn(nd->dev, "Wrong NCSI state 0x%x in workqueue\n", + nd->state); + } +} + +int ncsi_process_next_channel(struct ncsi_dev_priv *ndp) +{ + struct ncsi_channel *nc; + int old_state; + unsigned long flags; + + spin_lock_irqsave(&ndp->lock, flags); + nc = list_first_or_null_rcu(&ndp->channel_queue, + struct ncsi_channel, link); + if (nc) { + old_state = xchg(&nc->state, NCSI_CHANNEL_INVISIBLE); + list_del_init(&nc->link); + } + spin_unlock_irqrestore(&ndp->lock, flags); + + ndp->active_channel = nc; + ndp->active_package = nc ? nc->package : NULL; + if (!nc) { + if (ndp->flags & NCSI_DEV_RESHUFFLE) { + ndp->flags &= ~NCSI_DEV_RESHUFFLE; + return ncsi_choose_active_channel(ndp); + } + + ncsi_report_link(ndp, false); + return -ENODEV; + } + + switch (old_state) { + case NCSI_CHANNEL_INACTIVE: + ndp->ndev.state = ncsi_dev_state_config; + ncsi_configure_channel(ndp); + break; + case NCSI_CHANNEL_ACTIVE: + ndp->ndev.state = ncsi_dev_state_suspend; + ncsi_suspend_channel(ndp); + break; + default: + netdev_err(ndp->ndev.dev, "Invalid state 0x%x on %d:%d\n", + nc->state, nc->package->id, nc->id); + ncsi_report_link(ndp, false); + return -EINVAL; + } + + return 0; +} + +#if IS_ENABLED(CONFIG_IPV6) +static int ncsi_inet6addr_event(struct notifier_block *this, + unsigned long event, void *data) +{ + struct inet6_ifaddr *ifa = data; + struct net_device *dev = ifa->idev->dev; + struct ncsi_dev *nd = ncsi_find_dev(dev); + struct ncsi_dev_priv *ndp = nd ? TO_NCSI_DEV_PRIV(nd) : NULL; + struct ncsi_package *np; + struct ncsi_channel *nc; + struct ncsi_cmd_arg nca; + bool action; + int ret; + + if (!ndp || (ipv6_addr_type(&ifa->addr) & + (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK))) + return NOTIFY_OK; + + switch (event) { + case NETDEV_UP: + action = (++ndp->inet6_addr_num) == 1; + nca.type = NCSI_PKT_CMD_EGMF; + break; + case NETDEV_DOWN: + action = (--ndp->inet6_addr_num == 0); + nca.type = NCSI_PKT_CMD_DGMF; + break; + default: + return NOTIFY_OK; + } + + /* We might not have active channel or packages. The IPv6 + * required multicast will be enabled when active channel + * or packages are chosen. + */ + np = ndp->active_package; + nc = ndp->active_channel; + if (!action || !np || !nc) + return NOTIFY_OK; + + /* We needn't enable or disable it if the function isn't supported */ + if (!(nc->caps[NCSI_CAP_GENERIC].cap & NCSI_CAP_GENERIC_MC)) + return NOTIFY_OK; + + nca.ndp = ndp; + nca.driven = false; + nca.package = np->id; + nca.channel = nc->id; + nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap; + ret = ncsi_xmit_cmd(&nca); + if (ret) { + netdev_warn(dev, "Fail to %s global multicast filter (%d)\n", + (event == NETDEV_UP) ? "enable" : "disable", ret); + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} + +static struct notifier_block ncsi_inet6addr_notifier = { + .notifier_call = ncsi_inet6addr_event, +}; +#endif /* CONFIG_IPV6 */ + struct ncsi_dev *ncsi_register_dev(struct net_device *dev, void (*handler)(struct ncsi_dev *ndev)) { @@ -397,6 +1105,9 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, nd->state = ncsi_dev_state_registered; nd->dev = dev; nd->handler = handler; + ndp->pending_req_num = 0; + INIT_LIST_HEAD(&ndp->channel_queue); + INIT_WORK(&ndp->work, ncsi_dev_work); /* Initialize private NCSI device */ spin_lock_init(&ndp->lock); @@ -411,24 +1122,76 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, } spin_lock_irqsave(&ncsi_dev_lock, flags); +#if IS_ENABLED(CONFIG_IPV6) + ndp->inet6_addr_num = 0; + if (list_empty(&ncsi_dev_list)) + register_inet6addr_notifier(&ncsi_inet6addr_notifier); +#endif list_add_tail_rcu(&ndp->node, &ncsi_dev_list); spin_unlock_irqrestore(&ncsi_dev_lock, flags); + /* Register NCSI packet Rx handler */ + ndp->ptype.type = cpu_to_be16(ETH_P_NCSI); + ndp->ptype.func = ncsi_rcv_rsp; + ndp->ptype.dev = dev; + dev_add_pack(&ndp->ptype); + return nd; } EXPORT_SYMBOL_GPL(ncsi_register_dev); +int ncsi_start_dev(struct ncsi_dev *nd) +{ + struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); + struct ncsi_package *np; + struct ncsi_channel *nc; + int old_state, ret; + + if (nd->state != ncsi_dev_state_registered && + nd->state != ncsi_dev_state_functional) + return -ENOTTY; + + if (!(ndp->flags & NCSI_DEV_PROBED)) { + nd->state = ncsi_dev_state_probe; + schedule_work(&ndp->work); + return 0; + } + + /* Reset channel's state and start over */ + NCSI_FOR_EACH_PACKAGE(ndp, np) { + NCSI_FOR_EACH_CHANNEL(np, nc) { + old_state = xchg(&nc->state, NCSI_CHANNEL_INACTIVE); + WARN_ON_ONCE(!list_empty(&nc->link) || + old_state == NCSI_CHANNEL_INVISIBLE); + } + } + + if (ndp->flags & NCSI_DEV_HWA) + ret = ncsi_enable_hwa(ndp); + else + ret = ncsi_choose_active_channel(ndp); + + return ret; +} +EXPORT_SYMBOL_GPL(ncsi_start_dev); + void ncsi_unregister_dev(struct ncsi_dev *nd) { struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); struct ncsi_package *np, *tmp; unsigned long flags; + dev_remove_pack(&ndp->ptype); + list_for_each_entry_safe(np, tmp, &ndp->packages, node) ncsi_remove_package(np); spin_lock_irqsave(&ncsi_dev_lock, flags); list_del_rcu(&ndp->node); +#if IS_ENABLED(CONFIG_IPV6) + if (list_empty(&ncsi_dev_list)) + unregister_inet6addr_notifier(&ncsi_inet6addr_notifier); +#endif spin_unlock_irqrestore(&ncsi_dev_lock, flags); kfree(ndp); diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 6ec25cb2608c..a21af88330aa 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -69,6 +69,9 @@ static int ncsi_rsp_handler_cis(struct ncsi_request *nr) rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, &np, &nc); if (!nc) { + if (ndp->flags & NCSI_DEV_PROBED) + return -ENXIO; + id = NCSI_CHANNEL_INDEX(rsp->rsp.common.channel); nc = ncsi_add_channel(np, id); } @@ -90,6 +93,9 @@ static int ncsi_rsp_handler_sp(struct ncsi_request *nr) ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, &np, NULL); if (!np) { + if (ndp->flags & NCSI_DEV_PROBED) + return -ENXIO; + id = NCSI_PACKAGE_INDEX(rsp->rsp.common.channel); np = ncsi_add_package(ndp, id); if (!np) @@ -297,6 +303,7 @@ static int ncsi_rsp_handler_gls(struct ncsi_request *nr) struct ncsi_dev_priv *ndp = nr->ndp; struct ncsi_channel *nc; struct ncsi_channel_mode *ncm; + unsigned long flags; /* Find the package and channel */ rsp = (struct ncsi_rsp_gls_pkt *)skb_network_header(nr->rsp); @@ -310,6 +317,14 @@ static int ncsi_rsp_handler_gls(struct ncsi_request *nr) ncm->data[3] = ntohl(rsp->other); ncm->data[4] = ntohl(rsp->oem_status); + if (nr->driven) + return 0; + + /* Reset the channel monitor if it has been enabled */ + spin_lock_irqsave(&nc->lock, flags); + nc->timeout = 0; + spin_unlock_irqrestore(&nc->lock, flags); + return 0; } -- cgit v1.2.3 From 7a82ecf4cfb854955198945340ae13558b64e1af Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 19 Jul 2016 11:54:20 +1000 Subject: net/ncsi: NCSI AEN packet handler This introduces NCSI AEN packet handlers that result in (A) the currently active channel is reconfigured; (B) Currently active channel is deconfigured and disabled, another channel is chosen as active one and configured. Case (B) won't happen if hardware arbitration has been enabled, the channel that was in active state is suspended simply. Signed-off-by: Gavin Shan Acked-by: Joel Stanley Signed-off-by: David S. Miller --- net/ncsi/Makefile | 2 +- net/ncsi/internal.h | 1 + net/ncsi/ncsi-aen.c | 193 ++++++++++++++++++++++++++++++++++++++++++++++++++++ net/ncsi/ncsi-pkt.h | 36 ++++++++++ net/ncsi/ncsi-rsp.c | 6 +- 5 files changed, 236 insertions(+), 2 deletions(-) create mode 100644 net/ncsi/ncsi-aen.c (limited to 'net') diff --git a/net/ncsi/Makefile b/net/ncsi/Makefile index 4751819e855b..dd12b564f2e7 100644 --- a/net/ncsi/Makefile +++ b/net/ncsi/Makefile @@ -1,4 +1,4 @@ # # Makefile for NCSI API # -obj-$(CONFIG_NET_NCSI) += ncsi-cmd.o ncsi-rsp.o ncsi-manage.o +obj-$(CONFIG_NET_NCSI) += ncsi-cmd.o ncsi-rsp.o ncsi-aen.o ncsi-manage.o diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 38fc95a26f8f..33738c060547 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -323,5 +323,6 @@ u32 ncsi_calculate_checksum(unsigned char *data, int len); int ncsi_xmit_cmd(struct ncsi_cmd_arg *nca); int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); +int ncsi_aen_handler(struct ncsi_dev_priv *ndp, struct sk_buff *skb); #endif /* __NCSI_INTERNAL_H__ */ diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c new file mode 100644 index 000000000000..d463468442ae --- /dev/null +++ b/net/ncsi/ncsi-aen.c @@ -0,0 +1,193 @@ +/* + * Copyright Gavin Shan, IBM Corporation 2016. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "internal.h" +#include "ncsi-pkt.h" + +static int ncsi_validate_aen_pkt(struct ncsi_aen_pkt_hdr *h, + const unsigned short payload) +{ + u32 checksum; + __be32 *pchecksum; + + if (h->common.revision != NCSI_PKT_REVISION) + return -EINVAL; + if (ntohs(h->common.length) != payload) + return -EINVAL; + + /* Validate checksum, which might be zeroes if the + * sender doesn't support checksum according to NCSI + * specification. + */ + pchecksum = (__be32 *)((void *)(h + 1) + payload - 4); + if (ntohl(*pchecksum) == 0) + return 0; + + checksum = ncsi_calculate_checksum((unsigned char *)h, + sizeof(*h) + payload - 4); + if (*pchecksum != htonl(checksum)) + return -EINVAL; + + return 0; +} + +static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp, + struct ncsi_aen_pkt_hdr *h) +{ + struct ncsi_aen_lsc_pkt *lsc; + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + unsigned long old_data; + unsigned long flags; + + /* Find the NCSI channel */ + ncsi_find_package_and_channel(ndp, h->common.channel, NULL, &nc); + if (!nc) + return -ENODEV; + + /* Update the link status */ + ncm = &nc->modes[NCSI_MODE_LINK]; + lsc = (struct ncsi_aen_lsc_pkt *)h; + old_data = ncm->data[2]; + ncm->data[2] = ntohl(lsc->status); + ncm->data[4] = ntohl(lsc->oem_status); + if (!((old_data ^ ncm->data[2]) & 0x1) || + !list_empty(&nc->link)) + return 0; + if (!(nc->state == NCSI_CHANNEL_INACTIVE && (ncm->data[2] & 0x1)) && + !(nc->state == NCSI_CHANNEL_ACTIVE && !(ncm->data[2] & 0x1))) + return 0; + + if (!(ndp->flags & NCSI_DEV_HWA) && + nc->state == NCSI_CHANNEL_ACTIVE) + ndp->flags |= NCSI_DEV_RESHUFFLE; + + ncsi_stop_channel_monitor(nc); + spin_lock_irqsave(&ndp->lock, flags); + list_add_tail_rcu(&nc->link, &ndp->channel_queue); + spin_unlock_irqrestore(&ndp->lock, flags); + + return ncsi_process_next_channel(ndp); +} + +static int ncsi_aen_handler_cr(struct ncsi_dev_priv *ndp, + struct ncsi_aen_pkt_hdr *h) +{ + struct ncsi_channel *nc; + unsigned long flags; + + /* Find the NCSI channel */ + ncsi_find_package_and_channel(ndp, h->common.channel, NULL, &nc); + if (!nc) + return -ENODEV; + + if (!list_empty(&nc->link) || + nc->state != NCSI_CHANNEL_ACTIVE) + return 0; + + ncsi_stop_channel_monitor(nc); + spin_lock_irqsave(&ndp->lock, flags); + xchg(&nc->state, NCSI_CHANNEL_INACTIVE); + list_add_tail_rcu(&nc->link, &ndp->channel_queue); + spin_unlock_irqrestore(&ndp->lock, flags); + + return ncsi_process_next_channel(ndp); +} + +static int ncsi_aen_handler_hncdsc(struct ncsi_dev_priv *ndp, + struct ncsi_aen_pkt_hdr *h) +{ + struct ncsi_channel *nc; + struct ncsi_channel_mode *ncm; + struct ncsi_aen_hncdsc_pkt *hncdsc; + unsigned long flags; + + /* Find the NCSI channel */ + ncsi_find_package_and_channel(ndp, h->common.channel, NULL, &nc); + if (!nc) + return -ENODEV; + + /* If the channel is active one, we need reconfigure it */ + ncm = &nc->modes[NCSI_MODE_LINK]; + hncdsc = (struct ncsi_aen_hncdsc_pkt *)h; + ncm->data[3] = ntohl(hncdsc->status); + if (!list_empty(&nc->link) || + nc->state != NCSI_CHANNEL_ACTIVE || + (ncm->data[3] & 0x1)) + return 0; + + if (ndp->flags & NCSI_DEV_HWA) + ndp->flags |= NCSI_DEV_RESHUFFLE; + + /* If this channel is the active one and the link doesn't + * work, we have to choose another channel to be active one. + * The logic here is exactly similar to what we do when link + * is down on the active channel. + */ + ncsi_stop_channel_monitor(nc); + spin_lock_irqsave(&ndp->lock, flags); + list_add_tail_rcu(&nc->link, &ndp->channel_queue); + spin_unlock_irqrestore(&ndp->lock, flags); + + ncsi_process_next_channel(ndp); + + return 0; +} + +static struct ncsi_aen_handler { + unsigned char type; + int payload; + int (*handler)(struct ncsi_dev_priv *ndp, + struct ncsi_aen_pkt_hdr *h); +} ncsi_aen_handlers[] = { + { NCSI_PKT_AEN_LSC, 12, ncsi_aen_handler_lsc }, + { NCSI_PKT_AEN_CR, 4, ncsi_aen_handler_cr }, + { NCSI_PKT_AEN_HNCDSC, 4, ncsi_aen_handler_hncdsc } +}; + +int ncsi_aen_handler(struct ncsi_dev_priv *ndp, struct sk_buff *skb) +{ + struct ncsi_aen_pkt_hdr *h; + struct ncsi_aen_handler *nah = NULL; + int i, ret; + + /* Find the handler */ + h = (struct ncsi_aen_pkt_hdr *)skb_network_header(skb); + for (i = 0; i < ARRAY_SIZE(ncsi_aen_handlers); i++) { + if (ncsi_aen_handlers[i].type == h->type) { + nah = &ncsi_aen_handlers[i]; + break; + } + } + + if (!nah) { + netdev_warn(ndp->ndev.dev, "Invalid AEN (0x%x) received\n", + h->type); + return -ENOENT; + } + + ret = ncsi_validate_aen_pkt(h, nah->payload); + if (ret) + goto out; + + ret = nah->handler(ndp, h); +out: + consume_skb(skb); + return ret; +} diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h index 4bdefd988354..3ea49ed0a935 100644 --- a/net/ncsi/ncsi-pkt.h +++ b/net/ncsi/ncsi-pkt.h @@ -31,6 +31,12 @@ struct ncsi_rsp_pkt_hdr { __be16 reason; /* Response reason */ }; +struct ncsi_aen_pkt_hdr { + struct ncsi_pkt_hdr common; /* Common NCSI packet header */ + unsigned char reserved2[3]; /* Reserved */ + unsigned char type; /* AEN packet type */ +}; + /* NCSI common command packet */ struct ncsi_cmd_pkt { struct ncsi_cmd_pkt_hdr cmd; /* Command header */ @@ -296,6 +302,30 @@ struct ncsi_rsp_gpuuid_pkt { __be32 checksum; }; +/* AEN: Link State Change */ +struct ncsi_aen_lsc_pkt { + struct ncsi_aen_pkt_hdr aen; /* AEN header */ + __be32 status; /* Link status */ + __be32 oem_status; /* OEM link status */ + __be32 checksum; /* Checksum */ + unsigned char pad[14]; +}; + +/* AEN: Configuration Required */ +struct ncsi_aen_cr_pkt { + struct ncsi_aen_pkt_hdr aen; /* AEN header */ + __be32 checksum; /* Checksum */ + unsigned char pad[22]; +}; + +/* AEN: Host Network Controller Driver Status Change */ +struct ncsi_aen_hncdsc_pkt { + struct ncsi_aen_pkt_hdr aen; /* AEN header */ + __be32 status; /* Status */ + __be32 checksum; /* Checksum */ + unsigned char pad[18]; +}; + /* NCSI packet revision */ #define NCSI_PKT_REVISION 0x01 @@ -376,4 +406,10 @@ struct ncsi_rsp_gpuuid_pkt { #define NCSI_PKT_RSP_R_LENGTH 0x0005 /* Invalid payload length */ #define NCSI_PKT_RSP_R_UNKNOWN 0x7fff /* Command type unsupported */ +/* NCSI AEN packet type */ +#define NCSI_PKT_AEN 0xFF /* AEN Packet */ +#define NCSI_PKT_AEN_LSC 0x00 /* Link status change */ +#define NCSI_PKT_AEN_CR 0x01 /* Configuration required */ +#define NCSI_PKT_AEN_HNCDSC 0x02 /* HNC driver status change */ + #endif /* __NCSI_PKT_H__ */ diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index a21af88330aa..af84389a6bf1 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -980,8 +980,12 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev, if (!ndp) return -ENODEV; - /* Find the handler */ + /* Check if it is AEN packet */ hdr = (struct ncsi_pkt_hdr *)skb_network_header(skb); + if (hdr->type == NCSI_PKT_AEN) + return ncsi_aen_handler(ndp, skb); + + /* Find the handler */ for (i = 0; i < ARRAY_SIZE(ncsi_rsp_handlers); i++) { if (ncsi_rsp_handlers[i].type == hdr->type) { if (ncsi_rsp_handlers[i].handler) -- cgit v1.2.3 From 6a773a15a1e8874e5eccd2f29190c31085912c95 Mon Sep 17 00:00:00 2001 From: Brenden Blanco Date: Tue, 19 Jul 2016 12:16:47 -0700 Subject: bpf: add XDP prog type for early driver filter Add a new bpf prog type that is intended to run in early stages of the packet rx path. Only minimal packet metadata will be available, hence a new context type, struct xdp_md, is exposed to userspace. So far only expose the packet start and end pointers, and only in read mode. An XDP program must return one of the well known enum values, all other return codes are reserved for future use. Unfortunately, this restriction is hard to enforce at verification time, so take the approach of warning at runtime when such programs are encountered. Out of bounds return codes should alias to XDP_ABORTED. Signed-off-by: Brenden Blanco Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 22e3992c8b48..6c627bc4be6e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2410,6 +2410,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) } } +static const struct bpf_func_proto * +xdp_func_proto(enum bpf_func_id func_id) +{ + return sk_filter_func_proto(func_id); +} + static bool __is_valid_access(int off, int size, enum bpf_access_type type) { if (off < 0 || off >= sizeof(struct __sk_buff)) @@ -2477,6 +2483,44 @@ static bool tc_cls_act_is_valid_access(int off, int size, return __is_valid_access(off, size, type); } +static bool __is_valid_xdp_access(int off, int size, + enum bpf_access_type type) +{ + if (off < 0 || off >= sizeof(struct xdp_md)) + return false; + if (off % size != 0) + return false; + if (size != 4) + return false; + + return true; +} + +static bool xdp_is_valid_access(int off, int size, + enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + if (type == BPF_WRITE) + return false; + + switch (off) { + case offsetof(struct xdp_md, data): + *reg_type = PTR_TO_PACKET; + break; + case offsetof(struct xdp_md, data_end): + *reg_type = PTR_TO_PACKET_END; + break; + } + + return __is_valid_xdp_access(off, size, type); +} + +void bpf_warn_invalid_xdp_action(u32 act) +{ + WARN_ONCE(1, "Illegal XDP return value %u, expect packet loss\n", act); +} +EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); + static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, struct bpf_insn *insn_buf, @@ -2628,6 +2672,29 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, return insn - insn_buf; } +static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct xdp_md, data): + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data)), + dst_reg, src_reg, + offsetof(struct xdp_buff, data)); + break; + case offsetof(struct xdp_md, data_end): + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data_end)), + dst_reg, src_reg, + offsetof(struct xdp_buff, data_end)); + break; + } + + return insn - insn_buf; +} + static const struct bpf_verifier_ops sk_filter_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, @@ -2640,6 +2707,12 @@ static const struct bpf_verifier_ops tc_cls_act_ops = { .convert_ctx_access = bpf_net_convert_ctx_access, }; +static const struct bpf_verifier_ops xdp_ops = { + .get_func_proto = xdp_func_proto, + .is_valid_access = xdp_is_valid_access, + .convert_ctx_access = xdp_convert_ctx_access, +}; + static struct bpf_prog_type_list sk_filter_type __read_mostly = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, @@ -2655,11 +2728,17 @@ static struct bpf_prog_type_list sched_act_type __read_mostly = { .type = BPF_PROG_TYPE_SCHED_ACT, }; +static struct bpf_prog_type_list xdp_type __read_mostly = { + .ops = &xdp_ops, + .type = BPF_PROG_TYPE_XDP, +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); bpf_register_prog_type(&sched_cls_type); bpf_register_prog_type(&sched_act_type); + bpf_register_prog_type(&xdp_type); return 0; } -- cgit v1.2.3 From a7862b45849fe2f8610a2bec89235580f55d337f Mon Sep 17 00:00:00 2001 From: Brenden Blanco Date: Tue, 19 Jul 2016 12:16:48 -0700 Subject: net: add ndo to setup/query xdp prog in adapter rx Add one new netdev op for drivers implementing the BPF_PROG_TYPE_XDP filter. The single op is used for both setup/query of the xdp program, modelled after ndo_setup_tc. Signed-off-by: Brenden Blanco Signed-off-by: David S. Miller --- net/core/dev.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 7894e406c806..2a9c39f8824e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -94,6 +94,7 @@ #include #include #include +#include #include #include #include @@ -6614,6 +6615,38 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); +/** + * dev_change_xdp_fd - set or clear a bpf program for a device rx path + * @dev: device + * @fd: new program fd or negative value to clear + * + * Set or clear a bpf program for a device + */ +int dev_change_xdp_fd(struct net_device *dev, int fd) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct bpf_prog *prog = NULL; + struct netdev_xdp xdp = {}; + int err; + + if (!ops->ndo_xdp) + return -EOPNOTSUPP; + if (fd >= 0) { + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); + if (IS_ERR(prog)) + return PTR_ERR(prog); + } + + xdp.command = XDP_SETUP_PROG; + xdp.prog = prog; + err = ops->ndo_xdp(dev, &xdp); + if (err < 0 && prog) + bpf_prog_put(prog); + + return err; +} +EXPORT_SYMBOL(dev_change_xdp_fd); + /** * dev_new_index - allocate an ifindex * @net: the applicable net namespace -- cgit v1.2.3 From d1fdd9138682e0f272beee0cb08b6328c5478b26 Mon Sep 17 00:00:00 2001 From: Brenden Blanco Date: Tue, 19 Jul 2016 12:16:49 -0700 Subject: rtnl: add option for setting link xdp prog Sets the bpf program represented by fd as an early filter in the rx path of the netdev. The fd must have been created as BPF_PROG_TYPE_XDP. Providing a negative value as fd clears the program. Getting the fd back via rtnl is not possible, therefore reading of this value merely provides a bool whether the program is valid on the link or not. Signed-off-by: Brenden Blanco Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a9e3805af739..eba2b8260dbd 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -891,6 +891,16 @@ static size_t rtnl_port_size(const struct net_device *dev, return port_self_size; } +static size_t rtnl_xdp_size(const struct net_device *dev) +{ + size_t xdp_size = nla_total_size(1); /* XDP_ATTACHED */ + + if (!dev->netdev_ops->ndo_xdp) + return 0; + else + return xdp_size; +} + static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { @@ -927,6 +937,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + + rtnl_xdp_size(dev) /* IFLA_XDP */ + nla_total_size(1); /* IFLA_PROTO_DOWN */ } @@ -1211,6 +1222,33 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) return 0; } +static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) +{ + struct netdev_xdp xdp_op = {}; + struct nlattr *xdp; + int err; + + if (!dev->netdev_ops->ndo_xdp) + return 0; + xdp = nla_nest_start(skb, IFLA_XDP); + if (!xdp) + return -EMSGSIZE; + xdp_op.command = XDP_QUERY_PROG; + err = dev->netdev_ops->ndo_xdp(dev, &xdp_op); + if (err) + goto err_cancel; + err = nla_put_u8(skb, IFLA_XDP_ATTACHED, xdp_op.prog_attached); + if (err) + goto err_cancel; + + nla_nest_end(skb, xdp); + return 0; + +err_cancel: + nla_nest_cancel(skb, xdp); + return err; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask) @@ -1307,6 +1345,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (rtnl_port_fill(skb, dev, ext_filter_mask)) goto nla_put_failure; + if (rtnl_xdp_fill(skb, dev)) + goto nla_put_failure; + if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) { if (rtnl_link_fill(skb, dev) < 0) goto nla_put_failure; @@ -1392,6 +1433,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_LINK_NETNSID] = { .type = NLA_S32 }, [IFLA_PROTO_DOWN] = { .type = NLA_U8 }, + [IFLA_XDP] = { .type = NLA_NESTED }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -1429,6 +1471,11 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, }; +static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { + [IFLA_XDP_FD] = { .type = NLA_S32 }, + [IFLA_XDP_ATTACHED] = { .type = NLA_U8 }, +}; + static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) { const struct rtnl_link_ops *ops = NULL; @@ -2054,6 +2101,23 @@ static int do_setlink(const struct sk_buff *skb, status |= DO_SETLINK_NOTIFY; } + if (tb[IFLA_XDP]) { + struct nlattr *xdp[IFLA_XDP_MAX + 1]; + + err = nla_parse_nested(xdp, IFLA_XDP_MAX, tb[IFLA_XDP], + ifla_xdp_policy); + if (err < 0) + goto errout; + + if (xdp[IFLA_XDP_FD]) { + err = dev_change_xdp_fd(dev, + nla_get_s32(xdp[IFLA_XDP_FD])); + if (err) + goto errout; + status |= DO_SETLINK_NOTIFY; + } + } + errout: if (status & DO_SETLINK_MODIFIED) { if (status & DO_SETLINK_NOTIFY) -- cgit v1.2.3 From 82de0be6862cdca2e6802267bda57cfc8844d3a7 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Mon, 18 Jul 2016 11:39:23 +0800 Subject: netfilter: Add helper array register/unregister functions Add nf_ct_helper_init(), nf_conntrack_helpers_register() and nf_conntrack_helpers_unregister() functions to avoid repetitive opencoded initialization in helpers. This patch keeps an id parameter for nf_ct_helper_init() not to break helper matching by name that has been inconsistently exposed to userspace through ports, eg. ftp-2121, and through an incremental id, eg. tftp-1. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_ftp.c | 58 +++++++++------------------- net/netfilter/nf_conntrack_helper.c | 57 ++++++++++++++++++++++++++++ net/netfilter/nf_conntrack_irc.c | 36 ++++++------------ net/netfilter/nf_conntrack_sane.c | 57 ++++++++++------------------ net/netfilter/nf_conntrack_sip.c | 75 ++++++++++++++----------------------- net/netfilter/nf_conntrack_tftp.c | 48 ++++++++---------------- 6 files changed, 150 insertions(+), 181 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index 19efeba02abb..43147005bea3 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -572,7 +572,7 @@ static int nf_ct_ftp_from_nlattr(struct nlattr *attr, struct nf_conn *ct) return 0; } -static struct nf_conntrack_helper ftp[MAX_PORTS][2] __read_mostly; +static struct nf_conntrack_helper ftp[MAX_PORTS * 2] __read_mostly; static const struct nf_conntrack_expect_policy ftp_exp_policy = { .max_expected = 1, @@ -582,24 +582,13 @@ static const struct nf_conntrack_expect_policy ftp_exp_policy = { /* don't make this __exit, since it's called from __init ! */ static void nf_conntrack_ftp_fini(void) { - int i, j; - for (i = 0; i < ports_c; i++) { - for (j = 0; j < 2; j++) { - if (ftp[i][j].me == NULL) - continue; - - pr_debug("unregistering helper for pf: %d port: %d\n", - ftp[i][j].tuple.src.l3num, ports[i]); - nf_conntrack_helper_unregister(&ftp[i][j]); - } - } - + nf_conntrack_helpers_unregister(ftp, ports_c * 2); kfree(ftp_buffer); } static int __init nf_conntrack_ftp_init(void) { - int i, j = -1, ret = 0; + int i, ret = 0; ftp_buffer = kmalloc(65536, GFP_KERNEL); if (!ftp_buffer) @@ -611,32 +600,21 @@ static int __init nf_conntrack_ftp_init(void) /* FIXME should be configurable whether IPv4 and IPv6 FTP connections are tracked or not - YK */ for (i = 0; i < ports_c; i++) { - ftp[i][0].tuple.src.l3num = PF_INET; - ftp[i][1].tuple.src.l3num = PF_INET6; - for (j = 0; j < 2; j++) { - ftp[i][j].data_len = sizeof(struct nf_ct_ftp_master); - ftp[i][j].tuple.src.u.tcp.port = htons(ports[i]); - ftp[i][j].tuple.dst.protonum = IPPROTO_TCP; - ftp[i][j].expect_policy = &ftp_exp_policy; - ftp[i][j].me = THIS_MODULE; - ftp[i][j].help = help; - ftp[i][j].from_nlattr = nf_ct_ftp_from_nlattr; - if (ports[i] == FTP_PORT) - sprintf(ftp[i][j].name, "ftp"); - else - sprintf(ftp[i][j].name, "ftp-%d", ports[i]); - - pr_debug("registering helper for pf: %d port: %d\n", - ftp[i][j].tuple.src.l3num, ports[i]); - ret = nf_conntrack_helper_register(&ftp[i][j]); - if (ret) { - pr_err("failed to register helper for pf: %d port: %d\n", - ftp[i][j].tuple.src.l3num, ports[i]); - ports_c = i; - nf_conntrack_ftp_fini(); - return ret; - } - } + nf_ct_helper_init(&ftp[2 * i], AF_INET, IPPROTO_TCP, "ftp", + FTP_PORT, ports[i], ports[i], &ftp_exp_policy, + 0, sizeof(struct nf_ct_ftp_master), help, + nf_ct_ftp_from_nlattr, THIS_MODULE); + nf_ct_helper_init(&ftp[2 * i + 1], AF_INET6, IPPROTO_TCP, "ftp", + FTP_PORT, ports[i], ports[i], &ftp_exp_policy, + 0, sizeof(struct nf_ct_ftp_master), help, + nf_ct_ftp_from_nlattr, THIS_MODULE); + } + + ret = nf_conntrack_helpers_register(ftp, ports_c * 2); + if (ret < 0) { + pr_err("failed to register helpers\n"); + kfree(ftp_buffer); + return ret; } return 0; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index a4294e949cdc..b989b81ac156 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -465,6 +465,63 @@ restart: } EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); +void nf_ct_helper_init(struct nf_conntrack_helper *helper, + u16 l3num, u16 protonum, const char *name, + u16 default_port, u16 spec_port, u32 id, + const struct nf_conntrack_expect_policy *exp_pol, + u32 expect_class_max, u32 data_len, + int (*help)(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo), + int (*from_nlattr)(struct nlattr *attr, + struct nf_conn *ct), + struct module *module) +{ + helper->tuple.src.l3num = l3num; + helper->tuple.dst.protonum = protonum; + helper->tuple.src.u.all = htons(spec_port); + helper->expect_policy = exp_pol; + helper->expect_class_max = expect_class_max; + helper->data_len = data_len; + helper->help = help; + helper->from_nlattr = from_nlattr; + helper->me = module; + + if (spec_port == default_port) + snprintf(helper->name, sizeof(helper->name), "%s", name); + else + snprintf(helper->name, sizeof(helper->name), "%s-%u", name, id); +} +EXPORT_SYMBOL_GPL(nf_ct_helper_init); + +int nf_conntrack_helpers_register(struct nf_conntrack_helper *helper, + unsigned int n) +{ + unsigned int i; + int err = 0; + + for (i = 0; i < n; i++) { + err = nf_conntrack_helper_register(&helper[i]); + if (err < 0) + goto err; + } + + return err; +err: + if (i > 0) + nf_conntrack_helpers_unregister(helper, i); + return err; +} +EXPORT_SYMBOL_GPL(nf_conntrack_helpers_register); + +void nf_conntrack_helpers_unregister(struct nf_conntrack_helper *helper, + unsigned int n) +{ + while (n-- > 0) + nf_conntrack_helper_unregister(&helper[n]); +} +EXPORT_SYMBOL_GPL(nf_conntrack_helpers_unregister); + static struct nf_ct_ext_type helper_extend __read_mostly = { .len = sizeof(struct nf_conn_help), .align = __alignof__(struct nf_conn_help), diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index f97ac61d2536..1972a149f958 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -255,27 +255,18 @@ static int __init nf_conntrack_irc_init(void) ports[ports_c++] = IRC_PORT; for (i = 0; i < ports_c; i++) { - irc[i].tuple.src.l3num = AF_INET; - irc[i].tuple.src.u.tcp.port = htons(ports[i]); - irc[i].tuple.dst.protonum = IPPROTO_TCP; - irc[i].expect_policy = &irc_exp_policy; - irc[i].me = THIS_MODULE; - irc[i].help = help; - - if (ports[i] == IRC_PORT) - sprintf(irc[i].name, "irc"); - else - sprintf(irc[i].name, "irc-%u", i); - - ret = nf_conntrack_helper_register(&irc[i]); - if (ret) { - pr_err("failed to register helper for pf: %u port: %u\n", - irc[i].tuple.src.l3num, ports[i]); - ports_c = i; - nf_conntrack_irc_fini(); - return ret; - } + nf_ct_helper_init(&irc[i], AF_INET, IPPROTO_TCP, "irc", + IRC_PORT, ports[i], i, &irc_exp_policy, + 0, 0, help, NULL, THIS_MODULE); + } + + ret = nf_conntrack_helpers_register(&irc[0], ports_c); + if (ret) { + pr_err("failed to register helpers\n"); + kfree(irc_buffer); + return ret; } + return 0; } @@ -283,10 +274,7 @@ static int __init nf_conntrack_irc_init(void) * it is needed by the init function */ static void nf_conntrack_irc_fini(void) { - int i; - - for (i = 0; i < ports_c; i++) - nf_conntrack_helper_unregister(&irc[i]); + nf_conntrack_helpers_unregister(irc, ports_c); kfree(irc_buffer); } diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c index 3fcbaab83b3d..9dcb9ee9b97d 100644 --- a/net/netfilter/nf_conntrack_sane.c +++ b/net/netfilter/nf_conntrack_sane.c @@ -166,7 +166,7 @@ out: return ret; } -static struct nf_conntrack_helper sane[MAX_PORTS][2] __read_mostly; +static struct nf_conntrack_helper sane[MAX_PORTS * 2] __read_mostly; static const struct nf_conntrack_expect_policy sane_exp_policy = { .max_expected = 1, @@ -176,22 +176,13 @@ static const struct nf_conntrack_expect_policy sane_exp_policy = { /* don't make this __exit, since it's called from __init ! */ static void nf_conntrack_sane_fini(void) { - int i, j; - - for (i = 0; i < ports_c; i++) { - for (j = 0; j < 2; j++) { - pr_debug("unregistering helper for pf: %d port: %d\n", - sane[i][j].tuple.src.l3num, ports[i]); - nf_conntrack_helper_unregister(&sane[i][j]); - } - } - + nf_conntrack_helpers_unregister(sane, ports_c * 2); kfree(sane_buffer); } static int __init nf_conntrack_sane_init(void) { - int i, j = -1, ret = 0; + int i, ret = 0; sane_buffer = kmalloc(65536, GFP_KERNEL); if (!sane_buffer) @@ -203,31 +194,23 @@ static int __init nf_conntrack_sane_init(void) /* FIXME should be configurable whether IPv4 and IPv6 connections are tracked or not - YK */ for (i = 0; i < ports_c; i++) { - sane[i][0].tuple.src.l3num = PF_INET; - sane[i][1].tuple.src.l3num = PF_INET6; - for (j = 0; j < 2; j++) { - sane[i][j].data_len = sizeof(struct nf_ct_sane_master); - sane[i][j].tuple.src.u.tcp.port = htons(ports[i]); - sane[i][j].tuple.dst.protonum = IPPROTO_TCP; - sane[i][j].expect_policy = &sane_exp_policy; - sane[i][j].me = THIS_MODULE; - sane[i][j].help = help; - if (ports[i] == SANE_PORT) - sprintf(sane[i][j].name, "sane"); - else - sprintf(sane[i][j].name, "sane-%d", ports[i]); - - pr_debug("registering helper for pf: %d port: %d\n", - sane[i][j].tuple.src.l3num, ports[i]); - ret = nf_conntrack_helper_register(&sane[i][j]); - if (ret) { - pr_err("failed to register helper for pf: %d port: %d\n", - sane[i][j].tuple.src.l3num, ports[i]); - ports_c = i; - nf_conntrack_sane_fini(); - return ret; - } - } + nf_ct_helper_init(&sane[2 * i], AF_INET, IPPROTO_TCP, "sane", + SANE_PORT, ports[i], ports[i], + &sane_exp_policy, 0, + sizeof(struct nf_ct_sane_master), help, NULL, + THIS_MODULE); + nf_ct_helper_init(&sane[2 * i + 1], AF_INET6, IPPROTO_TCP, "sane", + SANE_PORT, ports[i], ports[i], + &sane_exp_policy, 0, + sizeof(struct nf_ct_sane_master), help, NULL, + THIS_MODULE); + } + + ret = nf_conntrack_helpers_register(sane, ports_c * 2); + if (ret < 0) { + pr_err("failed to register helpers\n"); + kfree(sane_buffer); + return ret; } return 0; diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index f72ba5587588..8d9db9d4702b 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1589,7 +1589,7 @@ static int sip_help_udp(struct sk_buff *skb, unsigned int protoff, return process_sip_msg(skb, ct, protoff, dataoff, &dptr, &datalen); } -static struct nf_conntrack_helper sip[MAX_PORTS][4] __read_mostly; +static struct nf_conntrack_helper sip[MAX_PORTS * 4] __read_mostly; static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1] = { [SIP_EXPECT_SIGNALLING] = { @@ -1616,20 +1616,12 @@ static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1 static void nf_conntrack_sip_fini(void) { - int i, j; - - for (i = 0; i < ports_c; i++) { - for (j = 0; j < ARRAY_SIZE(sip[i]); j++) { - if (sip[i][j].me == NULL) - continue; - nf_conntrack_helper_unregister(&sip[i][j]); - } - } + nf_conntrack_helpers_unregister(sip, ports_c * 4); } static int __init nf_conntrack_sip_init(void) { - int i, j, ret; + int i, ret; if (ports_c == 0) ports[ports_c++] = SIP_PORT; @@ -1637,43 +1629,32 @@ static int __init nf_conntrack_sip_init(void) for (i = 0; i < ports_c; i++) { memset(&sip[i], 0, sizeof(sip[i])); - sip[i][0].tuple.src.l3num = AF_INET; - sip[i][0].tuple.dst.protonum = IPPROTO_UDP; - sip[i][0].help = sip_help_udp; - sip[i][1].tuple.src.l3num = AF_INET; - sip[i][1].tuple.dst.protonum = IPPROTO_TCP; - sip[i][1].help = sip_help_tcp; - - sip[i][2].tuple.src.l3num = AF_INET6; - sip[i][2].tuple.dst.protonum = IPPROTO_UDP; - sip[i][2].help = sip_help_udp; - sip[i][3].tuple.src.l3num = AF_INET6; - sip[i][3].tuple.dst.protonum = IPPROTO_TCP; - sip[i][3].help = sip_help_tcp; - - for (j = 0; j < ARRAY_SIZE(sip[i]); j++) { - sip[i][j].data_len = sizeof(struct nf_ct_sip_master); - sip[i][j].tuple.src.u.udp.port = htons(ports[i]); - sip[i][j].expect_policy = sip_exp_policy; - sip[i][j].expect_class_max = SIP_EXPECT_MAX; - sip[i][j].me = THIS_MODULE; - - if (ports[i] == SIP_PORT) - sprintf(sip[i][j].name, "sip"); - else - sprintf(sip[i][j].name, "sip-%u", i); - - pr_debug("port #%u: %u\n", i, ports[i]); + nf_ct_helper_init(&sip[4 * i], AF_INET, IPPROTO_UDP, "sip", + SIP_PORT, ports[i], i, sip_exp_policy, + SIP_EXPECT_MAX, + sizeof(struct nf_ct_sip_master), sip_help_udp, + NULL, THIS_MODULE); + nf_ct_helper_init(&sip[4 * i + 1], AF_INET, IPPROTO_TCP, "sip", + SIP_PORT, ports[i], i, sip_exp_policy, + SIP_EXPECT_MAX, + sizeof(struct nf_ct_sip_master), sip_help_tcp, + NULL, THIS_MODULE); + nf_ct_helper_init(&sip[4 * i + 2], AF_INET6, IPPROTO_UDP, "sip", + SIP_PORT, ports[i], i, sip_exp_policy, + SIP_EXPECT_MAX, + sizeof(struct nf_ct_sip_master), sip_help_udp, + NULL, THIS_MODULE); + nf_ct_helper_init(&sip[4 * i + 3], AF_INET6, IPPROTO_TCP, "sip", + SIP_PORT, ports[i], i, sip_exp_policy, + SIP_EXPECT_MAX, + sizeof(struct nf_ct_sip_master), sip_help_tcp, + NULL, THIS_MODULE); + } - ret = nf_conntrack_helper_register(&sip[i][j]); - if (ret) { - pr_err("failed to register helper for pf: %u port: %u\n", - sip[i][j].tuple.src.l3num, ports[i]); - ports_c = i; - nf_conntrack_sip_fini(); - return ret; - } - } + ret = nf_conntrack_helpers_register(sip, ports_c * 4); + if (ret < 0) { + pr_err("failed to register helpers\n"); + return ret; } return 0; } diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c index 2e65b5430fba..b1227dc6f75e 100644 --- a/net/netfilter/nf_conntrack_tftp.c +++ b/net/netfilter/nf_conntrack_tftp.c @@ -97,7 +97,7 @@ static int tftp_help(struct sk_buff *skb, return ret; } -static struct nf_conntrack_helper tftp[MAX_PORTS][2] __read_mostly; +static struct nf_conntrack_helper tftp[MAX_PORTS * 2] __read_mostly; static const struct nf_conntrack_expect_policy tftp_exp_policy = { .max_expected = 1, @@ -106,47 +106,29 @@ static const struct nf_conntrack_expect_policy tftp_exp_policy = { static void nf_conntrack_tftp_fini(void) { - int i, j; - - for (i = 0; i < ports_c; i++) { - for (j = 0; j < 2; j++) - nf_conntrack_helper_unregister(&tftp[i][j]); - } + nf_conntrack_helpers_unregister(tftp, ports_c * 2); } static int __init nf_conntrack_tftp_init(void) { - int i, j, ret; + int i, ret; if (ports_c == 0) ports[ports_c++] = TFTP_PORT; for (i = 0; i < ports_c; i++) { - memset(&tftp[i], 0, sizeof(tftp[i])); - - tftp[i][0].tuple.src.l3num = AF_INET; - tftp[i][1].tuple.src.l3num = AF_INET6; - for (j = 0; j < 2; j++) { - tftp[i][j].tuple.dst.protonum = IPPROTO_UDP; - tftp[i][j].tuple.src.u.udp.port = htons(ports[i]); - tftp[i][j].expect_policy = &tftp_exp_policy; - tftp[i][j].me = THIS_MODULE; - tftp[i][j].help = tftp_help; - - if (ports[i] == TFTP_PORT) - sprintf(tftp[i][j].name, "tftp"); - else - sprintf(tftp[i][j].name, "tftp-%u", i); - - ret = nf_conntrack_helper_register(&tftp[i][j]); - if (ret) { - pr_err("failed to register helper for pf: %u port: %u\n", - tftp[i][j].tuple.src.l3num, ports[i]); - ports_c = i; - nf_conntrack_tftp_fini(); - return ret; - } - } + nf_ct_helper_init(&tftp[2 * i], AF_INET, IPPROTO_UDP, "tftp", + TFTP_PORT, ports[i], i, &tftp_exp_policy, + 0, 0, tftp_help, NULL, THIS_MODULE); + nf_ct_helper_init(&tftp[2 * i + 1], AF_INET6, IPPROTO_UDP, "tftp", + TFTP_PORT, ports[i], i, &tftp_exp_policy, + 0, 0, tftp_help, NULL, THIS_MODULE); + } + + ret = nf_conntrack_helpers_register(tftp, ports_c * 2); + if (ret < 0) { + pr_err("failed to register helpers\n"); + return ret; } return 0; } -- cgit v1.2.3 From c2d9a4293ced88d7dad7c35c893a31f49f8b64f5 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 18 Jul 2016 20:44:15 +0800 Subject: netfilter: nft_log: fix possible memory leak if log expr init fail Suppose that we specify the NFTA_LOG_PREFIX, then NFTA_LOG_LEVEL and NFTA_LOG_GROUP are specified together or nf_logger_find_get call returns fail, i.e. expr init fail, memory leak will happen. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_log.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 713d66837705..e1b34ff0ebd0 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -52,6 +52,14 @@ static int nft_log_init(const struct nft_ctx *ctx, struct nft_log *priv = nft_expr_priv(expr); struct nf_loginfo *li = &priv->loginfo; const struct nlattr *nla; + int err; + + li->type = NF_LOG_TYPE_LOG; + if (tb[NFTA_LOG_LEVEL] != NULL && + tb[NFTA_LOG_GROUP] != NULL) + return -EINVAL; + if (tb[NFTA_LOG_GROUP] != NULL) + li->type = NF_LOG_TYPE_ULOG; nla = tb[NFTA_LOG_PREFIX]; if (nla != NULL) { @@ -63,13 +71,6 @@ static int nft_log_init(const struct nft_ctx *ctx, priv->prefix = (char *)nft_log_null_prefix; } - li->type = NF_LOG_TYPE_LOG; - if (tb[NFTA_LOG_LEVEL] != NULL && - tb[NFTA_LOG_GROUP] != NULL) - return -EINVAL; - if (tb[NFTA_LOG_GROUP] != NULL) - li->type = NF_LOG_TYPE_ULOG; - switch (li->type) { case NF_LOG_TYPE_LOG: if (tb[NFTA_LOG_LEVEL] != NULL) { @@ -96,7 +97,16 @@ static int nft_log_init(const struct nft_ctx *ctx, break; } - return nf_logger_find_get(ctx->afi->family, li->type); + err = nf_logger_find_get(ctx->afi->family, li->type); + if (err < 0) + goto err1; + + return 0; + +err1: + if (priv->prefix != nft_log_null_prefix) + kfree(priv->prefix); + return err; } static void nft_log_destroy(const struct nft_ctx *ctx, -- cgit v1.2.3 From 1bc4e0136cb32282d7968e11cfabc40763fdb03c Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 18 Jul 2016 20:44:16 +0800 Subject: netfilter: nft_log: check the validity of log level User can specify the log level larger than 7(debug level) via nfnetlink, this is invalid. So in this case, we should report EINVAL to the userspace. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_log.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index e1b34ff0ebd0..5f6f088ff06e 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -79,6 +79,11 @@ static int nft_log_init(const struct nft_ctx *ctx, } else { li->u.log.level = LOGLEVEL_WARNING; } + if (li->u.log.level > LOGLEVEL_DEBUG) { + err = -EINVAL; + goto err1; + } + if (tb[NFTA_LOG_FLAGS] != NULL) { li->u.log.logflags = ntohl(nla_get_be32(tb[NFTA_LOG_FLAGS])); -- cgit v1.2.3 From cc37c1ad42ba6bc07c3d7a999f898e11d69a2580 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 18 Jul 2016 20:44:17 +0800 Subject: netfilter: nft_log: fix snaplen does not truncate packets There's a similar problem in xt_NFLOG, and was fixed by commit 7643507fe8b5 ("netfilter: xt_NFLOG: nflog-range does not truncate packets"). Only set copy_len here does not work, so we should enable NF_LOG_F_COPY_LEN also. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_log.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 5f6f088ff06e..24a73bb26e94 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -92,6 +92,7 @@ static int nft_log_init(const struct nft_ctx *ctx, case NF_LOG_TYPE_ULOG: li->u.ulog.group = ntohs(nla_get_be16(tb[NFTA_LOG_GROUP])); if (tb[NFTA_LOG_SNAPLEN] != NULL) { + li->u.ulog.flags |= NF_LOG_F_COPY_LEN; li->u.ulog.copy_len = ntohl(nla_get_be32(tb[NFTA_LOG_SNAPLEN])); } @@ -149,7 +150,7 @@ static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr) if (nla_put_be16(skb, NFTA_LOG_GROUP, htons(li->u.ulog.group))) goto nla_put_failure; - if (li->u.ulog.copy_len) { + if (li->u.ulog.flags & NF_LOG_F_COPY_LEN) { if (nla_put_be32(skb, NFTA_LOG_SNAPLEN, htonl(li->u.ulog.copy_len))) goto nla_put_failure; -- cgit v1.2.3 From 6e1f760e13c75eb0c21c75c6eed918e25b54cd07 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 19 Jul 2016 12:20:45 +0200 Subject: netfilter: nf_tables: allow to filter out rules by table and chain If the table and/or chain attributes are set in a rule dump request, we filter out the rules based on this selection. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 0211eaec9060..13d50e7cfe2f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1857,10 +1857,16 @@ err: return err; } +struct nft_rule_dump_ctx { + char table[NFT_TABLE_MAXNAMELEN]; + char chain[NFT_CHAIN_MAXNAMELEN]; +}; + static int nf_tables_dump_rules(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + const struct nft_rule_dump_ctx *ctx = cb->data; const struct nft_af_info *afi; const struct nft_table *table; const struct nft_chain *chain; @@ -1877,7 +1883,15 @@ static int nf_tables_dump_rules(struct sk_buff *skb, continue; list_for_each_entry_rcu(table, &afi->tables, list) { + if (ctx && ctx->table[0] && + strcmp(ctx->table, table->name) != 0) + continue; + list_for_each_entry_rcu(chain, &table->chains, list) { + if (ctx && ctx->chain[0] && + strcmp(ctx->chain, chain->name) != 0) + continue; + list_for_each_entry_rcu(rule, &chain->rules, list) { if (!nft_is_active(net, rule)) goto cont; @@ -1907,6 +1921,12 @@ done: return skb->len; } +static int nf_tables_dump_rules_done(struct netlink_callback *cb) +{ + kfree(cb->data); + return 0; +} + static int nf_tables_getrule(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -1924,7 +1944,25 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nf_tables_dump_rules, + .done = nf_tables_dump_rules_done, }; + + if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) { + struct nft_rule_dump_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + if (nla[NFTA_RULE_TABLE]) + nla_strlcpy(ctx->table, nla[NFTA_RULE_TABLE], + sizeof(ctx->table)); + if (nla[NFTA_RULE_CHAIN]) + nla_strlcpy(ctx->chain, nla[NFTA_RULE_CHAIN], + sizeof(ctx->chain)); + c.data = ctx; + } + return netlink_dump_start(nlsk, skb, nlh, &c); } -- cgit v1.2.3 From 262d8625045e0c81b7859ecd192e9811710f19da Mon Sep 17 00:00:00 2001 From: Brenden Blanco Date: Wed, 20 Jul 2016 17:22:34 -0700 Subject: rtnl: protect do_setlink from IFLA_XDP_ATTACHED The IFLA_XDP_ATTACHED nested attribute is meant for read-only, and while do_setlink properly ignores it, it should be more paranoid and reject commands that try to set it. Signed-off-by: Brenden Blanco Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index eba2b8260dbd..189cc78c77eb 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2109,6 +2109,10 @@ static int do_setlink(const struct sk_buff *skb, if (err < 0) goto errout; + if (xdp[IFLA_XDP_ATTACHED]) { + err = -EINVAL; + goto errout; + } if (xdp[IFLA_XDP_FD]) { err = dev_change_xdp_fd(dev, nla_get_s32(xdp[IFLA_XDP_FD])); -- cgit v1.2.3 From 23014011ba4209a086931ff402eac1c41abbe456 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 21 Jul 2016 12:51:16 +0200 Subject: netfilter: conntrack: support a fixed size of 128 distinct labels The conntrack label extension is currently variable-sized, e.g. if only 2 labels are used by iptables rules then the labels->bits[] array will only contain one element. We track size of each label storage area in the 'words' member. But in nftables and openvswitch we always have to ask for worst-case since we don't know what bit will be used at configuration time. As most arches are 64bit we need to allocate 24 bytes in this case: struct nf_conn_labels { u8 words; /* 0 1 */ /* XXX 7 bytes hole, try to pack */ long unsigned bits[2]; /* 8 24 */ Make bits a fixed size and drop the words member, it simplifies the code and only increases memory requirements on x86 when less than 64bit labels are required. We still only allocate the extension if its needed. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_labels.c | 13 +++---------- net/netfilter/nf_conntrack_netlink.c | 10 +++++----- net/netfilter/nft_ct.c | 13 +++---------- net/netfilter/xt_connlabel.c | 2 +- net/openvswitch/conntrack.c | 4 ++-- 5 files changed, 14 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c index 252e6a7cd2f1..7686200f9ace 100644 --- a/net/netfilter/nf_conntrack_labels.c +++ b/net/netfilter/nf_conntrack_labels.c @@ -20,7 +20,7 @@ int nf_connlabel_set(struct nf_conn *ct, u16 bit) { struct nf_conn_labels *labels = nf_ct_labels_find(ct); - if (!labels || BIT_WORD(bit) >= labels->words) + if (!labels) return -ENOSPC; if (test_bit(bit, labels->bits)) @@ -60,7 +60,7 @@ int nf_connlabels_replace(struct nf_conn *ct, if (!labels) return -ENOSPC; - size = labels->words * sizeof(long); + size = sizeof(labels->bits); if (size < (words32 * sizeof(u32))) words32 = size / sizeof(u32); @@ -80,16 +80,11 @@ EXPORT_SYMBOL_GPL(nf_connlabels_replace); int nf_connlabels_get(struct net *net, unsigned int bits) { - size_t words; - - words = BIT_WORD(bits) + 1; - if (words > NF_CT_LABELS_MAX_SIZE / sizeof(long)) + if (BIT_WORD(bits) >= NF_CT_LABELS_MAX_SIZE / sizeof(long)) return -ERANGE; spin_lock(&nf_connlabels_lock); net->ct.labels_used++; - if (words > net->ct.label_words) - net->ct.label_words = words; spin_unlock(&nf_connlabels_lock); return 0; @@ -100,8 +95,6 @@ void nf_connlabels_put(struct net *net) { spin_lock(&nf_connlabels_lock); net->ct.labels_used--; - if (net->ct.labels_used == 0) - net->ct.label_words = 0; spin_unlock(&nf_connlabels_lock); } EXPORT_SYMBOL_GPL(nf_connlabels_put); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index a18d1ceabad5..050bb3420a6b 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -346,25 +346,25 @@ static inline int ctnetlink_label_size(const struct nf_conn *ct) if (!labels) return 0; - return nla_total_size(labels->words * sizeof(long)); + return nla_total_size(sizeof(labels->bits)); } static int ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct) { struct nf_conn_labels *labels = nf_ct_labels_find(ct); - unsigned int len, i; + unsigned int i; if (!labels) return 0; - len = labels->words * sizeof(long); i = 0; do { if (labels->bits[i] != 0) - return nla_put(skb, CTA_LABELS, len, labels->bits); + return nla_put(skb, CTA_LABELS, sizeof(labels->bits), + labels->bits); i++; - } while (i < labels->words); + } while (i < ARRAY_SIZE(labels->bits)); return 0; } diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index d9e44ca34055..2f47d5d3ae3b 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -113,18 +113,11 @@ static void nft_ct_get_eval(const struct nft_expr *expr, #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: { struct nf_conn_labels *labels = nf_ct_labels_find(ct); - unsigned int size; - if (!labels) { + if (labels) + memcpy(dest, labels->bits, NF_CT_LABELS_MAX_SIZE); + else memset(dest, 0, NF_CT_LABELS_MAX_SIZE); - return; - } - - size = labels->words * sizeof(long); - memcpy(dest, labels->bits, size); - if (size < NF_CT_LABELS_MAX_SIZE) - memset(((char *) dest) + size, 0, - NF_CT_LABELS_MAX_SIZE - size); return; } #endif diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c index a79af255561a..c9fba8ade3d5 100644 --- a/net/netfilter/xt_connlabel.c +++ b/net/netfilter/xt_connlabel.c @@ -25,7 +25,7 @@ static bool connlabel_match(const struct nf_conn *ct, u16 bit) if (!labels) return false; - return BIT_WORD(bit) < labels->words && test_bit(bit, labels->bits); + return test_bit(bit, labels->bits); } static bool diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index b4069a90e375..c644c78ed485 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -135,7 +135,7 @@ static void ovs_ct_get_labels(const struct nf_conn *ct, struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL; if (cl) { - size_t len = cl->words * sizeof(long); + size_t len = sizeof(cl->bits); if (len > OVS_CT_LABELS_LEN) len = OVS_CT_LABELS_LEN; @@ -274,7 +274,7 @@ static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key, nf_ct_labels_ext_add(ct); cl = nf_ct_labels_find(ct); } - if (!cl || cl->words * sizeof(long) < OVS_CT_LABELS_LEN) + if (!cl || sizeof(cl->bits) < OVS_CT_LABELS_LEN) return -ENOSPC; err = nf_connlabels_replace(ct, (u32 *)labels, (u32 *)mask, -- cgit v1.2.3 From 857ed310c013fe0d0059f955048dab589fa7a57a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 21 Jul 2016 12:51:17 +0200 Subject: netfilter: connlabels: move set helper to xt_connlabel xt_connlabel is the only user so move it. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_labels.c | 17 ----------------- net/netfilter/xt_connlabel.c | 29 ++++++++++++++++------------- 2 files changed, 16 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c index 7686200f9ace..bcab8bde7312 100644 --- a/net/netfilter/nf_conntrack_labels.c +++ b/net/netfilter/nf_conntrack_labels.c @@ -16,23 +16,6 @@ static spinlock_t nf_connlabels_lock; -int nf_connlabel_set(struct nf_conn *ct, u16 bit) -{ - struct nf_conn_labels *labels = nf_ct_labels_find(ct); - - if (!labels) - return -ENOSPC; - - if (test_bit(bit, labels->bits)) - return 0; - - if (!test_and_set_bit(bit, labels->bits)) - nf_conntrack_event_cache(IPCT_LABEL, ct); - - return 0; -} -EXPORT_SYMBOL_GPL(nf_connlabel_set); - static int replace_u32(u32 *address, u32 mask, u32 new) { u32 old, tmp; diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c index c9fba8ade3d5..03d66f1c5e69 100644 --- a/net/netfilter/xt_connlabel.c +++ b/net/netfilter/xt_connlabel.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -18,21 +19,12 @@ MODULE_DESCRIPTION("Xtables: add/match connection trackling labels"); MODULE_ALIAS("ipt_connlabel"); MODULE_ALIAS("ip6t_connlabel"); -static bool connlabel_match(const struct nf_conn *ct, u16 bit) -{ - struct nf_conn_labels *labels = nf_ct_labels_find(ct); - - if (!labels) - return false; - - return test_bit(bit, labels->bits); -} - static bool connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_connlabel_mtinfo *info = par->matchinfo; enum ip_conntrack_info ctinfo; + struct nf_conn_labels *labels; struct nf_conn *ct; bool invert = info->options & XT_CONNLABEL_OP_INVERT; @@ -40,10 +32,21 @@ connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par) if (ct == NULL || nf_ct_is_untracked(ct)) return invert; - if (info->options & XT_CONNLABEL_OP_SET) - return (nf_connlabel_set(ct, info->bit) == 0) ^ invert; + labels = nf_ct_labels_find(ct); + if (!labels) + return invert; + + if (test_bit(info->bit, labels->bits)) + return !invert; + + if (info->options & XT_CONNLABEL_OP_SET) { + if (!test_and_set_bit(info->bit, labels->bits)) + nf_conntrack_event_cache(IPCT_LABEL, ct); + + return !invert; + } - return connlabel_match(ct, info->bit) ^ invert; + return invert; } static int connlabel_mt_check(const struct xt_mtchk_param *par) -- cgit v1.2.3 From 96d1327ac2e3dc3ac4204fe3656dad0043fc0efd Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Fri, 22 Jul 2016 12:59:15 +0800 Subject: netfilter: h323: Use mod_timer instead of set_expect_timeout Simplify the code without any side effect. The set_expect_timeout is used to modify the timer expired time. It tries to delete timer, and add it again. So we could use mod_timer directly. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_h323_main.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 9511af04dc81..bb77a97961bf 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -1272,19 +1272,6 @@ static struct nf_conntrack_expect *find_expect(struct nf_conn *ct, return NULL; } -/****************************************************************************/ -static int set_expect_timeout(struct nf_conntrack_expect *exp, - unsigned int timeout) -{ - if (!exp || !del_timer(&exp->timeout)) - return 0; - - exp->timeout.expires = jiffies + timeout * HZ; - add_timer(&exp->timeout); - - return 1; -} - /****************************************************************************/ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, @@ -1486,7 +1473,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct, "timeout to %u seconds for", info->timeout); nf_ct_dump_tuple(&exp->tuple); - set_expect_timeout(exp, info->timeout); + mod_timer(&exp->timeout, jiffies + info->timeout * HZ); } spin_unlock_bh(&nf_conntrack_expect_lock); } -- cgit v1.2.3 From 2bf4fade54de995f84d319ae02003609dca450f3 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sat, 23 Jul 2016 16:00:31 +0800 Subject: netfilter: nft_compat: put back match/target module if init fail If the user specify the invalid NFTA_MATCH_INFO/NFTA_TARGET_INFO attr or memory alloc fail, we should call module_put to the related match or target. Otherwise, we cannot remove the module even nobody use it. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 6228c422c766..2e07cec50ffd 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -634,6 +634,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, struct xt_match *match; char *mt_name; u32 rev, family; + int err; if (tb[NFTA_MATCH_NAME] == NULL || tb[NFTA_MATCH_REV] == NULL || @@ -660,13 +661,17 @@ nft_match_select_ops(const struct nft_ctx *ctx, if (IS_ERR(match)) return ERR_PTR(-ENOENT); - if (match->matchsize > nla_len(tb[NFTA_MATCH_INFO])) - return ERR_PTR(-EINVAL); + if (match->matchsize > nla_len(tb[NFTA_MATCH_INFO])) { + err = -EINVAL; + goto err; + } /* This is the first time we use this match, allocate operations */ nft_match = kzalloc(sizeof(struct nft_xt), GFP_KERNEL); - if (nft_match == NULL) - return ERR_PTR(-ENOMEM); + if (nft_match == NULL) { + err = -ENOMEM; + goto err; + } nft_match->ops.type = &nft_match_type; nft_match->ops.size = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize)); @@ -680,6 +685,9 @@ nft_match_select_ops(const struct nft_ctx *ctx, list_add(&nft_match->head, &nft_match_list); return &nft_match->ops; +err: + module_put(match->me); + return ERR_PTR(err); } static void nft_match_release(void) @@ -717,6 +725,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, struct xt_target *target; char *tg_name; u32 rev, family; + int err; if (tb[NFTA_TARGET_NAME] == NULL || tb[NFTA_TARGET_REV] == NULL || @@ -743,13 +752,17 @@ nft_target_select_ops(const struct nft_ctx *ctx, if (IS_ERR(target)) return ERR_PTR(-ENOENT); - if (target->targetsize > nla_len(tb[NFTA_TARGET_INFO])) - return ERR_PTR(-EINVAL); + if (target->targetsize > nla_len(tb[NFTA_TARGET_INFO])) { + err = -EINVAL; + goto err; + } /* This is the first time we use this target, allocate operations */ nft_target = kzalloc(sizeof(struct nft_xt), GFP_KERNEL); - if (nft_target == NULL) - return ERR_PTR(-ENOMEM); + if (nft_target == NULL) { + err = -ENOMEM; + goto err; + } nft_target->ops.type = &nft_target_type; nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize)); @@ -767,6 +780,9 @@ nft_target_select_ops(const struct nft_ctx *ctx, list_add(&nft_target->head, &nft_target_list); return &nft_target->ops; +err: + module_put(target->me); + return ERR_PTR(err); } static void nft_target_release(void) -- cgit v1.2.3 From 4b512e1c1f8de6b9ceb796ecef8658e0a083cab7 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sat, 23 Jul 2016 16:00:32 +0800 Subject: netfilter: nft_compat: fix crash when related match/target module is removed We "cache" the loaded match/target modules and reuse them, but when the modules are removed, we still point to them. Then we may end up with invalid memory references when using iptables-compat to add rules later. Input the following commands will reproduce the kernel crash: # iptables-compat -A INPUT -j LOG # iptables-compat -D INPUT -j LOG # rmmod xt_LOG # iptables-compat -A INPUT -j LOG BUG: unable to handle kernel paging request at ffffffffa05a9010 IP: [] strcmp+0xe/0x30 Call Trace: [] nft_target_select_ops+0x83/0x1f0 [nft_compat] [] nf_tables_expr_parse+0x147/0x1f0 [nf_tables] [] nf_tables_newrule+0x301/0x810 [nf_tables] [] ? nla_parse+0x20/0x100 [] nfnetlink_rcv+0x33f/0x53d [nfnetlink] [] ? nfnetlink_rcv+0x1fb/0x53d [nfnetlink] [] netlink_unicast+0x178/0x220 [] netlink_sendmsg+0x2fb/0x3a0 [] sock_sendmsg+0x38/0x50 [] ___sys_sendmsg+0x28e/0x2a0 [] ? release_sock+0x1e/0xb0 [] ? _raw_spin_unlock_bh+0x35/0x40 [] ? release_sock+0x82/0xb0 [] __sys_sendmsg+0x54/0x90 [] SyS_sendmsg+0x12/0x20 [] entry_SYSCALL_64_fastpath+0x1a/0xa9 So when nobody use the related match/target module, there's no need to "cache" it. And nft_[match|target]_release are useless anymore, remove them. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 2e07cec50ffd..c21e7eb8dce0 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -23,6 +23,20 @@ #include #include +struct nft_xt { + struct list_head head; + struct nft_expr_ops ops; + unsigned int refcnt; +}; + +static void nft_xt_put(struct nft_xt *xt) +{ + if (--xt->refcnt == 0) { + list_del(&xt->head); + kfree(xt); + } +} + static int nft_compat_chain_validate_dependency(const char *tablename, const struct nft_chain *chain) { @@ -260,6 +274,7 @@ nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) if (par.target->destroy != NULL) par.target->destroy(&par); + nft_xt_put(container_of(expr->ops, struct nft_xt, ops)); module_put(target->me); } @@ -442,6 +457,7 @@ nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) if (par.match->destroy != NULL) par.match->destroy(&par); + nft_xt_put(container_of(expr->ops, struct nft_xt, ops)); module_put(match->me); } @@ -612,11 +628,6 @@ static const struct nfnetlink_subsystem nfnl_compat_subsys = { static LIST_HEAD(nft_match_list); -struct nft_xt { - struct list_head head; - struct nft_expr_ops ops; -}; - static struct nft_expr_type nft_match_type; static bool nft_match_cmp(const struct xt_match *match, @@ -653,6 +664,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, if (!try_module_get(match->me)) return ERR_PTR(-ENOENT); + nft_match->refcnt++; return &nft_match->ops; } } @@ -673,6 +685,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, goto err; } + nft_match->refcnt = 1; nft_match->ops.type = &nft_match_type; nft_match->ops.size = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize)); nft_match->ops.eval = nft_match_eval; @@ -690,14 +703,6 @@ err: return ERR_PTR(err); } -static void nft_match_release(void) -{ - struct nft_xt *nft_match, *tmp; - - list_for_each_entry_safe(nft_match, tmp, &nft_match_list, head) - kfree(nft_match); -} - static struct nft_expr_type nft_match_type __read_mostly = { .name = "match", .select_ops = nft_match_select_ops, @@ -744,6 +749,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, if (!try_module_get(target->me)) return ERR_PTR(-ENOENT); + nft_target->refcnt++; return &nft_target->ops; } } @@ -764,6 +770,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, goto err; } + nft_target->refcnt = 1; nft_target->ops.type = &nft_target_type; nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize)); nft_target->ops.init = nft_target_init; @@ -785,14 +792,6 @@ err: return ERR_PTR(err); } -static void nft_target_release(void) -{ - struct nft_xt *nft_target, *tmp; - - list_for_each_entry_safe(nft_target, tmp, &nft_target_list, head) - kfree(nft_target); -} - static struct nft_expr_type nft_target_type __read_mostly = { .name = "target", .select_ops = nft_target_select_ops, @@ -835,8 +834,6 @@ static void __exit nft_compat_module_exit(void) nfnetlink_subsys_unregister(&nfnl_compat_subsys); nft_unregister_expr(&nft_target_type); nft_unregister_expr(&nft_match_type); - nft_match_release(); - nft_target_release(); } MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFT_COMPAT); -- cgit v1.2.3 From bf3994d2ed310813da28362d87bfe9f0e1c3e37f Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 21 Jul 2016 12:03:11 +0200 Subject: net/sched: introduce Match-all classifier The matchall classifier matches every packet and allows the user to apply actions on it. This filter is very useful in usecases where every packet should be matched, for example, packet mirroring (SPAN) can be setup very easily using that filter. Signed-off-by: Jiri Pirko Signed-off-by: Yotam Gigi Signed-off-by: David S. Miller --- net/sched/Kconfig | 10 ++ net/sched/Makefile | 1 + net/sched/cls_matchall.c | 248 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 259 insertions(+) create mode 100644 net/sched/cls_matchall.c (limited to 'net') diff --git a/net/sched/Kconfig b/net/sched/Kconfig index b148302bbaf2..ccf931b3b94c 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -494,6 +494,16 @@ config NET_CLS_FLOWER To compile this code as a module, choose M here: the module will be called cls_flower. +config NET_CLS_MATCHALL + tristate "Match-all classifier" + select NET_CLS + ---help--- + If you say Y here, you will be able to classify packets based on + nothing. Every packet will match. + + To compile this code as a module, choose M here: the module will + be called cls_matchall. + config NET_EMATCH bool "Extended Matches" select NET_CLS diff --git a/net/sched/Makefile b/net/sched/Makefile index 84bddb373517..ae088a5a9d95 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -60,6 +60,7 @@ obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o obj-$(CONFIG_NET_CLS_BPF) += cls_bpf.o obj-$(CONFIG_NET_CLS_FLOWER) += cls_flower.o +obj-$(CONFIG_NET_CLS_MATCHALL) += cls_matchall.o obj-$(CONFIG_NET_EMATCH) += ematch.o obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c new file mode 100644 index 000000000000..8a6b4de7a99a --- /dev/null +++ b/net/sched/cls_matchall.c @@ -0,0 +1,248 @@ +/* + * net/sched/cls_matchll.c Match-all classifier + * + * Copyright (c) 2016 Jiri Pirko + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include + +#include +#include + +struct cls_mall_filter { + struct tcf_exts exts; + struct tcf_result res; + u32 handle; + struct rcu_head rcu; +}; + +struct cls_mall_head { + struct cls_mall_filter *filter; + struct rcu_head rcu; +}; + +static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) +{ + struct cls_mall_head *head = rcu_dereference_bh(tp->root); + struct cls_mall_filter *f = head->filter; + + return tcf_exts_exec(skb, &f->exts, res); +} + +static int mall_init(struct tcf_proto *tp) +{ + struct cls_mall_head *head; + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (!head) + return -ENOBUFS; + + rcu_assign_pointer(tp->root, head); + + return 0; +} + +static void mall_destroy_filter(struct rcu_head *head) +{ + struct cls_mall_filter *f = container_of(head, struct cls_mall_filter, rcu); + + tcf_exts_destroy(&f->exts); + kfree(f); +} + +static bool mall_destroy(struct tcf_proto *tp, bool force) +{ + struct cls_mall_head *head = rtnl_dereference(tp->root); + + if (!force && head->filter) + return false; + + if (head->filter) + call_rcu(&head->filter->rcu, mall_destroy_filter); + RCU_INIT_POINTER(tp->root, NULL); + kfree_rcu(head, rcu); + return true; +} + +static unsigned long mall_get(struct tcf_proto *tp, u32 handle) +{ + struct cls_mall_head *head = rtnl_dereference(tp->root); + struct cls_mall_filter *f = head->filter; + + if (f && f->handle == handle) + return (unsigned long) f; + return 0; +} + +static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = { + [TCA_MATCHALL_UNSPEC] = { .type = NLA_UNSPEC }, + [TCA_MATCHALL_CLASSID] = { .type = NLA_U32 }, +}; + +static int mall_set_parms(struct net *net, struct tcf_proto *tp, + struct cls_mall_filter *f, + unsigned long base, struct nlattr **tb, + struct nlattr *est, bool ovr) +{ + struct tcf_exts e; + int err; + + tcf_exts_init(&e, TCA_MATCHALL_ACT, 0); + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + if (err < 0) + return err; + + if (tb[TCA_MATCHALL_CLASSID]) { + f->res.classid = nla_get_u32(tb[TCA_MATCHALL_CLASSID]); + tcf_bind_filter(tp, &f->res, base); + } + + tcf_exts_change(tp, &f->exts, &e); + + return 0; +} + +static int mall_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, + u32 handle, struct nlattr **tca, + unsigned long *arg, bool ovr) +{ + struct cls_mall_head *head = rtnl_dereference(tp->root); + struct cls_mall_filter *fold = (struct cls_mall_filter *) *arg; + struct cls_mall_filter *f; + struct nlattr *tb[TCA_MATCHALL_MAX + 1]; + int err; + + if (!tca[TCA_OPTIONS]) + return -EINVAL; + + if (head->filter) + return -EBUSY; + + if (fold) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_MATCHALL_MAX, + tca[TCA_OPTIONS], mall_policy); + if (err < 0) + return err; + + f = kzalloc(sizeof(*f), GFP_KERNEL); + if (!f) + return -ENOBUFS; + + tcf_exts_init(&f->exts, TCA_MATCHALL_ACT, 0); + + if (!handle) + handle = 1; + f->handle = handle; + + err = mall_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr); + if (err) + goto errout; + + *arg = (unsigned long) f; + rcu_assign_pointer(head->filter, f); + + return 0; + +errout: + kfree(f); + return err; +} + +static int mall_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct cls_mall_head *head = rtnl_dereference(tp->root); + struct cls_mall_filter *f = (struct cls_mall_filter *) arg; + + RCU_INIT_POINTER(head->filter, NULL); + tcf_unbind_filter(tp, &f->res); + call_rcu(&f->rcu, mall_destroy_filter); + return 0; +} + +static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct cls_mall_head *head = rtnl_dereference(tp->root); + struct cls_mall_filter *f = head->filter; + + if (arg->count < arg->skip) + goto skip; + if (arg->fn(tp, (unsigned long) f, arg) < 0) + arg->stop = 1; +skip: + arg->count++; +} + +static int mall_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct cls_mall_filter *f = (struct cls_mall_filter *) fh; + struct nlattr *nest; + + if (!f) + return skb->len; + + t->tcm_handle = f->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + if (f->res.classid && + nla_put_u32(skb, TCA_MATCHALL_CLASSID, f->res.classid)) + goto nla_put_failure; + + if (tcf_exts_dump(skb, &f->exts)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &f->exts) < 0) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static struct tcf_proto_ops cls_mall_ops __read_mostly = { + .kind = "matchall", + .classify = mall_classify, + .init = mall_init, + .destroy = mall_destroy, + .get = mall_get, + .change = mall_change, + .delete = mall_delete, + .walk = mall_walk, + .dump = mall_dump, + .owner = THIS_MODULE, +}; + +static int __init cls_mall_init(void) +{ + return register_tcf_proto_ops(&cls_mall_ops); +} + +static void __exit cls_mall_exit(void) +{ + unregister_tcf_proto_ops(&cls_mall_ops); +} + +module_init(cls_mall_init); +module_exit(cls_mall_exit); + +MODULE_AUTHOR("Jiri Pirko "); +MODULE_DESCRIPTION("Match-all classifier"); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From b87f7936a93246804cf70e7e2e0568799c948bb1 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Thu, 21 Jul 2016 12:03:12 +0200 Subject: net/sched: Add match-all classifier hw offloading. Following the work that have been done on offloading classifiers like u32 and flower, now the match-all classifier hw offloading is possible. if the interface supports tc offloading. To control the offloading, two tc flags have been introduced: skip_sw and skip_hw. Typical usage: tc filter add dev eth25 parent ffff: \ matchall skip_sw \ action mirred egress mirror \ dev eth27 Signed-off-by: Yotam Gigi Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_matchall.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 73 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 8a6b4de7a99a..25927b6c4436 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -21,6 +21,7 @@ struct cls_mall_filter { struct tcf_result res; u32 handle; struct rcu_head rcu; + u32 flags; }; struct cls_mall_head { @@ -34,6 +35,9 @@ static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct cls_mall_head *head = rcu_dereference_bh(tp->root); struct cls_mall_filter *f = head->filter; + if (tc_skip_sw(f->flags)) + return -1; + return tcf_exts_exec(skb, &f->exts, res); } @@ -55,18 +59,61 @@ static void mall_destroy_filter(struct rcu_head *head) struct cls_mall_filter *f = container_of(head, struct cls_mall_filter, rcu); tcf_exts_destroy(&f->exts); + kfree(f); } +static int mall_replace_hw_filter(struct tcf_proto *tp, + struct cls_mall_filter *f, + unsigned long cookie) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_to_netdev offload; + struct tc_cls_matchall_offload mall_offload = {0}; + + offload.type = TC_SETUP_MATCHALL; + offload.cls_mall = &mall_offload; + offload.cls_mall->command = TC_CLSMATCHALL_REPLACE; + offload.cls_mall->exts = &f->exts; + offload.cls_mall->cookie = cookie; + + return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, + &offload); +} + +static void mall_destroy_hw_filter(struct tcf_proto *tp, + struct cls_mall_filter *f, + unsigned long cookie) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_to_netdev offload; + struct tc_cls_matchall_offload mall_offload = {0}; + + offload.type = TC_SETUP_MATCHALL; + offload.cls_mall = &mall_offload; + offload.cls_mall->command = TC_CLSMATCHALL_DESTROY; + offload.cls_mall->exts = NULL; + offload.cls_mall->cookie = cookie; + + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, + &offload); +} + static bool mall_destroy(struct tcf_proto *tp, bool force) { struct cls_mall_head *head = rtnl_dereference(tp->root); + struct net_device *dev = tp->q->dev_queue->dev; + struct cls_mall_filter *f = head->filter; - if (!force && head->filter) + if (!force && f) return false; - if (head->filter) - call_rcu(&head->filter->rcu, mall_destroy_filter); + if (f) { + if (tc_should_offload(dev, tp, f->flags)) + mall_destroy_hw_filter(tp, f, (unsigned long) f); + + call_rcu(&f->rcu, mall_destroy_filter); + } RCU_INIT_POINTER(tp->root, NULL); kfree_rcu(head, rcu); return true; @@ -117,8 +164,10 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, { struct cls_mall_head *head = rtnl_dereference(tp->root); struct cls_mall_filter *fold = (struct cls_mall_filter *) *arg; + struct net_device *dev = tp->q->dev_queue->dev; struct cls_mall_filter *f; struct nlattr *tb[TCA_MATCHALL_MAX + 1]; + u32 flags = 0; int err; if (!tca[TCA_OPTIONS]) @@ -135,6 +184,12 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, if (err < 0) return err; + if (tb[TCA_MATCHALL_FLAGS]) { + flags = nla_get_u32(tb[TCA_MATCHALL_FLAGS]); + if (!tc_flags_valid(flags)) + return -EINVAL; + } + f = kzalloc(sizeof(*f), GFP_KERNEL); if (!f) return -ENOBUFS; @@ -144,11 +199,22 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, if (!handle) handle = 1; f->handle = handle; + f->flags = flags; err = mall_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr); if (err) goto errout; + if (tc_should_offload(dev, tp, flags)) { + err = mall_replace_hw_filter(tp, f, (unsigned long) f); + if (err) { + if (tc_skip_sw(flags)) + goto errout; + else + err = 0; + } + } + *arg = (unsigned long) f; rcu_assign_pointer(head->filter, f); @@ -163,6 +229,10 @@ static int mall_delete(struct tcf_proto *tp, unsigned long arg) { struct cls_mall_head *head = rtnl_dereference(tp->root); struct cls_mall_filter *f = (struct cls_mall_filter *) arg; + struct net_device *dev = tp->q->dev_queue->dev; + + if (tc_should_offload(dev, tp, f->flags)) + mall_destroy_hw_filter(tp, f, (unsigned long) f); RCU_INIT_POINTER(head->filter, NULL); tcf_unbind_filter(tp, &f->res); -- cgit v1.2.3 From dba479f3d60a9d1f90458a4c6eb0754ecd22348c Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 21 Jul 2016 12:42:10 -0400 Subject: net: bridge: fix br_stp_enable_bridge comment br_stp_enable_bridge() does take the br->lock spinlock. Fix its wrongly pasted comment and use the same as br_stp_disable_bridge(). Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/bridge/br_stp_if.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 984d46263007..341caa0ca63a 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -55,7 +55,7 @@ void br_init_port(struct net_bridge_port *p) netdev_err(p->dev, "failed to set HW ageing time\n"); } -/* called under bridge lock */ +/* NO locks held */ void br_stp_enable_bridge(struct net_bridge *br) { struct net_bridge_port *p; -- cgit v1.2.3 From 9e0b27fe5ada7752577f3e1260eec44e79476142 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 21 Jul 2016 12:42:19 -0400 Subject: net: bridge: br_set_ageing_time takes a clock_t Change the ageing_time type in br_set_ageing_time() from u32 to what it is expected to be, i.e. a clock_t. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/bridge/br_private.h | 2 +- net/bridge/br_stp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index b3088264f844..aac2a6e6b008 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -975,7 +975,7 @@ void __br_set_forward_delay(struct net_bridge *br, unsigned long t); int br_set_forward_delay(struct net_bridge *br, unsigned long x); int br_set_hello_time(struct net_bridge *br, unsigned long x); int br_set_max_age(struct net_bridge *br, unsigned long x); -int br_set_ageing_time(struct net_bridge *br, u32 ageing_time); +int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time); /* br_stp_if.c */ diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 9cb7044d0801..9258b8ef14ff 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -570,7 +570,7 @@ int br_set_max_age(struct net_bridge *br, unsigned long val) * * Offloaded switch entries maybe more restrictive */ -int br_set_ageing_time(struct net_bridge *br, u32 ageing_time) +int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time) { struct switchdev_attr attr = { .orig_dev = br->dev, -- cgit v1.2.3 From a1b43eddaec5a3fea55e1581caf217abda2d3147 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 21 Jul 2016 21:28:34 +0200 Subject: net/ncsi: avoid maybe-uninitialized warning gcc-4.9 and higher warn about the newly added NSCI code: net/ncsi/ncsi-manage.c: In function 'ncsi_process_next_channel': net/ncsi/ncsi-manage.c:1003:2: error: 'old_state' may be used uninitialized in this function [-Werror=maybe-uninitialized] The warning is a false positive and therefore harmless, but it would be good to avoid it anyway. I have determined that the barrier in the spin_unlock_irqsave() is what confuses gcc to the point that it cannot track whether the variable was unused or not. This rearranges the code in a way that makes it obvious to gcc that old_state is always initialized at the time of use, functionally this should not change anything. Signed-off-by: Arnd Bergmann Acked-by: Gavin Shan Signed-off-by: David S. Miller --- net/ncsi/ncsi-manage.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index d627a39ddcd0..ef017b871857 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -982,23 +982,18 @@ int ncsi_process_next_channel(struct ncsi_dev_priv *ndp) spin_lock_irqsave(&ndp->lock, flags); nc = list_first_or_null_rcu(&ndp->channel_queue, struct ncsi_channel, link); - if (nc) { - old_state = xchg(&nc->state, NCSI_CHANNEL_INVISIBLE); - list_del_init(&nc->link); + if (!nc) { + spin_unlock_irqrestore(&ndp->lock, flags); + goto out; } + + old_state = xchg(&nc->state, NCSI_CHANNEL_INVISIBLE); + list_del_init(&nc->link); + spin_unlock_irqrestore(&ndp->lock, flags); ndp->active_channel = nc; - ndp->active_package = nc ? nc->package : NULL; - if (!nc) { - if (ndp->flags & NCSI_DEV_RESHUFFLE) { - ndp->flags &= ~NCSI_DEV_RESHUFFLE; - return ncsi_choose_active_channel(ndp); - } - - ncsi_report_link(ndp, false); - return -ENODEV; - } + ndp->active_package = nc->package; switch (old_state) { case NCSI_CHANNEL_INACTIVE: @@ -1017,6 +1012,17 @@ int ncsi_process_next_channel(struct ncsi_dev_priv *ndp) } return 0; + +out: + ndp->active_channel = NULL; + ndp->active_package = NULL; + if (ndp->flags & NCSI_DEV_RESHUFFLE) { + ndp->flags &= ~NCSI_DEV_RESHUFFLE; + return ncsi_choose_active_channel(ndp); + } + + ncsi_report_link(ndp, false); + return -ENODEV; } #if IS_ENABLED(CONFIG_IPV6) -- cgit v1.2.3 From aa7145c16d6bf086538ad7eb20c807513bfa5efc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Jul 2016 01:19:42 +0200 Subject: bpf, events: fix offset in skb copy handler This patch fixes the __output_custom() routine we currently use with bpf_skb_copy(). I missed that when len is larger than the size of the current handle, we can issue multiple invocations of copy_func, and __output_custom() advances destination but also source buffer by the written amount of bytes. When we have __output_custom(), this is actually wrong since in that case the source buffer points to a non-linear object, in our case an skb, which the copy_func helper is supposed to walk. Therefore, since this is non-linear we thus need to pass the offset into the helper, so that copy_func can use it for extracting the data from the source object. Therefore, adjust the callback signatures properly and pass offset into the skb_header_pointer() invoked from bpf_skb_copy() callback. The __DEFINE_OUTPUT_COPY_BODY() is adjusted to accommodate for two things: i) to pass in whether we should advance source buffer or not; this is a compile-time constant condition, ii) to pass in the offset for __output_custom(), which we do with help of __VA_ARGS__, so everything can stay inlined as is currently. Both changes allow for adapting the __output_* fast-path helpers w/o extra overhead. Fixes: 555c8a8623a3 ("bpf: avoid stack copy and use skb ctx for event output") Fixes: 7e3f977edd0b ("perf, events: add non-linear data support for raw records") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 0b521353008d..5708999f8a79 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2026,9 +2026,9 @@ bool bpf_helper_changes_skb_data(void *func) } static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, - unsigned long len) + unsigned long off, unsigned long len) { - void *ptr = skb_header_pointer(skb, 0, len, dst_buff); + void *ptr = skb_header_pointer(skb, off, len, dst_buff); if (unlikely(!ptr)) return len; -- cgit v1.2.3 From 9b97420228881e839b76c8a4506da3cb187bf004 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 22 Jul 2016 17:38:51 +0800 Subject: sctp: support ipv6 nonlocal bind This patch makes sctp support ipv6 nonlocal bind by adding sp->inet.freebind and net->ipv6.sysctl.ip_nonlocal_bind check in sctp_v6_available as what sctp did to support ipv4 nonlocal bind (commit cdac4e077489). Reported-by: Shijoe George Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/ipv6.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index ae6f1a2178ba..660c4a4cac31 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -560,6 +560,7 @@ static int sctp_v6_is_any(const union sctp_addr *addr) static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp) { int type; + struct net *net = sock_net(&sp->inet.sk); const struct in6_addr *in6 = (const struct in6_addr *)&addr->v6.sin6_addr; type = ipv6_addr_type(in6); @@ -574,7 +575,8 @@ static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp) if (!(type & IPV6_ADDR_UNICAST)) return 0; - return ipv6_chk_addr(sock_net(&sp->inet.sk), in6, NULL, 0); + return sp->inet.freebind || net->ipv6.sysctl.ip_nonlocal_bind || + ipv6_chk_addr(net, in6, NULL, 0); } /* This function checks if the address is a valid address to be used for -- cgit v1.2.3 From baedbe55884c003819f5c8c063ec3d2569414296 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 22 Jul 2016 14:56:20 +0300 Subject: bridge: Fix incorrect re-injection of LLDP packets Commit 8626c56c8279 ("bridge: fix potential use-after-free when hook returns QUEUE or STOLEN verdict") caused LLDP packets arriving through a bridge port to be re-injected to the Rx path with skb->dev set to the bridge device, but this breaks the lldpad daemon. The lldpad daemon opens a packet socket with protocol set to ETH_P_LLDP for any valid device on the system, which doesn't not include soft devices such as bridge and VLAN. Since packet sockets (ptype_base) are processed in the Rx path after the Rx handler, LLDP packets with skb->dev set to the bridge device never reach the lldpad daemon. Fix this by making the bridge's Rx handler re-inject LLDP packets with RX_HANDLER_PASS, which effectively restores the behaviour prior to the mentioned commit. This means netfilter will never receive LLDP packets coming through a bridge port, as I don't see a way in which we can have okfn() consume the packet without breaking existing behaviour. I've already carried out a similar fix for STP packets in commit 56fae404fb2c ("bridge: Fix incorrect re-injection of STP packets"). Fixes: 8626c56c8279 ("bridge: fix potential use-after-free when hook returns QUEUE or STOLEN verdict") Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Cc: Florian Westphal Cc: John Fastabend Signed-off-by: David S. Miller --- net/bridge/br_input.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 8b08eec763a5..8e486203d133 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -283,6 +283,14 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) case 0x01: /* IEEE MAC (Pause) */ goto drop; + case 0x0E: /* 802.1AB LLDP */ + fwd_mask |= p->br->group_fwd_mask; + if (fwd_mask & (1u << dest[5])) + goto forward; + *pskb = skb; + __br_handle_local_finish(skb); + return RX_HANDLER_PASS; + default: /* Allow selective forwarding for most other protocols */ fwd_mask |= p->br->group_fwd_mask; -- cgit v1.2.3 From fd2d180a28cb5075163945d0b229926ec9782ab0 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 22 Jul 2016 21:25:42 +0800 Subject: sctp: use inet_recvmsg to support sctp RFS well Commit 486bdee0134c ("sctp: add support for RPS and RFS") saves skb->hash into sk->sk_rxhash so that the inet_* can record it to flow table. But sctp uses sock_common_recvmsg as .recvmsg instead of inet_recvmsg, sock_common_recvmsg doesn't invoke sock_rps_record_flow to record the flow. It may cause that the receiver has no chances to record the flow if it doesn't send msg or poll the socket. So this patch fixes it by using inet_recvmsg as .recvmsg in sctp. Fixes: 486bdee0134c ("sctp: add support for RPS and RFS") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/ipv6.c | 2 +- net/sctp/protocol.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 660c4a4cac31..f473779e8b1c 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -956,7 +956,7 @@ static const struct proto_ops inet6_seqpacket_ops = { .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, - .recvmsg = sock_common_recvmsg, + .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 1adb9270e317..7b523e3f551f 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1028,7 +1028,7 @@ static const struct proto_ops inet_seqpacket_ops = { .setsockopt = sock_common_setsockopt, /* IP_SOL IP_OPTION is a problem */ .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, - .recvmsg = sock_common_recvmsg, + .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT -- cgit v1.2.3 From ea06f7176413e2538d13bb85b65387d0917943d9 Mon Sep 17 00:00:00 2001 From: Mike Manning Date: Fri, 22 Jul 2016 18:32:11 +0100 Subject: net: ipv6: Always leave anycast and multicast groups on link down Default kernel behavior is to delete IPv6 addresses on link down, which entails deletion of the multicast and the subnet-router anycast addresses. These deletions do not happen with sysctl setting to keep global IPv6 addresses on link down, so every link down/up causes an increment of the anycast and multicast refcounts. These bogus refcounts may stop these addrs from being removed on subsequent calls to delete them. The solution is to leave the groups for the multicast and subnet anycast on link down for the callflow when global IPv6 addresses are kept. Fixes: f1705ec197e7 ("net: ipv6: Make address flushing on ifdown optional") Signed-off-by: Mike Manning Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 24f1b0898e40..6287a8b9f428 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3636,6 +3636,10 @@ restart: if (state != INET6_IFADDR_STATE_DEAD) { __ipv6_ifa_notify(RTM_DELADDR, ifa); inet6addr_notifier_call_chain(NETDEV_DOWN, ifa); + } else { + if (idev->cnf.forwarding) + addrconf_leave_anycast(ifa); + addrconf_leave_solict(ifa->idev, &ifa->addr); } write_lock_bh(&idev->lock); -- cgit v1.2.3 From 0a58f474928cbace609fb563295ecb32491b1c4a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 22 Jul 2016 19:04:12 +0100 Subject: kcm: remove redundant -ve error check and return path The check for a -ve error is redundant, remove it and just immediately return the return value from the call to seq_open_net. Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- net/kcm/kcmproc.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net') diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c index fda7f4715c58..16c2e03bd388 100644 --- a/net/kcm/kcmproc.c +++ b/net/kcm/kcmproc.c @@ -88,13 +88,9 @@ struct kcm_proc_mux_state { static int kcm_seq_open(struct inode *inode, struct file *file) { struct kcm_seq_muxinfo *muxinfo = PDE_DATA(inode); - int err; - err = seq_open_net(inode, file, &muxinfo->seq_ops, + return seq_open_net(inode, file, &muxinfo->seq_ops, sizeof(struct kcm_proc_mux_state)); - if (err < 0) - return err; - return err; } static void kcm_format_mux_header(struct seq_file *seq) -- cgit v1.2.3 From eefc1b1d105ee4d2ce907833ce675f1e9599b5e3 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Sat, 23 Jul 2016 00:32:48 -0300 Subject: sctp: fix BH handling on socket backlog Now that the backlog processing is called with BH enabled, we have to disable BH before taking the socket lock via bh_lock_sock() otherwise it may dead lock: sctp_backlog_rcv() bh_lock_sock(sk); if (sock_owned_by_user(sk)) { if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) sctp_chunk_free(chunk); else backloged = 1; } else sctp_inq_push(inqueue, chunk); bh_unlock_sock(sk); while sctp_inq_push() was disabling/enabling BH, but enabling BH triggers pending softirq, which then may try to re-lock the socket in sctp_rcv(). [ 219.187215] [ 219.187217] [] _raw_spin_lock+0x20/0x30 [ 219.187223] [] sctp_rcv+0x48c/0xba0 [sctp] [ 219.187225] [] ? nf_iterate+0x62/0x80 [ 219.187226] [] ip_local_deliver_finish+0x94/0x1e0 [ 219.187228] [] ip_local_deliver+0x6f/0xf0 [ 219.187229] [] ? ip_rcv_finish+0x3b0/0x3b0 [ 219.187230] [] ip_rcv_finish+0xd8/0x3b0 [ 219.187232] [] ip_rcv+0x282/0x3a0 [ 219.187233] [] ? update_curr+0x66/0x180 [ 219.187235] [] __netif_receive_skb_core+0x524/0xa90 [ 219.187236] [] ? update_cfs_shares+0x30/0xf0 [ 219.187237] [] ? __enqueue_entity+0x6c/0x70 [ 219.187239] [] ? enqueue_entity+0x204/0xdf0 [ 219.187240] [] __netif_receive_skb+0x18/0x60 [ 219.187242] [] process_backlog+0x9e/0x140 [ 219.187243] [] net_rx_action+0x22c/0x370 [ 219.187245] [] __do_softirq+0x112/0x2e7 [ 219.187247] [] do_softirq_own_stack+0x1c/0x30 [ 219.187247] [ 219.187248] [] do_softirq.part.14+0x38/0x40 [ 219.187249] [] __local_bh_enable_ip+0x7d/0x80 [ 219.187254] [] sctp_inq_push+0x68/0x80 [sctp] [ 219.187258] [] sctp_backlog_rcv+0x151/0x1c0 [sctp] [ 219.187260] [] __release_sock+0x87/0xf0 [ 219.187261] [] release_sock+0x30/0xa0 [ 219.187265] [] sctp_accept+0x17d/0x210 [sctp] [ 219.187266] [] ? prepare_to_wait_event+0xf0/0xf0 [ 219.187268] [] inet_accept+0x3c/0x130 [ 219.187269] [] SYSC_accept4+0x103/0x210 [ 219.187271] [] ? _raw_spin_unlock_bh+0x1a/0x20 [ 219.187272] [] ? release_sock+0x8c/0xa0 [ 219.187276] [] ? sctp_inet_listen+0x62/0x1b0 [sctp] [ 219.187277] [] SyS_accept+0x10/0x20 Fixes: 860fbbc343bf ("sctp: prepare for socket backlog behavior change") Cc: Eric Dumazet Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/input.c | 2 ++ net/sctp/inqueue.c | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index 30d72f7707b6..c182db7d691f 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -321,6 +321,7 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) */ sk = rcvr->sk; + local_bh_disable(); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { @@ -332,6 +333,7 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) sctp_inq_push(inqueue, chunk); bh_unlock_sock(sk); + local_bh_enable(); /* If the chunk was backloged again, don't drop refs */ if (backloged) diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 942770675f4c..c30ddb0f3190 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -89,12 +89,10 @@ void sctp_inq_push(struct sctp_inq *q, struct sctp_chunk *chunk) * Eventually, we should clean up inqueue to not rely * on the BH related data structures. */ - local_bh_disable(); list_add_tail(&chunk->list, &q->in_chunk_list); if (chunk->asoc) chunk->asoc->stats.ipackets++; q->immediate.func(&q->immediate); - local_bh_enable(); } /* Peek at the next chunk on the inqeue. */ -- cgit v1.2.3 From 52253db924d1480bf2543afbb9551de31381aab9 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Sat, 23 Jul 2016 00:33:44 -0300 Subject: sctp: also point GSO head_skb to the sk when it's available The head skb for GSO packets won't travel through the inner depths of SCTP stack as it doesn't contain any chunks on it. That means skb->sk doesn't get set and then when sctp_recvmsg() calls sctp_inet6_skb_msgname() on the head_skb it panics, as this last needs to check flags at the socket (sp->v4mapped). The fix is to initialize skb->sk for th head skb once we are able to do it. That is, when the first chunk is processed. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/ulpevent.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index f6219b164b42..1bc4f71aaba8 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -91,6 +91,7 @@ int sctp_ulpevent_is_notification(const struct sctp_ulpevent *event) static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event, const struct sctp_association *asoc) { + struct sctp_chunk *chunk = event->chunk; struct sk_buff *skb; /* Cast away the const, as we are just wanting to @@ -101,6 +102,8 @@ static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event, event->asoc = (struct sctp_association *)asoc; atomic_add(event->rmem_len, &event->asoc->rmem_alloc); sctp_skb_set_owner_r(skb, asoc->base.sk); + if (chunk && chunk->head_skb && !chunk->head_skb->sk) + chunk->head_skb->sk = asoc->base.sk; } /* A simple destructor to give up the reference to the association. */ -- cgit v1.2.3 From d3e6952cfb7ba5f4bfa29d4803ba91f96ce1204d Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sat, 23 Jul 2016 07:43:50 +0200 Subject: net/irda: fix NULL pointer dereference on memory allocation failure I ran into this: kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 2 PID: 2012 Comm: trinity-c3 Not tainted 4.7.0-rc7+ #19 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 task: ffff8800b745f2c0 ti: ffff880111740000 task.ti: ffff880111740000 RIP: 0010:[] [] irttp_connect_request+0x36/0x710 RSP: 0018:ffff880111747bb8 EFLAGS: 00010286 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 0000000069dd8358 RDX: 0000000000000009 RSI: 0000000000000027 RDI: 0000000000000048 RBP: ffff880111747c00 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000069dd8358 R11: 1ffffffff0759723 R12: 0000000000000000 R13: ffff88011a7e4780 R14: 0000000000000027 R15: 0000000000000000 FS: 00007fc738404700(0000) GS:ffff88011af00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fc737fdfb10 CR3: 0000000118087000 CR4: 00000000000006e0 Stack: 0000000000000200 ffff880111747bd8 ffffffff810ee611 ffff880119f1f220 ffff880119f1f4f8 ffff880119f1f4f0 ffff88011a7e4780 ffff880119f1f232 ffff880119f1f220 ffff880111747d58 ffffffff82bca542 0000000000000000 Call Trace: [] irda_connect+0x562/0x1190 [] SYSC_connect+0x202/0x2a0 [] SyS_connect+0x9/0x10 [] do_syscall_64+0x19c/0x410 [] entry_SYSCALL64_slow_path+0x25/0x25 Code: 41 89 ca 48 89 e5 41 57 41 56 41 55 41 54 41 89 d7 53 48 89 fb 48 83 c7 48 48 89 fa 41 89 f6 48 c1 ea 03 48 83 ec 20 4c 8b 65 10 <0f> b6 04 02 84 c0 74 08 84 c0 0f 8e 4c 04 00 00 80 7b 48 00 74 RIP [] irttp_connect_request+0x36/0x710 RSP ---[ end trace 4cda2588bc055b30 ]--- The problem is that irda_open_tsap() can fail and leave self->tsap = NULL, and then irttp_connect_request() almost immediately dereferences it. Cc: stable@vger.kernel.org Signed-off-by: Vegard Nossum Signed-off-by: David S. Miller --- net/irda/af_irda.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 923abd6b3064..8d2f7c9b491d 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -1024,8 +1024,11 @@ static int irda_connect(struct socket *sock, struct sockaddr *uaddr, } /* Check if we have opened a local TSAP */ - if (!self->tsap) - irda_open_tsap(self, LSAP_ANY, addr->sir_name); + if (!self->tsap) { + err = irda_open_tsap(self, LSAP_ANY, addr->sir_name); + if (err) + goto out; + } /* Move to connecting socket, start sending Connect Requests */ sock->state = SS_CONNECTING; -- cgit v1.2.3 From 5fc382d87517707ad77ea4c9c12e2a3fde2c838a Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sat, 23 Jul 2016 09:42:35 +0200 Subject: net/sctp: terminate rhashtable walk correctly I was seeing a lot of these: BUG: sleeping function called from invalid context at mm/slab.h:388 in_atomic(): 0, irqs_disabled(): 0, pid: 14971, name: trinity-c2 Preemption disabled at:[] rhashtable_walk_start+0x46/0x150 [] preempt_count_add+0x1fb/0x280 [] _raw_spin_lock+0x12/0x40 [] console_unlock+0x2f7/0x930 [] vprintk_emit+0x2fb/0x520 [] vprintk_default+0x1a/0x20 [] printk+0x94/0xb0 [] print_stack_trace+0xe0/0x170 [] ___might_sleep+0x3be/0x460 [] __might_sleep+0x90/0x1a0 [] kmem_cache_alloc+0x153/0x1e0 [] rhashtable_walk_init+0xfe/0x2d0 [] sctp_transport_walk_start+0x1e/0x60 [] sctp_transport_seq_start+0x4d/0x150 [] seq_read+0x27b/0x1180 [] proc_reg_read+0xbc/0x180 [] __vfs_read+0xdb/0x610 [] vfs_read+0xea/0x2d0 [] SyS_pread64+0x11b/0x150 [] do_syscall_64+0x19c/0x410 [] return_from_SYSCALL_64+0x0/0x6a [] 0xffffffffffffffff Apparently we always need to call rhashtable_walk_stop(), even when rhashtable_walk_start() fails: * rhashtable_walk_start - Start a hash table walk * @iter: Hash table iterator * * Start a hash table walk. Note that we take the RCU lock in all * cases including when we return an error. So you must always call * rhashtable_walk_stop to clean up. otherwise we never call rcu_read_unlock() and we get the splat above. Fixes: 53fa1036 ("sctp: fix some rhashtable functions using in sctp proc/diag") See-also: 53fa1036 ("sctp: fix some rhashtable functions using in sctp proc/diag") See-also: f2dba9c6 ("rhashtable: Introduce rhashtable_walk_*") Cc: Xin Long Cc: Herbert Xu Cc: stable@vger.kernel.org Signed-off-by: Vegard Nossum Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d2681cb1dd30..8812e1bf6c1c 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4393,6 +4393,7 @@ int sctp_transport_walk_start(struct rhashtable_iter *iter) err = rhashtable_walk_start(iter); if (err && err != -EAGAIN) { + rhashtable_walk_stop(iter); rhashtable_walk_exit(iter); return err; } -- cgit v1.2.3 From ba66bbe5480a012108958a71cff88b23dce84956 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 25 Jul 2016 18:06:12 +0200 Subject: udp: use sk_filter_trim_cap for udp{,6}_queue_rcv_skb After a612769774a3 ("udp: prevent bugcheck if filter truncates packet too much"), there followed various other fixes for similar cases such as f4979fcea7fd ("rose: limit sk_filter trim to payload"). Latter introduced a new helper sk_filter_trim_cap(), where we can pass the trim limit directly to the socket filter handling. Make use of it here as well with sizeof(struct udphdr) as lower cap limit and drop the extra skb->len test in UDP's input path. Signed-off-by: Daniel Borkmann Cc: Willem de Bruijn Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/udp.c | 4 +--- net/ipv6/udp.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4aed8fc23d32..e61f7cd65d08 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1581,9 +1581,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) udp_lib_checksum_complete(skb)) goto csum_error; - if (sk_filter(sk, skb)) - goto drop; - if (unlikely(skb->len < sizeof(struct udphdr))) + if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) goto drop; udp_csum_pull_header(skb); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index ad5292be17bd..81e2f98b958d 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -618,9 +618,7 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) udp_lib_checksum_complete(skb)) goto csum_error; - if (sk_filter(sk, skb)) - goto drop; - if (unlikely(skb->len < sizeof(struct udphdr))) + if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) goto drop; udp_csum_pull_header(skb); -- cgit v1.2.3 From a85a970af265f156740977168b542234511b28a8 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 25 Jul 2016 16:09:41 -0700 Subject: net_sched: move tc_action into tcf_common struct tc_action is confusing, currently we use it for two purposes: 1) Pass in arguments and carry out results from helper functions 2) A generic representation for tc actions The first one is error-prone, since we need to make sure we don't miss anything. This patch aims to get rid of this use, by moving tc_action into tcf_common, so that they are allocated together in hashtable and can be cast'ed easily. And together with the following patch, we could really make tc_action a generic representation for all tc actions and each type of action can inherit from it. Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/act_api.c | 149 +++++++++++++++++------------------------------ net/sched/act_bpf.c | 26 +++++---- net/sched/act_connmark.c | 24 ++++---- net/sched/act_csum.c | 22 +++---- net/sched/act_gact.c | 24 ++++---- net/sched/act_ife.c | 38 ++++++------ net/sched/act_ipt.c | 48 ++++++++------- net/sched/act_mirred.c | 26 +++++---- net/sched/act_nat.c | 22 +++---- net/sched/act_pedit.c | 28 ++++----- net/sched/act_police.c | 45 +++++++------- net/sched/act_simple.c | 29 ++++----- net/sched/act_skbedit.c | 26 +++++---- net/sched/act_vlan.c | 28 ++++----- 14 files changed, 256 insertions(+), 279 deletions(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 47ec2305f920..d97419f35e7e 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -38,7 +38,7 @@ static void free_tcf(struct rcu_head *head) static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *a) { - struct tcf_common *p = a->priv; + struct tcf_common *p = (struct tcf_common *)a; spin_lock_bh(&hinfo->lock); hlist_del(&p->tcfc_head); @@ -54,7 +54,7 @@ static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *a) int __tcf_hash_release(struct tc_action *a, bool bind, bool strict) { - struct tcf_common *p = a->priv; + struct tcf_common *p = (struct tcf_common *)a; int ret = 0; if (p) { @@ -67,6 +67,7 @@ int __tcf_hash_release(struct tc_action *a, bool bind, bool strict) if (p->tcfc_bindcnt <= 0 && p->tcfc_refcnt <= 0) { if (a->ops->cleanup) a->ops->cleanup(a, bind); + list_del(&a->list); tcf_hash_destroy(a->hinfo, a); ret = ACT_P_DELETED; } @@ -77,10 +78,8 @@ int __tcf_hash_release(struct tc_action *a, bool bind, bool strict) EXPORT_SYMBOL(__tcf_hash_release); static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, - struct netlink_callback *cb, struct tc_action *a) + struct netlink_callback *cb) { - struct hlist_head *head; - struct tcf_common *p; int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; struct nlattr *nest; @@ -89,19 +88,20 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, s_i = cb->args[0]; for (i = 0; i < (hinfo->hmask + 1); i++) { + struct hlist_head *head; + struct tcf_common *p; + head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; hlist_for_each_entry_rcu(p, head, tcfc_head) { index++; if (index < s_i) continue; - a->priv = p; - a->order = n_i; - nest = nla_nest_start(skb, a->order); + nest = nla_nest_start(skb, n_i); if (nest == NULL) goto nla_put_failure; - err = tcf_action_dump_1(skb, a, 0, 0); + err = tcf_action_dump_1(skb, (struct tc_action *)p, 0, 0); if (err < 0) { index--; nlmsg_trim(skb, nest); @@ -125,27 +125,27 @@ nla_put_failure: } static int tcf_del_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, - struct tc_action *a) + const struct tc_action_ops *ops) { - struct hlist_head *head; - struct hlist_node *n; - struct tcf_common *p; struct nlattr *nest; int i = 0, n_i = 0; int ret = -EINVAL; - nest = nla_nest_start(skb, a->order); + nest = nla_nest_start(skb, 0); if (nest == NULL) goto nla_put_failure; - if (nla_put_string(skb, TCA_KIND, a->ops->kind)) + if (nla_put_string(skb, TCA_KIND, ops->kind)) goto nla_put_failure; for (i = 0; i < (hinfo->hmask + 1); i++) { + struct hlist_head *head; + struct hlist_node *n; + struct tcf_common *p; + head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; hlist_for_each_entry_safe(p, n, head, tcfc_head) { - a->priv = p; - ret = __tcf_hash_release(a, false, true); + ret = __tcf_hash_release((struct tc_action *)p, false, true); if (ret == ACT_P_DELETED) { - module_put(a->ops->owner); + module_put(p->tcfc_act.ops->owner); n_i++; } else if (ret < 0) goto nla_put_failure; @@ -163,16 +163,14 @@ nla_put_failure: int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tcf_hashinfo *hinfo = tn->hinfo; - a->hinfo = hinfo; - if (type == RTM_DELACTION) { - return tcf_del_walker(hinfo, skb, a); + return tcf_del_walker(hinfo, skb, ops); } else if (type == RTM_GETACTION) { - return tcf_dump_walker(hinfo, skb, cb, a); + return tcf_dump_walker(hinfo, skb, cb); } else { WARN(1, "tcf_generic_walker: unknown action %d\n", type); return -EINVAL; @@ -210,21 +208,20 @@ u32 tcf_hash_new_index(struct tc_action_net *tn) } EXPORT_SYMBOL(tcf_hash_new_index); -int tcf_hash_search(struct tc_action_net *tn, struct tc_action *a, u32 index) +int tcf_hash_search(struct tc_action_net *tn, struct tc_action **a, u32 index) { struct tcf_hashinfo *hinfo = tn->hinfo; struct tcf_common *p = tcf_hash_lookup(index, hinfo); if (p) { - a->priv = p; - a->hinfo = hinfo; + *a = &p->tcfc_act; return 1; } return 0; } EXPORT_SYMBOL(tcf_hash_search); -bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a, +bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action **a, int bind) { struct tcf_hashinfo *hinfo = tn->hinfo; @@ -233,8 +230,7 @@ bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a, if (bind) p->tcfc_bindcnt++; p->tcfc_refcnt++; - a->priv = p; - a->hinfo = hinfo; + *a = &p->tcfc_act; return true; } return false; @@ -243,7 +239,7 @@ EXPORT_SYMBOL(tcf_hash_check); void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est) { - struct tcf_common *pc = a->priv; + struct tcf_common *pc = (struct tcf_common *)a; if (est) gen_kill_estimator(&pc->tcfc_bstats, &pc->tcfc_rate_est); @@ -252,9 +248,10 @@ void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est) EXPORT_SYMBOL(tcf_hash_cleanup); int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est, - struct tc_action *a, int size, int bind, bool cpustats) + struct tc_action **a, const struct tc_action_ops *ops, + int bind, bool cpustats) { - struct tcf_common *p = kzalloc(size, GFP_KERNEL); + struct tcf_common *p = kzalloc(ops->size, GFP_KERNEL); struct tcf_hashinfo *hinfo = tn->hinfo; int err = -ENOMEM; @@ -294,15 +291,17 @@ err2: } } - a->priv = (void *) p; - a->hinfo = hinfo; + p->tcfc_act.hinfo = hinfo; + p->tcfc_act.ops = ops; + INIT_LIST_HEAD(&p->tcfc_act.list); + *a = &p->tcfc_act; return 0; } EXPORT_SYMBOL(tcf_hash_create); void tcf_hash_insert(struct tc_action_net *tn, struct tc_action *a) { - struct tcf_common *p = a->priv; + struct tcf_common *p = (struct tcf_common *)a; struct tcf_hashinfo *hinfo = tn->hinfo; unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask); @@ -315,10 +314,6 @@ EXPORT_SYMBOL(tcf_hash_insert); void tcf_hashinfo_destroy(const struct tc_action_ops *ops, struct tcf_hashinfo *hinfo) { - struct tc_action a = { - .ops = ops, - .hinfo = hinfo, - }; int i; for (i = 0; i < hinfo->hmask + 1; i++) { @@ -328,8 +323,7 @@ void tcf_hashinfo_destroy(const struct tc_action_ops *ops, hlist_for_each_entry_safe(p, n, &hinfo->htab[i], tcfc_head) { int ret; - a.priv = p; - ret = __tcf_hash_release(&a, false, true); + ret = __tcf_hash_release((struct tc_action *)p, false, true); if (ret == ACT_P_DELETED) module_put(ops->owner); else if (ret < 0) @@ -466,8 +460,6 @@ int tcf_action_destroy(struct list_head *actions, int bind) module_put(a->ops->owner); else if (ret < 0) return ret; - list_del(&a->list); - kfree(a); } return ret; } @@ -581,20 +573,13 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, goto err_out; } - err = -ENOMEM; - a = kzalloc(sizeof(*a), GFP_KERNEL); - if (a == NULL) - goto err_mod; - - a->ops = a_o; - INIT_LIST_HEAD(&a->list); /* backward compatibility for policer */ if (name == NULL) - err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, a, ovr, bind); + err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind); else - err = a_o->init(net, nla, est, a, ovr, bind); + err = a_o->init(net, nla, est, &a, ovr, bind); if (err < 0) - goto err_free; + goto err_mod; /* module count goes up only when brand new policy is created * if it exists and is only bound to in a_o->init() then @@ -605,8 +590,6 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, return a; -err_free: - kfree(a); err_mod: module_put(a_o->owner); err_out: @@ -647,7 +630,7 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, { int err = 0; struct gnet_dump d; - struct tcf_common *p = a->priv; + struct tcf_common *p = (struct tcf_common *)a; if (p == NULL) goto errout; @@ -740,24 +723,11 @@ act_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, return rtnl_unicast(skb, net, portid); } -static struct tc_action *create_a(int i) -{ - struct tc_action *act; - - act = kzalloc(sizeof(*act), GFP_KERNEL); - if (act == NULL) { - pr_debug("create_a: failed to alloc!\n"); - return NULL; - } - act->order = i; - INIT_LIST_HEAD(&act->list); - return act; -} - static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 portid) { struct nlattr *tb[TCA_ACT_MAX + 1]; + const struct tc_action_ops *ops; struct tc_action *a; int index; int err; @@ -772,26 +742,19 @@ static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, goto err_out; index = nla_get_u32(tb[TCA_ACT_INDEX]); - err = -ENOMEM; - a = create_a(0); - if (a == NULL) - goto err_out; - err = -EINVAL; - a->ops = tc_lookup_action(tb[TCA_ACT_KIND]); - if (a->ops == NULL) /* could happen in batch of actions */ - goto err_free; + ops = tc_lookup_action(tb[TCA_ACT_KIND]); + if (!ops) /* could happen in batch of actions */ + goto err_out; err = -ENOENT; - if (a->ops->lookup(net, a, index) == 0) + if (ops->lookup(net, &a, index) == 0) goto err_mod; - module_put(a->ops->owner); + module_put(ops->owner); return a; err_mod: - module_put(a->ops->owner); -err_free: - kfree(a); + module_put(ops->owner); err_out: return ERR_PTR(err); } @@ -816,8 +779,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, struct netlink_callback dcb; struct nlattr *nest; struct nlattr *tb[TCA_ACT_MAX + 1]; + const struct tc_action_ops *ops; struct nlattr *kind; - struct tc_action a; int err = -ENOMEM; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); @@ -834,10 +797,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, err = -EINVAL; kind = tb[TCA_ACT_KIND]; - memset(&a, 0, sizeof(struct tc_action)); - INIT_LIST_HEAD(&a.list); - a.ops = tc_lookup_action(kind); - if (a.ops == NULL) /*some idjot trying to flush unknown action */ + ops = tc_lookup_action(kind); + if (!ops) /*some idjot trying to flush unknown action */ goto err_out; nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, @@ -853,7 +814,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, if (nest == NULL) goto out_module_put; - err = a.ops->walk(net, skb, &dcb, RTM_DELACTION, &a); + err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops); if (err < 0) goto out_module_put; if (err == 0) @@ -863,7 +824,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, nlh->nlmsg_len = skb_tail_pointer(skb) - b; nlh->nlmsg_flags |= NLM_F_ROOT; - module_put(a.ops->owner); + module_put(ops->owner); err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); if (err > 0) @@ -872,7 +833,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, return err; out_module_put: - module_put(a.ops->owner); + module_put(ops->owner); err_out: noflush_out: kfree_skb(skb); @@ -1084,7 +1045,6 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; struct tc_action_ops *a_o; - struct tc_action a; int ret = 0; struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh); struct nlattr *kind = find_dump_kind(cb->nlh); @@ -1098,9 +1058,6 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) if (a_o == NULL) return 0; - memset(&a, 0, sizeof(struct tc_action)); - a.ops = a_o; - nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*t), 0); if (!nlh) @@ -1114,7 +1071,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) if (nest == NULL) goto out_module_put; - ret = a_o->walk(net, skb, cb, RTM_GETACTION, &a); + ret = a_o->walk(net, skb, cb, RTM_GETACTION, a_o); if (ret < 0) goto out_module_put; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index ef74bffa6101..bfa870731e74 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -34,11 +34,12 @@ struct tcf_bpf_cfg { }; static int bpf_net_id; +static struct tc_action_ops act_bpf_ops; static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, struct tcf_result *res) { - struct tcf_bpf *prog = act->priv; + struct tcf_bpf *prog = to_bpf(act); struct bpf_prog *filter; int action, filter_res; bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS; @@ -134,7 +135,7 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref) { unsigned char *tp = skb_tail_pointer(skb); - struct tcf_bpf *prog = act->priv; + struct tcf_bpf *prog = to_bpf(act); struct tc_act_bpf opt = { .index = prog->tcf_index, .refcnt = prog->tcf_refcnt - ref, @@ -270,7 +271,7 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, } static int tcf_bpf_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *act, + struct nlattr *est, struct tc_action **act, int replace, int bind) { struct tc_action_net *tn = net_generic(net, bpf_net_id); @@ -295,7 +296,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (!tcf_hash_check(tn, parm->index, act, bind)) { ret = tcf_hash_create(tn, parm->index, est, act, - sizeof(*prog), bind, true); + &act_bpf_ops, bind, true); if (ret < 0) return ret; @@ -305,7 +306,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (bind) return 0; - tcf_hash_release(act, bind); + tcf_hash_release(*act, bind); if (!replace) return -EEXIST; } @@ -325,7 +326,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (ret < 0) goto out; - prog = to_bpf(act); + prog = to_bpf(*act); ASSERT_RTNL(); if (res != ACT_P_CREATED) @@ -343,7 +344,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, rcu_assign_pointer(prog->filter, cfg.filter); if (res == ACT_P_CREATED) { - tcf_hash_insert(tn, act); + tcf_hash_insert(tn, *act); } else { /* make sure the program being replaced is no longer executing */ synchronize_rcu(); @@ -353,7 +354,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, return res; out: if (res == ACT_P_CREATED) - tcf_hash_cleanup(act, est); + tcf_hash_cleanup(*act, est); return ret; } @@ -362,20 +363,20 @@ static void tcf_bpf_cleanup(struct tc_action *act, int bind) { struct tcf_bpf_cfg tmp; - tcf_bpf_prog_fill_cfg(act->priv, &tmp); + tcf_bpf_prog_fill_cfg(to_bpf(act), &tmp); tcf_bpf_cfg_cleanup(&tmp); } static int tcf_bpf_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, bpf_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_bpf_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, bpf_net_id); @@ -392,6 +393,7 @@ static struct tc_action_ops act_bpf_ops __read_mostly = { .init = tcf_bpf_init, .walk = tcf_bpf_walker, .lookup = tcf_bpf_search, + .size = sizeof(struct tcf_bpf), }; static __net_init int bpf_init_net(struct net *net) diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 35a5270f289d..eae07a2e774d 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -31,6 +31,7 @@ #define CONNMARK_TAB_MASK 3 static int connmark_net_id; +static struct tc_action_ops act_connmark_ops; static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) @@ -38,7 +39,7 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, const struct nf_conntrack_tuple_hash *thash; struct nf_conntrack_tuple tuple; enum ip_conntrack_info ctinfo; - struct tcf_connmark_info *ca = a->priv; + struct tcf_connmark_info *ca = to_connmark(a); struct nf_conntrack_zone zone; struct nf_conn *c; int proto; @@ -96,7 +97,7 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { }; static int tcf_connmark_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, connmark_net_id); @@ -116,22 +117,22 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, if (!tcf_hash_check(tn, parm->index, a, bind)) { ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*ci), bind, false); + &act_connmark_ops, bind, false); if (ret) return ret; - ci = to_connmark(a); + ci = to_connmark(*a); ci->tcf_action = parm->action; ci->net = net; ci->zone = parm->zone; - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); ret = ACT_P_CREATED; } else { - ci = to_connmark(a); + ci = to_connmark(*a); if (bind) return 0; - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; /* replacing action and zone */ @@ -146,7 +147,7 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_connmark_info *ci = a->priv; + struct tcf_connmark_info *ci = to_connmark(a); struct tc_connmark opt = { .index = ci->tcf_index, @@ -173,14 +174,14 @@ nla_put_failure: static int tcf_connmark_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, connmark_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_connmark_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, connmark_net_id); @@ -196,6 +197,7 @@ static struct tc_action_ops act_connmark_ops = { .init = tcf_connmark_init, .walk = tcf_connmark_walker, .lookup = tcf_connmark_search, + .size = sizeof(struct tcf_connmark_info), }; static __net_init int connmark_init_net(struct net *net) diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index dcd9ababd351..b5dbf633a863 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -43,9 +43,10 @@ static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = { }; static int csum_net_id; +static struct tc_action_ops act_csum_ops; static int tcf_csum_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, int ovr, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, csum_net_id); @@ -67,26 +68,26 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, if (!tcf_hash_check(tn, parm->index, a, bind)) { ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*p), bind, false); + &act_csum_ops, bind, false); if (ret) return ret; ret = ACT_P_CREATED; } else { if (bind)/* dont override defaults */ return 0; - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; } - p = to_tcf_csum(a); + p = to_tcf_csum(*a); spin_lock_bh(&p->tcf_lock); p->tcf_action = parm->action; p->update_flags = parm->update_flags; spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; } @@ -496,7 +497,7 @@ fail: static int tcf_csum(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_csum *p = a->priv; + struct tcf_csum *p = to_tcf_csum(a); int action; u32 update_flags; @@ -534,7 +535,7 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_csum *p = a->priv; + struct tcf_csum *p = to_tcf_csum(a); struct tc_csum opt = { .update_flags = p->update_flags, .index = p->tcf_index, @@ -560,14 +561,14 @@ nla_put_failure: static int tcf_csum_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, csum_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_csum_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, csum_net_id); @@ -583,6 +584,7 @@ static struct tc_action_ops act_csum_ops = { .init = tcf_csum_init, .walk = tcf_csum_walker, .lookup = tcf_csum_search, + .size = sizeof(struct tcf_csum), }; static __net_init int csum_init_net(struct net *net) diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 19058a7f3e5c..e24a4093d6f6 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -26,6 +26,7 @@ #define GACT_TAB_MASK 15 static int gact_net_id; +static struct tc_action_ops act_gact_ops; #ifdef CONFIG_GACT_PROB static int gact_net_rand(struct tcf_gact *gact) @@ -56,7 +57,7 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { }; static int tcf_gact_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, gact_net_id); @@ -93,19 +94,19 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, if (!tcf_hash_check(tn, parm->index, a, bind)) { ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*gact), bind, true); + &act_gact_ops, bind, true); if (ret) return ret; ret = ACT_P_CREATED; } else { if (bind)/* dont override defaults */ return 0; - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; } - gact = to_gact(a); + gact = to_gact(*a); ASSERT_RTNL(); gact->tcf_action = parm->action; @@ -121,14 +122,14 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, } #endif if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; } static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_gact *gact = a->priv; + struct tcf_gact *gact = to_gact(a); int action = READ_ONCE(gact->tcf_action); #ifdef CONFIG_GACT_PROB @@ -151,7 +152,7 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets, u64 lastuse) { - struct tcf_gact *gact = a->priv; + struct tcf_gact *gact = to_gact(a); int action = READ_ONCE(gact->tcf_action); struct tcf_t *tm = &gact->tcf_tm; @@ -166,7 +167,7 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_gact *gact = a->priv; + struct tcf_gact *gact = to_gact(a); struct tc_gact opt = { .index = gact->tcf_index, .refcnt = gact->tcf_refcnt - ref, @@ -201,14 +202,14 @@ nla_put_failure: static int tcf_gact_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, gact_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_gact_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, gact_net_id); @@ -225,6 +226,7 @@ static struct tc_action_ops act_gact_ops = { .init = tcf_gact_init, .walk = tcf_gact_walker, .lookup = tcf_gact_search, + .size = sizeof(struct tcf_gact), }; static __net_init int gact_init_net(struct net *net) diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 845ab5119c05..141a06eeb1e5 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -37,6 +37,7 @@ static int ife_net_id; static int max_metacnt = IFE_META_MAX + 1; +static struct tc_action_ops act_ife_ops; static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = { [TCA_IFE_PARMS] = { .len = sizeof(struct tc_ife)}, @@ -364,7 +365,7 @@ out_nlmsg_trim: /* under ife->tcf_lock */ static void _tcf_ife_cleanup(struct tc_action *a, int bind) { - struct tcf_ife_info *ife = a->priv; + struct tcf_ife_info *ife = to_ife(a); struct tcf_meta_info *e, *n; list_for_each_entry_safe(e, n, &ife->metalist, metalist) { @@ -382,7 +383,7 @@ static void _tcf_ife_cleanup(struct tc_action *a, int bind) static void tcf_ife_cleanup(struct tc_action *a, int bind) { - struct tcf_ife_info *ife = a->priv; + struct tcf_ife_info *ife = to_ife(a); spin_lock_bh(&ife->tcf_lock); _tcf_ife_cleanup(a, bind); @@ -417,7 +418,7 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, } static int tcf_ife_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, ife_net_id); @@ -451,25 +452,25 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, **/ if (!tb[TCA_IFE_TYPE]) { if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); pr_info("You MUST pass etherype for encoding\n"); return -EINVAL; } } if (!exists) { - ret = tcf_hash_create(tn, parm->index, est, a, sizeof(*ife), + ret = tcf_hash_create(tn, parm->index, est, a, &act_ife_ops, bind, false); if (ret) return ret; ret = ACT_P_CREATED; } else { - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; } - ife = to_ife(a); + ife = to_ife(*a); ife->flags = parm->flags; if (parm->flags & IFE_ENCODE) { @@ -507,9 +508,9 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, if (err) { metadata_parse_err: if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (ret == ACT_P_CREATED) - _tcf_ife_cleanup(a, bind); + _tcf_ife_cleanup(*a, bind); if (exists) spin_unlock_bh(&ife->tcf_lock); @@ -529,7 +530,7 @@ metadata_parse_err: err = use_all_metadata(ife); if (err) { if (ret == ACT_P_CREATED) - _tcf_ife_cleanup(a, bind); + _tcf_ife_cleanup(*a, bind); if (exists) spin_unlock_bh(&ife->tcf_lock); @@ -541,7 +542,7 @@ metadata_parse_err: spin_unlock_bh(&ife->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; } @@ -550,7 +551,7 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_ife_info *ife = a->priv; + struct tcf_ife_info *ife = to_ife(a); struct tc_ife opt = { .index = ife->tcf_index, .refcnt = ife->tcf_refcnt - ref, @@ -623,7 +624,7 @@ struct meta_tlvhdr { static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_ife_info *ife = a->priv; + struct tcf_ife_info *ife = to_ife(a); int action = ife->tcf_action; struct ifeheadr *ifehdr = (struct ifeheadr *)skb->data; u16 ifehdrln = ifehdr->metalen; @@ -695,7 +696,7 @@ static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife) static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_ife_info *ife = a->priv; + struct tcf_ife_info *ife = to_ife(a); int action = ife->tcf_action; struct ethhdr *oethh; /* outer ether header */ struct ethhdr *iethh; /* inner eth header */ @@ -799,7 +800,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_ife_info *ife = a->priv; + struct tcf_ife_info *ife = to_ife(a); if (ife->flags & IFE_ENCODE) return tcf_ife_encode(skb, a, res); @@ -819,14 +820,14 @@ static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, static int tcf_ife_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, ife_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_ife_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, ife_net_id); @@ -843,6 +844,7 @@ static struct tc_action_ops act_ife_ops = { .init = tcf_ife_init, .walk = tcf_ife_walker, .lookup = tcf_ife_search, + .size = sizeof(struct tcf_ife_info), }; static __net_init int ife_init_net(struct net *net) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index b8c50600697a..378c1c976058 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -31,8 +31,10 @@ #define IPT_TAB_MASK 15 static int ipt_net_id; +static struct tc_action_ops act_ipt_ops; static int xt_net_id; +static struct tc_action_ops act_xt_ops; static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook) @@ -90,8 +92,8 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { }; static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, int ovr, - int bind) + struct nlattr *est, struct tc_action **a, + const struct tc_action_ops *ops, int ovr, int bind) { struct nlattr *tb[TCA_IPT_MAX + 1]; struct tcf_ipt *ipt; @@ -118,19 +120,19 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, if (tb[TCA_IPT_HOOK] == NULL || tb[TCA_IPT_TARG] == NULL) { if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -EINVAL; } td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]); if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) { if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -EINVAL; } if (!exists) { - ret = tcf_hash_create(tn, index, est, a, sizeof(*ipt), bind, + ret = tcf_hash_create(tn, index, est, a, ops, bind, false); if (ret) return ret; @@ -138,13 +140,11 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, } else { if (bind)/* dont override defaults */ return 0; - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; } - ipt = to_ipt(a); - hook = nla_get_u32(tb[TCA_IPT_HOOK]); err = -ENOMEM; @@ -163,6 +163,8 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, if (err < 0) goto err3; + ipt = to_ipt(*a); + spin_lock_bh(&ipt->tcf_lock); if (ret != ACT_P_CREATED) { ipt_destroy_target(ipt->tcfi_t); @@ -174,7 +176,7 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, ipt->tcfi_hook = hook; spin_unlock_bh(&ipt->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; err3: @@ -183,33 +185,33 @@ err2: kfree(tname); err1: if (ret == ACT_P_CREATED) - tcf_hash_cleanup(a, est); + tcf_hash_cleanup(*a, est); return err; } static int tcf_ipt_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, int ovr, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, ipt_net_id); - return __tcf_ipt_init(tn, nla, est, a, ovr, bind); + return __tcf_ipt_init(tn, nla, est, a, &act_ipt_ops, ovr, bind); } static int tcf_xt_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, int ovr, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, xt_net_id); - return __tcf_ipt_init(tn, nla, est, a, ovr, bind); + return __tcf_ipt_init(tn, nla, est, a, &act_xt_ops, ovr, bind); } static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { int ret = 0, result = 0; - struct tcf_ipt *ipt = a->priv; + struct tcf_ipt *ipt = to_ipt(a); struct xt_action_param par; if (skb_unclone(skb, GFP_ATOMIC)) @@ -259,7 +261,7 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_ipt *ipt = a->priv; + struct tcf_ipt *ipt = to_ipt(a); struct xt_entry_target *t; struct tcf_t tm; struct tc_cnt c; @@ -299,14 +301,14 @@ nla_put_failure: static int tcf_ipt_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, ipt_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_ipt_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, ipt_net_id); @@ -323,6 +325,7 @@ static struct tc_action_ops act_ipt_ops = { .init = tcf_ipt_init, .walk = tcf_ipt_walker, .lookup = tcf_ipt_search, + .size = sizeof(struct tcf_ipt), }; static __net_init int ipt_init_net(struct net *net) @@ -348,14 +351,14 @@ static struct pernet_operations ipt_net_ops = { static int tcf_xt_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, xt_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_xt_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, xt_net_id); @@ -372,6 +375,7 @@ static struct tc_action_ops act_xt_ops = { .init = tcf_xt_init, .walk = tcf_xt_walker, .lookup = tcf_xt_search, + .size = sizeof(struct tcf_ipt), }; static __net_init int xt_init_net(struct net *net) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 70cfbbf96af2..6038c85d92f5 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -52,9 +52,10 @@ static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { }; static int mirred_net_id; +static struct tc_action_ops act_mirred_ops; static int tcf_mirred_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, int ovr, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, mirred_net_id); @@ -84,14 +85,14 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, break; default: if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -EINVAL; } if (parm->ifindex) { dev = __dev_get_by_index(net, parm->ifindex); if (dev == NULL) { if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -ENODEV; } switch (dev->type) { @@ -115,16 +116,16 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, if (dev == NULL) return -EINVAL; ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*m), bind, true); + &act_mirred_ops, bind, true); if (ret) return ret; ret = ACT_P_CREATED; } else { - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; } - m = to_mirred(a); + m = to_mirred(*a); ASSERT_RTNL(); m->tcf_action = parm->action; @@ -142,7 +143,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, spin_lock_bh(&mirred_list_lock); list_add(&m->tcfm_list, &mirred_list); spin_unlock_bh(&mirred_list_lock); - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); } return ret; @@ -151,7 +152,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_mirred *m = a->priv; + struct tcf_mirred *m = to_mirred(a); struct net_device *dev; struct sk_buff *skb2; int retval, err; @@ -206,7 +207,7 @@ out: static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_mirred *m = a->priv; + struct tcf_mirred *m = to_mirred(a); struct tc_mirred opt = { .index = m->tcf_index, .action = m->tcf_action, @@ -232,14 +233,14 @@ nla_put_failure: static int tcf_mirred_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, mirred_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_mirred_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, mirred_net_id); @@ -284,6 +285,7 @@ static struct tc_action_ops act_mirred_ops = { .init = tcf_mirred_init, .walk = tcf_mirred_walker, .lookup = tcf_mirred_search, + .size = sizeof(struct tcf_mirred), }; static __net_init int mirred_init_net(struct net *net) diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 06ccb03f25da..8e8b0cc30704 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -32,13 +32,14 @@ #define NAT_TAB_MASK 15 static int nat_net_id; +static struct tc_action_ops act_nat_ops; static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { [TCA_NAT_PARMS] = { .len = sizeof(struct tc_nat) }, }; static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) + struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, nat_net_id); struct nlattr *tb[TCA_NAT_MAX + 1]; @@ -59,18 +60,18 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, if (!tcf_hash_check(tn, parm->index, a, bind)) { ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*p), bind, false); + &act_nat_ops, bind, false); if (ret) return ret; ret = ACT_P_CREATED; } else { if (bind) return 0; - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; } - p = to_tcf_nat(a); + p = to_tcf_nat(*a); spin_lock_bh(&p->tcf_lock); p->old_addr = parm->old_addr; @@ -82,7 +83,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; } @@ -90,7 +91,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_nat *p = a->priv; + struct tcf_nat *p = to_tcf_nat(a); struct iphdr *iph; __be32 old_addr; __be32 new_addr; @@ -248,7 +249,7 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_nat *p = a->priv; + struct tcf_nat *p = to_tcf_nat(a); struct tc_nat opt = { .old_addr = p->old_addr, .new_addr = p->new_addr, @@ -278,14 +279,14 @@ nla_put_failure: static int tcf_nat_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, nat_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_nat_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, nat_net_id); @@ -301,6 +302,7 @@ static struct tc_action_ops act_nat_ops = { .init = tcf_nat_init, .walk = tcf_nat_walker, .lookup = tcf_nat_search, + .size = sizeof(struct tcf_nat), }; static __net_init int nat_init_net(struct net *net) diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 82d3c1479029..b54d56d4959b 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -26,13 +26,14 @@ #define PEDIT_TAB_MASK 15 static int pedit_net_id; +static struct tc_action_ops act_pedit_ops; static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) }, }; static int tcf_pedit_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, pedit_net_id); @@ -61,23 +62,23 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (!parm->nkeys) return -EINVAL; ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*p), bind, false); + &act_pedit_ops, bind, false); if (ret) return ret; - p = to_pedit(a); + p = to_pedit(*a); keys = kmalloc(ksize, GFP_KERNEL); if (keys == NULL) { - tcf_hash_cleanup(a, est); + tcf_hash_cleanup(*a, est); return -ENOMEM; } ret = ACT_P_CREATED; } else { if (bind) return 0; - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; - p = to_pedit(a); + p = to_pedit(*a); if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) { keys = kmalloc(ksize, GFP_KERNEL); if (keys == NULL) @@ -96,13 +97,13 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, memcpy(p->tcfp_keys, parm->keys, ksize); spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; } static void tcf_pedit_cleanup(struct tc_action *a, int bind) { - struct tcf_pedit *p = a->priv; + struct tcf_pedit *p = to_pedit(a); struct tc_pedit_key *keys = p->tcfp_keys; kfree(keys); } @@ -110,7 +111,7 @@ static void tcf_pedit_cleanup(struct tc_action *a, int bind) static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_pedit *p = a->priv; + struct tcf_pedit *p = to_pedit(a); int i; unsigned int off; @@ -177,7 +178,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_pedit *p = a->priv; + struct tcf_pedit *p = to_pedit(a); struct tc_pedit *opt; struct tcf_t t; int s; @@ -216,14 +217,14 @@ nla_put_failure: static int tcf_pedit_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, pedit_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_pedit_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, pedit_net_id); @@ -240,6 +241,7 @@ static struct tc_action_ops act_pedit_ops = { .init = tcf_pedit_init, .walk = tcf_pedit_walker, .lookup = tcf_pedit_search, + .size = sizeof(struct tcf_pedit), }; static __net_init int pedit_init_net(struct net *net) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 1e8ede3955f4..123794af55c3 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -37,8 +37,8 @@ struct tcf_police { struct psched_ratecfg peak; bool peak_present; }; -#define to_police(pc) \ - container_of(pc->priv, struct tcf_police, common) + +#define to_police(pc) ((struct tcf_police *)pc) #define POL_TAB_MASK 15 @@ -56,15 +56,14 @@ struct tc_police_compat { /* Each policer is serialized by its individual spinlock */ static int police_net_id; +static struct tc_action_ops act_police_ops; static int tcf_act_police_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, police_net_id); struct tcf_hashinfo *hinfo = tn->hinfo; - struct hlist_head *head; - struct tcf_common *p; int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; struct nlattr *nest; @@ -73,21 +72,22 @@ static int tcf_act_police_walker(struct net *net, struct sk_buff *skb, s_i = cb->args[0]; for (i = 0; i < (POL_TAB_MASK + 1); i++) { + struct hlist_head *head; + struct tcf_common *p; + head = &hinfo->htab[tcf_hash(i, POL_TAB_MASK)]; hlist_for_each_entry_rcu(p, head, tcfc_head) { index++; if (index < s_i) continue; - a->priv = p; - a->order = index; - nest = nla_nest_start(skb, a->order); + nest = nla_nest_start(skb, index); if (nest == NULL) goto nla_put_failure; if (type == RTM_DELACTION) - err = tcf_action_dump_1(skb, a, 0, 1); + err = tcf_action_dump_1(skb, (struct tc_action *)p, 0, 1); else - err = tcf_action_dump_1(skb, a, 0, 0); + err = tcf_action_dump_1(skb, (struct tc_action *)p, 0, 0); if (err < 0) { index--; nla_nest_cancel(skb, nest); @@ -116,7 +116,7 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { }; static int tcf_act_police_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { int ret = 0, err; @@ -142,13 +142,7 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_POLICE_TBF]); if (parm->index) { - if (tcf_hash_search(tn, a, parm->index)) { - police = to_police(a); - if (bind) { - police->tcf_bindcnt += 1; - police->tcf_refcnt += 1; - return 0; - } + if (tcf_hash_check(tn, parm->index, a, bind)) { if (ovr) goto override; /* not replacing */ @@ -156,14 +150,14 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla, } } else { ret = tcf_hash_create(tn, parm->index, NULL, a, - sizeof(*police), bind, false); + &act_police_ops, bind, false); if (ret) return ret; ret = ACT_P_CREATED; } - police = to_police(a); override: + police = to_police(*a); if (parm->rate.rate) { err = -ENOMEM; R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE]); @@ -235,7 +229,7 @@ override: return ret; police->tcfp_t_c = ktime_get_ns(); - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; @@ -245,14 +239,14 @@ failure: qdisc_put_rtab(P_tab); qdisc_put_rtab(R_tab); if (ret == ACT_P_CREATED) - tcf_hash_cleanup(a, est); + tcf_hash_cleanup(*a, est); return err; } static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_police *police = a->priv; + struct tcf_police *police = to_police(a); s64 now; s64 toks; s64 ptoks = 0; @@ -311,7 +305,7 @@ static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_police *police = a->priv; + struct tcf_police *police = to_police(a); struct tc_police opt = { .index = police->tcf_index, .action = police->tcf_action, @@ -349,7 +343,7 @@ nla_put_failure: return -1; } -static int tcf_police_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_police_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, police_net_id); @@ -369,6 +363,7 @@ static struct tc_action_ops act_police_ops = { .init = tcf_act_police_init, .walk = tcf_act_police_walker, .lookup = tcf_police_search, + .size = sizeof(struct tcf_police), }; static __net_init int police_init_net(struct net *net) diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 318328d34d12..289af6f9bb3b 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -27,12 +27,13 @@ #define SIMP_TAB_MASK 7 static int simp_net_id; +static struct tc_action_ops act_simp_ops; #define SIMP_MAX_DATA 32 static int tcf_simp(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_defact *d = a->priv; + struct tcf_defact *d = to_defact(a); spin_lock(&d->tcf_lock); tcf_lastuse_update(&d->tcf_tm); @@ -79,7 +80,7 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { }; static int tcf_simp_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, simp_net_id); @@ -100,7 +101,6 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, if (tb[TCA_DEF_PARMS] == NULL) return -EINVAL; - parm = nla_data(tb[TCA_DEF_PARMS]); exists = tcf_hash_check(tn, parm->index, a, bind); if (exists && bind) @@ -108,7 +108,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, if (tb[TCA_DEF_DATA] == NULL) { if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -EINVAL; } @@ -116,22 +116,22 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*d), bind, false); + &act_simp_ops, bind, false); if (ret) return ret; - d = to_defact(a); + d = to_defact(*a); ret = alloc_defdata(d, defdata); if (ret < 0) { - tcf_hash_cleanup(a, est); + tcf_hash_cleanup(*a, est); return ret; } d->tcf_action = parm->action; ret = ACT_P_CREATED; } else { - d = to_defact(a); + d = to_defact(*a); - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; @@ -139,7 +139,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, } if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; } @@ -147,7 +147,7 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_defact *d = a->priv; + struct tcf_defact *d = to_defact(a); struct tc_defact opt = { .index = d->tcf_index, .refcnt = d->tcf_refcnt - ref, @@ -172,14 +172,14 @@ nla_put_failure: static int tcf_simp_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, simp_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_simp_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, simp_net_id); @@ -196,6 +196,7 @@ static struct tc_action_ops act_simp_ops = { .init = tcf_simp_init, .walk = tcf_simp_walker, .lookup = tcf_simp_search, + .size = sizeof(struct tcf_defact), }; static __net_init int simp_init_net(struct net *net) diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 8e573c0f8742..a133dcb82132 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -30,11 +30,12 @@ #define SKBEDIT_TAB_MASK 15 static int skbedit_net_id; +static struct tc_action_ops act_skbedit_ops; static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_skbedit *d = a->priv; + struct tcf_skbedit *d = to_skbedit(a); spin_lock(&d->tcf_lock); tcf_lastuse_update(&d->tcf_tm); @@ -63,7 +64,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { }; static int tcf_skbedit_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); @@ -114,21 +115,21 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, return 0; if (!flags) { - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -EINVAL; } if (!exists) { ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*d), bind, false); + &act_skbedit_ops, bind, false); if (ret) return ret; - d = to_skbedit(a); + d = to_skbedit(*a); ret = ACT_P_CREATED; } else { - d = to_skbedit(a); - tcf_hash_release(a, bind); + d = to_skbedit(*a); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; } @@ -150,7 +151,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, spin_unlock_bh(&d->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; } @@ -158,7 +159,7 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_skbedit *d = a->priv; + struct tcf_skbedit *d = to_skbedit(a); struct tc_skbedit opt = { .index = d->tcf_index, .refcnt = d->tcf_refcnt - ref, @@ -194,14 +195,14 @@ nla_put_failure: static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_skbedit_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); @@ -217,6 +218,7 @@ static struct tc_action_ops act_skbedit_ops = { .init = tcf_skbedit_init, .walk = tcf_skbedit_walker, .lookup = tcf_skbedit_search, + .size = sizeof(struct tcf_skbedit), }; static __net_init int skbedit_init_net(struct net *net) diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index db9b7ed570ba..691409de3e1a 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -22,11 +22,12 @@ #define VLAN_TAB_MASK 15 static int vlan_net_id; +static struct tc_action_ops act_vlan_ops; static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_vlan *v = a->priv; + struct tcf_vlan *v = to_vlan(a); int action; int err; @@ -67,7 +68,7 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { }; static int tcf_vlan_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, vlan_net_id); @@ -100,13 +101,13 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, case TCA_VLAN_ACT_PUSH: if (!tb[TCA_VLAN_PUSH_VLAN_ID]) { if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -EINVAL; } push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]); if (push_vid >= VLAN_VID_MASK) { if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -ERANGE; } @@ -125,25 +126,25 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, break; default: if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -EINVAL; } action = parm->v_action; if (!exists) { ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*v), bind, false); + &act_vlan_ops, bind, false); if (ret) return ret; ret = ACT_P_CREATED; } else { - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; } - v = to_vlan(a); + v = to_vlan(*a); spin_lock_bh(&v->tcf_lock); @@ -156,7 +157,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, spin_unlock_bh(&v->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; } @@ -164,7 +165,7 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_vlan *v = a->priv; + struct tcf_vlan *v = to_vlan(a); struct tc_vlan opt = { .index = v->tcf_index, .refcnt = v->tcf_refcnt - ref, @@ -195,14 +196,14 @@ nla_put_failure: static int tcf_vlan_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, vlan_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_vlan_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, vlan_net_id); @@ -218,6 +219,7 @@ static struct tc_action_ops act_vlan_ops = { .init = tcf_vlan_init, .walk = tcf_vlan_walker, .lookup = tcf_vlan_search, + .size = sizeof(struct tcf_vlan), }; static __net_init int vlan_init_net(struct net *net) -- cgit v1.2.3 From ec0595cc4495be579309b4bfd5e997af0f2ae6f9 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 25 Jul 2016 16:09:42 -0700 Subject: net_sched: get rid of struct tcf_common After the previous patch, struct tc_action should be enough to represent the generic tc action, tcf_common is not necessary any more. This patch gets rid of it to make tc action code more readable. Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/act_api.c | 139 ++++++++++++++++++++++++------------------------- net/sched/act_police.c | 10 ++-- 2 files changed, 72 insertions(+), 77 deletions(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index d97419f35e7e..e4a5f2607ffa 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -29,46 +29,43 @@ static void free_tcf(struct rcu_head *head) { - struct tcf_common *p = container_of(head, struct tcf_common, tcfc_rcu); + struct tc_action *p = container_of(head, struct tc_action, tcfa_rcu); free_percpu(p->cpu_bstats); free_percpu(p->cpu_qstats); kfree(p); } -static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *a) +static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *p) { - struct tcf_common *p = (struct tcf_common *)a; - spin_lock_bh(&hinfo->lock); - hlist_del(&p->tcfc_head); + hlist_del(&p->tcfa_head); spin_unlock_bh(&hinfo->lock); - gen_kill_estimator(&p->tcfc_bstats, - &p->tcfc_rate_est); + gen_kill_estimator(&p->tcfa_bstats, + &p->tcfa_rate_est); /* - * gen_estimator est_timer() might access p->tcfc_lock + * gen_estimator est_timer() might access p->tcfa_lock * or bstats, wait a RCU grace period before freeing p */ - call_rcu(&p->tcfc_rcu, free_tcf); + call_rcu(&p->tcfa_rcu, free_tcf); } -int __tcf_hash_release(struct tc_action *a, bool bind, bool strict) +int __tcf_hash_release(struct tc_action *p, bool bind, bool strict) { - struct tcf_common *p = (struct tcf_common *)a; int ret = 0; if (p) { if (bind) - p->tcfc_bindcnt--; - else if (strict && p->tcfc_bindcnt > 0) + p->tcfa_bindcnt--; + else if (strict && p->tcfa_bindcnt > 0) return -EPERM; - p->tcfc_refcnt--; - if (p->tcfc_bindcnt <= 0 && p->tcfc_refcnt <= 0) { - if (a->ops->cleanup) - a->ops->cleanup(a, bind); - list_del(&a->list); - tcf_hash_destroy(a->hinfo, a); + p->tcfa_refcnt--; + if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) { + if (p->ops->cleanup) + p->ops->cleanup(p, bind); + list_del(&p->list); + tcf_hash_destroy(p->hinfo, p); ret = ACT_P_DELETED; } } @@ -89,11 +86,11 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, for (i = 0; i < (hinfo->hmask + 1); i++) { struct hlist_head *head; - struct tcf_common *p; + struct tc_action *p; head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; - hlist_for_each_entry_rcu(p, head, tcfc_head) { + hlist_for_each_entry_rcu(p, head, tcfa_head) { index++; if (index < s_i) continue; @@ -101,7 +98,7 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, nest = nla_nest_start(skb, n_i); if (nest == NULL) goto nla_put_failure; - err = tcf_action_dump_1(skb, (struct tc_action *)p, 0, 0); + err = tcf_action_dump_1(skb, p, 0, 0); if (err < 0) { index--; nlmsg_trim(skb, nest); @@ -139,13 +136,13 @@ static int tcf_del_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, for (i = 0; i < (hinfo->hmask + 1); i++) { struct hlist_head *head; struct hlist_node *n; - struct tcf_common *p; + struct tc_action *p; head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; - hlist_for_each_entry_safe(p, n, head, tcfc_head) { - ret = __tcf_hash_release((struct tc_action *)p, false, true); + hlist_for_each_entry_safe(p, n, head, tcfa_head) { + ret = __tcf_hash_release(p, false, true); if (ret == ACT_P_DELETED) { - module_put(p->tcfc_act.ops->owner); + module_put(p->ops->owner); n_i++; } else if (ret < 0) goto nla_put_failure; @@ -178,15 +175,15 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, } EXPORT_SYMBOL(tcf_generic_walker); -static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo) +static struct tc_action *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo) { - struct tcf_common *p = NULL; + struct tc_action *p = NULL; struct hlist_head *head; spin_lock_bh(&hinfo->lock); head = &hinfo->htab[tcf_hash(index, hinfo->hmask)]; - hlist_for_each_entry_rcu(p, head, tcfc_head) - if (p->tcfc_index == index) + hlist_for_each_entry_rcu(p, head, tcfa_head) + if (p->tcfa_index == index) break; spin_unlock_bh(&hinfo->lock); @@ -211,10 +208,10 @@ EXPORT_SYMBOL(tcf_hash_new_index); int tcf_hash_search(struct tc_action_net *tn, struct tc_action **a, u32 index) { struct tcf_hashinfo *hinfo = tn->hinfo; - struct tcf_common *p = tcf_hash_lookup(index, hinfo); + struct tc_action *p = tcf_hash_lookup(index, hinfo); if (p) { - *a = &p->tcfc_act; + *a = p; return 1; } return 0; @@ -225,12 +222,13 @@ bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action **a, int bind) { struct tcf_hashinfo *hinfo = tn->hinfo; - struct tcf_common *p = NULL; + struct tc_action *p = NULL; + if (index && (p = tcf_hash_lookup(index, hinfo)) != NULL) { if (bind) - p->tcfc_bindcnt++; - p->tcfc_refcnt++; - *a = &p->tcfc_act; + p->tcfa_bindcnt++; + p->tcfa_refcnt++; + *a = p; return true; } return false; @@ -239,11 +237,10 @@ EXPORT_SYMBOL(tcf_hash_check); void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est) { - struct tcf_common *pc = (struct tcf_common *)a; if (est) - gen_kill_estimator(&pc->tcfc_bstats, - &pc->tcfc_rate_est); - call_rcu(&pc->tcfc_rcu, free_tcf); + gen_kill_estimator(&a->tcfa_bstats, + &a->tcfa_rate_est); + call_rcu(&a->tcfa_rcu, free_tcf); } EXPORT_SYMBOL(tcf_hash_cleanup); @@ -251,15 +248,15 @@ int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, bool cpustats) { - struct tcf_common *p = kzalloc(ops->size, GFP_KERNEL); + struct tc_action *p = kzalloc(ops->size, GFP_KERNEL); struct tcf_hashinfo *hinfo = tn->hinfo; int err = -ENOMEM; if (unlikely(!p)) return -ENOMEM; - p->tcfc_refcnt = 1; + p->tcfa_refcnt = 1; if (bind) - p->tcfc_bindcnt = 1; + p->tcfa_bindcnt = 1; if (cpustats) { p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); @@ -275,38 +272,37 @@ err2: goto err1; } } - spin_lock_init(&p->tcfc_lock); - INIT_HLIST_NODE(&p->tcfc_head); - p->tcfc_index = index ? index : tcf_hash_new_index(tn); - p->tcfc_tm.install = jiffies; - p->tcfc_tm.lastuse = jiffies; - p->tcfc_tm.firstuse = 0; + spin_lock_init(&p->tcfa_lock); + INIT_HLIST_NODE(&p->tcfa_head); + p->tcfa_index = index ? index : tcf_hash_new_index(tn); + p->tcfa_tm.install = jiffies; + p->tcfa_tm.lastuse = jiffies; + p->tcfa_tm.firstuse = 0; if (est) { - err = gen_new_estimator(&p->tcfc_bstats, p->cpu_bstats, - &p->tcfc_rate_est, - &p->tcfc_lock, NULL, est); + err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats, + &p->tcfa_rate_est, + &p->tcfa_lock, NULL, est); if (err) { free_percpu(p->cpu_qstats); goto err2; } } - p->tcfc_act.hinfo = hinfo; - p->tcfc_act.ops = ops; - INIT_LIST_HEAD(&p->tcfc_act.list); - *a = &p->tcfc_act; + p->hinfo = hinfo; + p->ops = ops; + INIT_LIST_HEAD(&p->list); + *a = p; return 0; } EXPORT_SYMBOL(tcf_hash_create); void tcf_hash_insert(struct tc_action_net *tn, struct tc_action *a) { - struct tcf_common *p = (struct tcf_common *)a; struct tcf_hashinfo *hinfo = tn->hinfo; - unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask); + unsigned int h = tcf_hash(a->tcfa_index, hinfo->hmask); spin_lock_bh(&hinfo->lock); - hlist_add_head(&p->tcfc_head, &hinfo->htab[h]); + hlist_add_head(&a->tcfa_head, &hinfo->htab[h]); spin_unlock_bh(&hinfo->lock); } EXPORT_SYMBOL(tcf_hash_insert); @@ -317,13 +313,13 @@ void tcf_hashinfo_destroy(const struct tc_action_ops *ops, int i; for (i = 0; i < hinfo->hmask + 1; i++) { - struct tcf_common *p; + struct tc_action *p; struct hlist_node *n; - hlist_for_each_entry_safe(p, n, &hinfo->htab[i], tcfc_head) { + hlist_for_each_entry_safe(p, n, &hinfo->htab[i], tcfa_head) { int ret; - ret = __tcf_hash_release((struct tc_action *)p, false, true); + ret = __tcf_hash_release(p, false, true); if (ret == ACT_P_DELETED) module_put(ops->owner); else if (ret < 0) @@ -625,12 +621,11 @@ err: return err; } -int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, +int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, int compat_mode) { int err = 0; struct gnet_dump d; - struct tcf_common *p = (struct tcf_common *)a; if (p == NULL) goto errout; @@ -639,27 +634,27 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, * to add additional backward compatibility statistic TLVs. */ if (compat_mode) { - if (a->type == TCA_OLD_COMPAT) + if (p->type == TCA_OLD_COMPAT) err = gnet_stats_start_copy_compat(skb, 0, TCA_STATS, TCA_XSTATS, - &p->tcfc_lock, &d, + &p->tcfa_lock, &d, TCA_PAD); else return 0; } else err = gnet_stats_start_copy(skb, TCA_ACT_STATS, - &p->tcfc_lock, &d, TCA_ACT_PAD); + &p->tcfa_lock, &d, TCA_ACT_PAD); if (err < 0) goto errout; - if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfc_bstats) < 0 || - gnet_stats_copy_rate_est(&d, &p->tcfc_bstats, - &p->tcfc_rate_est) < 0 || + if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 || + gnet_stats_copy_rate_est(&d, &p->tcfa_bstats, + &p->tcfa_rate_est) < 0 || gnet_stats_copy_queue(&d, p->cpu_qstats, - &p->tcfc_qstats, - p->tcfc_qstats.qlen) < 0) + &p->tcfa_qstats, + p->tcfa_qstats.qlen) < 0) goto errout; if (gnet_stats_finish_copy(&d) < 0) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 123794af55c3..b3c7e975fc9e 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -23,7 +23,7 @@ #include struct tcf_police { - struct tcf_common common; + struct tc_action common; int tcfp_result; u32 tcfp_ewma_rate; s64 tcfp_burst; @@ -73,11 +73,11 @@ static int tcf_act_police_walker(struct net *net, struct sk_buff *skb, for (i = 0; i < (POL_TAB_MASK + 1); i++) { struct hlist_head *head; - struct tcf_common *p; + struct tc_action *p; head = &hinfo->htab[tcf_hash(i, POL_TAB_MASK)]; - hlist_for_each_entry_rcu(p, head, tcfc_head) { + hlist_for_each_entry_rcu(p, head, tcfa_head) { index++; if (index < s_i) continue; @@ -85,9 +85,9 @@ static int tcf_act_police_walker(struct net *net, struct sk_buff *skb, if (nest == NULL) goto nla_put_failure; if (type == RTM_DELACTION) - err = tcf_action_dump_1(skb, (struct tc_action *)p, 0, 1); + err = tcf_action_dump_1(skb, p, 0, 1); else - err = tcf_action_dump_1(skb, (struct tc_action *)p, 0, 0); + err = tcf_action_dump_1(skb, p, 0, 0); if (err < 0) { index--; nla_nest_cancel(skb, nest); -- cgit v1.2.3 From d1c2b5010d07e967d7cbcc232a86b2308d824ca3 Mon Sep 17 00:00:00 2001 From: He Chunhui Date: Tue, 26 Jul 2016 06:16:52 +0000 Subject: net: neigh: disallow transition to NUD_STALE if lladdr is unchanged in neigh_update() NUD_STALE is used when the caller(e.g. arp_process()) can't guarantee neighbour reachability. If the entry was NUD_VALID and lladdr is unchanged, the entry state should not be changed. Currently the code puts an extra "NUD_CONNECTED" condition. So if old state was NUD_DELAY or NUD_PROBE (they are NUD_VALID but not NUD_CONNECTED), the state can be changed to NUD_STALE. This may cause problem. Because NUD_STALE lladdr doesn't guarantee reachability, when we send traffic, the state will be changed to NUD_DELAY. In normal case, if we get no confirmation (by dst_confirm()), we will change the state to NUD_PROBE and send probe traffic. But now the state may be reset to NUD_STALE again(e.g. by broadcast ARP packets), so the probe traffic will not be sent. This situation may happen again and again, and packets will be sent to an non-reachable lladdr forever. The fix is to remove the "NUD_CONNECTED" condition. After that the "NEIGH_UPDATE_F_WEAK_OVERRIDE" condition (used by IPv6) in that branch will be redundant, so remove it. This change may increase probe traffic, but it's essential since NUD_STALE lladdr is unreliable. To ensure correctness, we prefer to resolve lladdr, when we can't get confirmation, even while remote packets try to set NUD_STALE state. Signed-off-by: Chunhui He Signed-off-by: Julian Anastasov Reviewed-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/core/neighbour.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 5cdc62a8eb84..cf26e04c4046 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1060,8 +1060,6 @@ static void neigh_update_hhs(struct neighbour *neigh) NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected" lladdr instead of overriding it if it is different. - It also allows to retain current state - if lladdr is unchanged. NEIGH_UPDATE_F_ADMIN means that the change is administrative. NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing @@ -1150,10 +1148,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, } else goto out; } else { - if (lladdr == neigh->ha && new == NUD_STALE && - ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) || - (old & NUD_CONNECTED)) - ) + if (lladdr == neigh->ha && new == NUD_STALE) new = old; } } -- cgit v1.2.3 From 9ff26e9fabaf52f28fb5e875c0b9ffc2d1512039 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Tue, 26 Jul 2016 08:47:18 +0200 Subject: tipc: introduce constants for tipc address validation In this commit, we introduce defines for tipc address size, offset and mask specification for Zone.Cluster.Node. There is no functional change in this commit. Reviewed-by: Jon Maloy Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- net/tipc/addr.h | 5 +---- net/tipc/bearer.c | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/tipc/addr.h b/net/tipc/addr.h index 64f4004a6fac..bebb347803ce 100644 --- a/net/tipc/addr.h +++ b/net/tipc/addr.h @@ -43,9 +43,6 @@ #include #include "core.h" -#define TIPC_ZONE_MASK 0xff000000u -#define TIPC_CLUSTER_MASK 0xfffff000u - static inline u32 tipc_own_addr(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); @@ -60,7 +57,7 @@ static inline u32 tipc_zone_mask(u32 addr) static inline u32 tipc_cluster_mask(u32 addr) { - return addr & TIPC_CLUSTER_MASK; + return addr & TIPC_ZONE_CLUSTER_MASK; } u32 tipc_own_addr(struct net *net); diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 4131d5a86f55..65b0998a9bab 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -225,7 +225,7 @@ static int tipc_enable_bearer(struct net *net, const char *name, if (tipc_addr_domain_valid(disc_domain) && (disc_domain != tn->own_addr)) { if (tipc_in_scope(disc_domain, tn->own_addr)) { - disc_domain = tn->own_addr & TIPC_CLUSTER_MASK; + disc_domain = tn->own_addr & TIPC_ZONE_CLUSTER_MASK; res = 0; /* accept any node in own cluster */ } else if (in_own_cluster_exact(net, disc_domain)) res = 0; /* accept specified node in own cluster */ @@ -832,7 +832,7 @@ int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info) u32 prio; prio = TIPC_MEDIA_LINK_PRI; - domain = tn->own_addr & TIPC_CLUSTER_MASK; + domain = tn->own_addr & TIPC_ZONE_CLUSTER_MASK; if (!info->attrs[TIPC_NLA_BEARER]) return -EINVAL; -- cgit v1.2.3 From 7b3f52296493656015f0c0deddb6e90e36b9cda2 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Tue, 26 Jul 2016 08:47:19 +0200 Subject: tipc: make cluster size threshold for monitoring configurable In this commit, we introduce support to configure the minimum threshold to activate the new link monitoring algorithm. Reviewed-by: Jon Maloy Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- net/tipc/monitor.c | 12 ++++++++++++ net/tipc/monitor.h | 1 + net/tipc/netlink.c | 15 +++++++++++++-- net/tipc/netlink.h | 1 + net/tipc/node.c | 27 +++++++++++++++++++++++++++ net/tipc/node.h | 1 + 6 files changed, 55 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index 0d489e81fcca..3892d05b8b45 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -649,3 +649,15 @@ void tipc_mon_delete(struct net *net, int bearer_id) kfree(self); kfree(mon); } + +int tipc_nl_monitor_set_threshold(struct net *net, u32 cluster_size) +{ + struct tipc_net *tn = tipc_net(net); + + if (cluster_size > TIPC_CLUSTER_SIZE) + return -EINVAL; + + tn->mon_threshold = cluster_size; + + return 0; +} diff --git a/net/tipc/monitor.h b/net/tipc/monitor.h index 598459cbed5d..91f5dd09432b 100644 --- a/net/tipc/monitor.h +++ b/net/tipc/monitor.h @@ -69,5 +69,6 @@ void tipc_mon_get_state(struct net *net, u32 addr, int bearer_id); void tipc_mon_remove_peer(struct net *net, u32 addr, int bearer_id); +int tipc_nl_monitor_set_threshold(struct net *net, u32 cluster_size); extern const int tipc_max_domain_size; #endif diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 56935df2167a..1e43ac0200ed 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -52,7 +52,8 @@ static const struct nla_policy tipc_nl_policy[TIPC_NLA_MAX + 1] = { [TIPC_NLA_MEDIA] = { .type = NLA_NESTED, }, [TIPC_NLA_NODE] = { .type = NLA_NESTED, }, [TIPC_NLA_NET] = { .type = NLA_NESTED, }, - [TIPC_NLA_NAME_TABLE] = { .type = NLA_NESTED, } + [TIPC_NLA_NAME_TABLE] = { .type = NLA_NESTED, }, + [TIPC_NLA_MON] = { .type = NLA_NESTED, }, }; const struct nla_policy @@ -61,6 +62,11 @@ tipc_nl_name_table_policy[TIPC_NLA_NAME_TABLE_MAX + 1] = { [TIPC_NLA_NAME_TABLE_PUBL] = { .type = NLA_NESTED } }; +const struct nla_policy tipc_nl_monitor_policy[TIPC_NLA_MON_MAX + 1] = { + [TIPC_NLA_MON_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_MON_ACTIVATION_THRESHOLD] = { .type = NLA_U32 }, +}; + const struct nla_policy tipc_nl_sock_policy[TIPC_NLA_SOCK_MAX + 1] = { [TIPC_NLA_SOCK_UNSPEC] = { .type = NLA_UNSPEC }, [TIPC_NLA_SOCK_ADDR] = { .type = NLA_U32 }, @@ -214,7 +220,12 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .cmd = TIPC_NL_NAME_TABLE_GET, .dumpit = tipc_nl_name_table_dump, .policy = tipc_nl_policy, - } + }, + { + .cmd = TIPC_NL_MON_SET, + .doit = tipc_nl_node_set_monitor, + .policy = tipc_nl_policy, + }, }; int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr) diff --git a/net/tipc/netlink.h b/net/tipc/netlink.h index ed1dbcb4afbd..4ba0ad422110 100644 --- a/net/tipc/netlink.h +++ b/net/tipc/netlink.h @@ -55,6 +55,7 @@ extern const struct nla_policy tipc_nl_prop_policy[]; extern const struct nla_policy tipc_nl_bearer_policy[]; extern const struct nla_policy tipc_nl_media_policy[]; extern const struct nla_policy tipc_nl_udp_policy[]; +extern const struct nla_policy tipc_nl_monitor_policy[]; int tipc_netlink_start(void); int tipc_netlink_compat_start(void); diff --git a/net/tipc/node.c b/net/tipc/node.c index 95cc78b51532..0fc531d0f709 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1928,3 +1928,30 @@ out: return skb->len; } + +int tipc_nl_node_set_monitor(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *attrs[TIPC_NLA_MON_MAX + 1]; + struct net *net = sock_net(skb->sk); + int err; + + if (!info->attrs[TIPC_NLA_MON]) + return -EINVAL; + + err = nla_parse_nested(attrs, TIPC_NLA_MON_MAX, + info->attrs[TIPC_NLA_MON], + tipc_nl_monitor_policy); + if (err) + return err; + + if (attrs[TIPC_NLA_MON_ACTIVATION_THRESHOLD]) { + u32 val; + + val = nla_get_u32(attrs[TIPC_NLA_MON_ACTIVATION_THRESHOLD]); + err = tipc_nl_monitor_set_threshold(net, val); + if (err) + return err; + } + + return 0; +} diff --git a/net/tipc/node.h b/net/tipc/node.h index 8264b3d97dc4..65aa12ede8a5 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -78,4 +78,5 @@ int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info); int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info); int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_node_set_monitor(struct sk_buff *skb, struct genl_info *info); #endif -- cgit v1.2.3 From bf1035b2ff5296c7c49e262152253ce29d87e82d Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Tue, 26 Jul 2016 08:47:20 +0200 Subject: tipc: get monitor threshold for the cluster In this commit, we add support to fetch the configured cluster monitoring threshold. Reviewed-by: Jon Maloy Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- net/tipc/monitor.c | 7 +++++++ net/tipc/monitor.h | 2 ++ net/tipc/netlink.c | 5 +++++ net/tipc/node.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ net/tipc/node.h | 1 + 5 files changed, 67 insertions(+) (limited to 'net') diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index 3892d05b8b45..3579126e2ac8 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -661,3 +661,10 @@ int tipc_nl_monitor_set_threshold(struct net *net, u32 cluster_size) return 0; } + +int tipc_nl_monitor_get_threshold(struct net *net) +{ + struct tipc_net *tn = tipc_net(net); + + return tn->mon_threshold; +} diff --git a/net/tipc/monitor.h b/net/tipc/monitor.h index 91f5dd09432b..aedf62c60bd3 100644 --- a/net/tipc/monitor.h +++ b/net/tipc/monitor.h @@ -70,5 +70,7 @@ void tipc_mon_get_state(struct net *net, u32 addr, void tipc_mon_remove_peer(struct net *net, u32 addr, int bearer_id); int tipc_nl_monitor_set_threshold(struct net *net, u32 cluster_size); +int tipc_nl_monitor_get_threshold(struct net *net); + extern const int tipc_max_domain_size; #endif diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 1e43ac0200ed..2cfc5f7c6380 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -226,6 +226,11 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .doit = tipc_nl_node_set_monitor, .policy = tipc_nl_policy, }, + { + .cmd = TIPC_NL_MON_GET, + .doit = tipc_nl_node_get_monitor, + .policy = tipc_nl_policy, + }, }; int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr) diff --git a/net/tipc/node.c b/net/tipc/node.c index 0fc531d0f709..2a7e74753f9f 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1955,3 +1955,55 @@ int tipc_nl_node_set_monitor(struct sk_buff *skb, struct genl_info *info) return 0; } + +static int __tipc_nl_add_monitor_prop(struct net *net, struct tipc_nl_msg *msg) +{ + struct nlattr *attrs; + void *hdr; + u32 val; + + hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, + 0, TIPC_NL_MON_GET); + if (!hdr) + return -EMSGSIZE; + + attrs = nla_nest_start(msg->skb, TIPC_NLA_MON); + if (!attrs) + goto msg_full; + + val = tipc_nl_monitor_get_threshold(net); + + if (nla_put_u32(msg->skb, TIPC_NLA_MON_ACTIVATION_THRESHOLD, val)) + goto attr_msg_full; + + nla_nest_end(msg->skb, attrs); + genlmsg_end(msg->skb, hdr); + + return 0; + +attr_msg_full: + nla_nest_cancel(msg->skb, attrs); +msg_full: + genlmsg_cancel(msg->skb, hdr); + + return -EMSGSIZE; +} + +int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = sock_net(skb->sk); + struct tipc_nl_msg msg; + int err; + + msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + msg.portid = info->snd_portid; + msg.seq = info->snd_seq; + + err = __tipc_nl_add_monitor_prop(net, &msg); + if (err) { + nlmsg_free(msg.skb); + return err; + } + + return genlmsg_reply(msg.skb, info); +} diff --git a/net/tipc/node.h b/net/tipc/node.h index 65aa12ede8a5..216f053b817f 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -79,4 +79,5 @@ int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info); int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info); int tipc_nl_node_set_monitor(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info); #endif -- cgit v1.2.3 From ff0d3e78a67a8edd09688f073361de9ed8abf9dc Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Tue, 26 Jul 2016 08:47:21 +0200 Subject: tipc: add a function to get the bearer name Introduce a new function to get the bearer name from its id. This is used in subsequent commit. Reviewed-by: Jon Maloy Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- net/tipc/bearer.c | 21 +++++++++++++++++++++ net/tipc/bearer.h | 1 + 2 files changed, 22 insertions(+) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 65b0998a9bab..65b1bbf133bd 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -171,6 +171,27 @@ struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name) return NULL; } +/* tipc_bearer_get_name - get the bearer name from its id. + * @net: network namespace + * @name: a pointer to the buffer where the name will be stored. + * @bearer_id: the id to get the name from. + */ +int tipc_bearer_get_name(struct net *net, char *name, u32 bearer_id) +{ + struct tipc_net *tn = tipc_net(net); + struct tipc_bearer *b; + + if (bearer_id >= MAX_BEARERS) + return -EINVAL; + + b = rtnl_dereference(tn->bearer_list[bearer_id]); + if (!b) + return -EINVAL; + + strcpy(name, b->name); + return 0; +} + void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest) { struct tipc_net *tn = net_generic(net, tipc_net_id); diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index f1e6db5e6345..43757f1f9cb3 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -197,6 +197,7 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *buf, void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest); void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest); struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name); +int tipc_bearer_get_name(struct net *net, char *name, u32 bearer_id); struct tipc_media *tipc_media_find(const char *name); void tipc_bearer_reset_all(struct net *net); int tipc_bearer_setup(void); -- cgit v1.2.3 From cf6f7e1d51090772d5ff7355aaf0fcff17f20d1a Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Tue, 26 Jul 2016 08:47:22 +0200 Subject: tipc: dump monitor attributes In this commit, we dump the monitor attributes when queried. The link monitor attributes are separated into two kinds: 1. general attributes per bearer 2. specific attributes per node/peer This style resembles the socket attributes and the nametable publications per socket. Reviewed-by: Jon Maloy Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- net/tipc/monitor.c | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++++ net/tipc/monitor.h | 6 +++ net/tipc/netlink.c | 7 +++ net/tipc/node.c | 86 ++++++++++++++++++++++++++++++++++ net/tipc/node.h | 3 ++ 5 files changed, 235 insertions(+) (limited to 'net') diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index 3579126e2ac8..be70a57c1ff9 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -33,9 +33,11 @@ * POSSIBILITY OF SUCH DAMAGE. */ +#include #include "core.h" #include "addr.h" #include "monitor.h" +#include "bearer.h" #define MAX_MON_DOMAIN 64 #define MON_TIMEOUT 120000 @@ -668,3 +670,134 @@ int tipc_nl_monitor_get_threshold(struct net *net) return tn->mon_threshold; } + +int __tipc_nl_add_monitor_peer(struct tipc_peer *peer, struct tipc_nl_msg *msg) +{ + struct tipc_mon_domain *dom = peer->domain; + struct nlattr *attrs; + void *hdr; + + hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, + NLM_F_MULTI, TIPC_NL_MON_PEER_GET); + if (!hdr) + return -EMSGSIZE; + + attrs = nla_nest_start(msg->skb, TIPC_NLA_MON_PEER); + if (!attrs) + goto msg_full; + + if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEER_ADDR, peer->addr)) + goto attr_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEER_APPLIED, peer->applied)) + goto attr_msg_full; + + if (peer->is_up) + if (nla_put_flag(msg->skb, TIPC_NLA_MON_PEER_UP)) + goto attr_msg_full; + if (peer->is_local) + if (nla_put_flag(msg->skb, TIPC_NLA_MON_PEER_LOCAL)) + goto attr_msg_full; + if (peer->is_head) + if (nla_put_flag(msg->skb, TIPC_NLA_MON_PEER_HEAD)) + goto attr_msg_full; + + if (dom) { + if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEER_DOMGEN, dom->gen)) + goto attr_msg_full; + if (nla_put_u64_64bit(msg->skb, TIPC_NLA_MON_PEER_UPMAP, + dom->up_map, TIPC_NLA_MON_PEER_PAD)) + goto attr_msg_full; + if (nla_put(msg->skb, TIPC_NLA_MON_PEER_MEMBERS, + dom->member_cnt * sizeof(u32), &dom->members)) + goto attr_msg_full; + } + + nla_nest_end(msg->skb, attrs); + genlmsg_end(msg->skb, hdr); + return 0; + +attr_msg_full: + nla_nest_cancel(msg->skb, attrs); +msg_full: + genlmsg_cancel(msg->skb, hdr); + + return -EMSGSIZE; +} + +int tipc_nl_add_monitor_peer(struct net *net, struct tipc_nl_msg *msg, + u32 bearer_id, u32 *prev_node) +{ + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + struct tipc_peer *peer = mon->self; + + if (!mon) + return -EINVAL; + + read_lock_bh(&mon->lock); + do { + if (*prev_node) { + if (peer->addr == *prev_node) + *prev_node = 0; + else + continue; + } + if (__tipc_nl_add_monitor_peer(peer, msg)) { + *prev_node = peer->addr; + read_unlock_bh(&mon->lock); + return -EMSGSIZE; + } + } while ((peer = peer_nxt(peer)) != mon->self); + read_unlock_bh(&mon->lock); + + return 0; +} + +int __tipc_nl_add_monitor(struct net *net, struct tipc_nl_msg *msg, + u32 bearer_id) +{ + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + char bearer_name[TIPC_MAX_BEARER_NAME]; + struct nlattr *attrs; + void *hdr; + int ret; + + ret = tipc_bearer_get_name(net, bearer_name, bearer_id); + if (ret || !mon) + return -EINVAL; + + hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, + NLM_F_MULTI, TIPC_NL_MON_GET); + if (!hdr) + return -EMSGSIZE; + + attrs = nla_nest_start(msg->skb, TIPC_NLA_MON); + if (!attrs) + goto msg_full; + + read_lock_bh(&mon->lock); + if (nla_put_u32(msg->skb, TIPC_NLA_MON_REF, bearer_id)) + goto attr_msg_full; + if (tipc_mon_is_active(net, mon)) + if (nla_put_flag(msg->skb, TIPC_NLA_MON_ACTIVE)) + goto attr_msg_full; + if (nla_put_string(msg->skb, TIPC_NLA_MON_BEARER_NAME, bearer_name)) + goto attr_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEERCNT, mon->peer_cnt)) + goto attr_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_MON_LISTGEN, mon->list_gen)) + goto attr_msg_full; + + read_unlock_bh(&mon->lock); + nla_nest_end(msg->skb, attrs); + genlmsg_end(msg->skb, hdr); + + return 0; + +attr_msg_full: + nla_nest_cancel(msg->skb, attrs); +msg_full: + genlmsg_cancel(msg->skb, hdr); + read_unlock_bh(&mon->lock); + + return -EMSGSIZE; +} diff --git a/net/tipc/monitor.h b/net/tipc/monitor.h index aedf62c60bd3..2a21b93e0d04 100644 --- a/net/tipc/monitor.h +++ b/net/tipc/monitor.h @@ -36,6 +36,8 @@ #ifndef _TIPC_MONITOR_H #define _TIPC_MONITOR_H +#include "netlink.h" + /* struct tipc_mon_state: link instance's cache of monitor list and domain state * @list_gen: current generation of this node's monitor list * @gen: current generation of this node's local domain @@ -71,6 +73,10 @@ void tipc_mon_remove_peer(struct net *net, u32 addr, int bearer_id); int tipc_nl_monitor_set_threshold(struct net *net, u32 cluster_size); int tipc_nl_monitor_get_threshold(struct net *net); +int __tipc_nl_add_monitor(struct net *net, struct tipc_nl_msg *msg, + u32 bearer_id); +int tipc_nl_add_monitor_peer(struct net *net, struct tipc_nl_msg *msg, + u32 bearer_id, u32 *prev_node); extern const int tipc_max_domain_size; #endif diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 2cfc5f7c6380..a84daec0afe9 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -64,6 +64,7 @@ tipc_nl_name_table_policy[TIPC_NLA_NAME_TABLE_MAX + 1] = { const struct nla_policy tipc_nl_monitor_policy[TIPC_NLA_MON_MAX + 1] = { [TIPC_NLA_MON_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_MON_REF] = { .type = NLA_U32 }, [TIPC_NLA_MON_ACTIVATION_THRESHOLD] = { .type = NLA_U32 }, }; @@ -229,6 +230,12 @@ static const struct genl_ops tipc_genl_v2_ops[] = { { .cmd = TIPC_NL_MON_GET, .doit = tipc_nl_node_get_monitor, + .dumpit = tipc_nl_node_dump_monitor, + .policy = tipc_nl_policy, + }, + { + .cmd = TIPC_NL_MON_PEER_GET, + .dumpit = tipc_nl_node_dump_monitor_peer, .policy = tipc_nl_policy, }, }; diff --git a/net/tipc/node.c b/net/tipc/node.c index 2a7e74753f9f..21974191e425 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -2007,3 +2007,89 @@ int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info) return genlmsg_reply(msg.skb, info); } + +int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + u32 prev_bearer = cb->args[0]; + struct tipc_nl_msg msg; + int err; + int i; + + if (prev_bearer == MAX_BEARERS) + return 0; + + msg.skb = skb; + msg.portid = NETLINK_CB(cb->skb).portid; + msg.seq = cb->nlh->nlmsg_seq; + + rtnl_lock(); + for (i = prev_bearer; i < MAX_BEARERS; i++) { + prev_bearer = i; + err = __tipc_nl_add_monitor(net, &msg, prev_bearer); + if (err) + goto out; + } + +out: + rtnl_unlock(); + cb->args[0] = prev_bearer; + + return skb->len; +} + +int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + u32 prev_node = cb->args[1]; + u32 bearer_id = cb->args[2]; + int done = cb->args[0]; + struct tipc_nl_msg msg; + int err; + + if (!prev_node) { + struct nlattr **attrs; + struct nlattr *mon[TIPC_NLA_MON_MAX + 1]; + + err = tipc_nlmsg_parse(cb->nlh, &attrs); + if (err) + return err; + + if (!attrs[TIPC_NLA_MON]) + return -EINVAL; + + err = nla_parse_nested(mon, TIPC_NLA_MON_MAX, + attrs[TIPC_NLA_MON], + tipc_nl_monitor_policy); + if (err) + return err; + + if (!mon[TIPC_NLA_MON_REF]) + return -EINVAL; + + bearer_id = nla_get_u32(mon[TIPC_NLA_MON_REF]); + + if (bearer_id >= MAX_BEARERS) + return -EINVAL; + } + + if (done) + return 0; + + msg.skb = skb; + msg.portid = NETLINK_CB(cb->skb).portid; + msg.seq = cb->nlh->nlmsg_seq; + + rtnl_lock(); + err = tipc_nl_add_monitor_peer(net, &msg, bearer_id, &prev_node); + if (!err) + done = 1; + + rtnl_unlock(); + cb->args[0] = done; + cb->args[1] = prev_node; + cb->args[2] = bearer_id; + + return skb->len; +} diff --git a/net/tipc/node.h b/net/tipc/node.h index 216f053b817f..d69fdfcc0ec9 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -80,4 +80,7 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info); int tipc_nl_node_set_monitor(struct sk_buff *skb, struct genl_info *info); int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb); +int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, + struct netlink_callback *cb); #endif -- cgit v1.2.3 From 90b5ca1766ae7806a711d66df056af1290faa2c0 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 26 Jul 2016 18:54:52 +0200 Subject: net: ipmr/ip6mr: update lastuse on entry change Currently lastuse is updated on entry creation and cache hit, but it should also be updated on entry change. Since both on add and update the ttl array is updated we can simply update the lastuse in ipmr_update_thresholds. Signed-off-by: Nikolay Aleksandrov CC: Roopa Prabhu CC: Donald Sharp CC: David S. Miller Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 2 +- net/ipv6/ip6mr.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index eec234161b89..26253328d227 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -722,6 +722,7 @@ static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache cache->mfc_un.res.maxvif = vifi + 1; } } + cache->mfc_un.res.lastuse = jiffies; } static int vif_add(struct net *net, struct mr_table *mrt, @@ -1150,7 +1151,6 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, c->mfc_origin = mfc->mfcc_origin.s_addr; c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr; c->mfc_parent = mfc->mfcc_parent; - c->mfc_un.res.lastuse = jiffies; ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls); if (!mrtsock) c->mfc_flags |= MFC_STATIC; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 7adce139d92a..6122f9c5cc49 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -921,6 +921,7 @@ static void ip6mr_update_thresholds(struct mr6_table *mrt, struct mfc6_cache *ca cache->mfc_un.res.maxvif = vifi + 1; } } + cache->mfc_un.res.lastuse = jiffies; } static int mif6_add(struct net *net, struct mr6_table *mrt, @@ -1500,7 +1501,6 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, c->mf6c_origin = mfc->mf6cc_origin.sin6_addr; c->mf6c_mcastgrp = mfc->mf6cc_mcastgrp.sin6_addr; c->mf6c_parent = mfc->mf6cc_parent; - c->mfc_un.res.lastuse = jiffies; ip6mr_update_thresholds(mrt, c, ttls); if (!mrtsock) c->mfc_flags |= MFC_STATIC; -- cgit v1.2.3 From 4ac36a4adaf80013a60013d6f829f5863d5d0e05 Mon Sep 17 00:00:00 2001 From: "phil.turnbull@oracle.com" Date: Tue, 26 Jul 2016 15:14:35 -0400 Subject: l2tp: Correctly return -EBADF from pppol2tp_getname. If 'tunnel' is NULL we should return -EBADF but the 'end_put_sess' path unconditionally sets 'error' back to zero. Rework the error path so it more closely matches pppol2tp_sendmsg. Fixes: fd558d186df2 ("l2tp: Split pppol2tp patch into separate l2tp and ppp parts") Signed-off-by: Phil Turnbull Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 652c250b9a3b..d9560aa2dba3 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -866,10 +866,8 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, pls = l2tp_session_priv(session); tunnel = l2tp_sock_to_tunnel(pls->tunnel_sock); - if (tunnel == NULL) { - error = -EBADF; + if (tunnel == NULL) goto end_put_sess; - } inet = inet_sk(tunnel->sock); if ((tunnel->version == 2) && (tunnel->sock->sk_family == AF_INET)) { @@ -947,12 +945,11 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, } *usockaddr_len = len; + error = 0; sock_put(pls->tunnel_sock); end_put_sess: sock_put(sk); - error = 0; - end: return error; } -- cgit v1.2.3