Diffstat (limited to 'net')
438 files changed, 14821 insertions, 6173 deletions
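Many of the hunks below convert protocol setsockopt handlers from "char __user *optval" to sockptr_t (net/atm, net/ax25, and others). For orientation, here is a minimal sketch of the conversion pattern, assuming the kernel's sockptr helpers introduced in this series; the handler name and the option handling are illustrative only, not taken from any of the patches:

#include <linux/errno.h>
#include <linux/net.h>
#include <linux/sockptr.h>

static int example_setsockopt(struct socket *sock, int level, int optname,
			      sockptr_t optval, unsigned int optlen)
{
	unsigned int opt;

	if (optlen < sizeof(opt))
		return -EINVAL;

	/* copy_from_sockptr() works for both kernel and user pointers,
	 * replacing the copy_from_user()/get_user() calls seen in the
	 * "-" lines of the hunks below.
	 */
	if (copy_from_sockptr(&opt, optval, sizeof(opt)))
		return -EFAULT;

	return 0;
}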
diff --git a/net/9p/client.c b/net/9p/client.c index fc1f3635e5dd..09f1ec589b80 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -811,7 +811,7 @@ reterr: * @uodata: source for zero copy write * @inlen: read buffer size * @olen: write buffer size - * @hdrlen: reader header size, This is the size of response protocol data + * @in_hdrlen: reader header size, This is the size of response protocol data * @fmt: protocol format string (see protocol.c) * * Returns request structure (which client must free using p9_tag_remove) diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index b21c3c209815..2885ff9c76f0 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -94,14 +94,15 @@ struct p9_trans_rdma { struct completion cm_done; }; +struct p9_rdma_req; + /** - * p9_rdma_context - Keeps track of in-process WR + * struct p9_rdma_context - Keeps track of in-process WR * * @busa: Bus address to unmap when the WR completes * @req: Keeps track of requests (send) * @rc: Keepts track of replies (receive) */ -struct p9_rdma_req; struct p9_rdma_context { struct ib_cqe cqe; dma_addr_t busa; @@ -112,7 +113,7 @@ struct p9_rdma_context { }; /** - * p9_rdma_opts - Collection of mount options + * struct p9_rdma_opts - Collection of mount options * @port: port of connection * @sq_depth: The requested depth of the SQ. This really doesn't need * to be any deeper than the number of threads used in the client diff --git a/net/Kconfig b/net/Kconfig index d1672280d6a4..3831206977a1 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -455,7 +455,6 @@ config FAILOVER config ETHTOOL_NETLINK bool "Netlink interface for ethtool" default y - depends on PHYLIB=y || PHYLIB=n help An alternative userspace interface for ethtool based on generic netlink. It provides better extensibility and some new features, diff --git a/net/appletalk/atalk_proc.c b/net/appletalk/atalk_proc.c index 550c6ca007cc..9c1241292d1d 100644 --- a/net/appletalk/atalk_proc.c +++ b/net/appletalk/atalk_proc.c @@ -229,6 +229,8 @@ int __init atalk_proc_init(void) sizeof(struct aarp_iter_state), NULL)) goto out; + return 0; + out: remove_proc_subtree("atalk", init_net.proc_net); return -ENOMEM; diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 15787e8c0629..1d48708c5a2e 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1917,8 +1917,6 @@ static const struct proto_ops atalk_dgram_ops = { #endif .listen = sock_no_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .sendmsg = atalk_sendmsg, .recvmsg = atalk_recvmsg, .mmap = sock_no_mmap, diff --git a/net/atm/common.c b/net/atm/common.c index 8575f5d52087..84367b844b14 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -745,7 +745,7 @@ static int check_qos(const struct atm_qos *qos) } int vcc_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct atm_vcc *vcc; unsigned long value; @@ -760,7 +760,7 @@ int vcc_setsockopt(struct socket *sock, int level, int optname, { struct atm_qos qos; - if (copy_from_user(&qos, optval, sizeof(qos))) + if (copy_from_sockptr(&qos, optval, sizeof(qos))) return -EFAULT; error = check_qos(&qos); if (error) @@ -774,7 +774,7 @@ int vcc_setsockopt(struct socket *sock, int level, int optname, return 0; } case SO_SETCLP: - if (get_user(value, (unsigned long __user *)optval)) + if (copy_from_sockptr(&value, optval, sizeof(value))) return -EFAULT; if (value) vcc->atm_options |= ATM_ATMOPT_CLP; @@ -782,13 
+782,8 @@ int vcc_setsockopt(struct socket *sock, int level, int optname, vcc->atm_options &= ~ATM_ATMOPT_CLP; return 0; default: - if (level == SOL_SOCKET) - return -EINVAL; - break; - } - if (!vcc->dev || !vcc->dev->ops->setsockopt) return -EINVAL; - return vcc->dev->ops->setsockopt(vcc, level, optname, optval, optlen); + } } int vcc_getsockopt(struct socket *sock, int level, int optname, @@ -826,13 +821,8 @@ int vcc_getsockopt(struct socket *sock, int level, int optname, return copy_to_user(optval, &pvc, sizeof(pvc)) ? -EFAULT : 0; } default: - if (level == SOL_SOCKET) - return -EINVAL; - break; - } - if (!vcc->dev || !vcc->dev->ops->getsockopt) return -EINVAL; - return vcc->dev->ops->getsockopt(vcc, level, optname, optval, len); + } } int register_atmdevice_notifier(struct notifier_block *nb) diff --git a/net/atm/common.h b/net/atm/common.h index 5850649068bb..a1e56e8de698 100644 --- a/net/atm/common.h +++ b/net/atm/common.h @@ -21,7 +21,7 @@ __poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait); int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int vcc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int vcc_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen); + sockptr_t optval, unsigned int optlen); int vcc_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen); void vcc_process_recv_queue(struct atm_vcc *vcc); diff --git a/net/atm/lec_arpc.h b/net/atm/lec_arpc.h index 1205d8792d28..39115fe074c4 100644 --- a/net/atm/lec_arpc.h +++ b/net/atm/lec_arpc.h @@ -44,7 +44,7 @@ struct lec_arp_table { u8 *tlvs; u32 sizeoftlvs; /* * LANE2: Each MAC address can have TLVs - * associated with it. sizeoftlvs tells the + * associated with it. 
sizeoftlvs tells * the length of the tlvs array */ struct sk_buff_head tx_wait; /* wait queue for outgoing packets */ diff --git a/net/atm/pvc.c b/net/atm/pvc.c index 02bd2a436bdf..53e7d3f39e26 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -63,7 +63,7 @@ static int pvc_connect(struct socket *sock, struct sockaddr *sockaddr, } static int pvc_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int error; diff --git a/net/atm/svc.c b/net/atm/svc.c index ba144d035e3d..4a02bcaad279 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -451,7 +451,7 @@ int svc_change_qos(struct atm_vcc *vcc, struct atm_qos *qos) } static int svc_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct atm_vcc *vcc = ATM_SD(sock); @@ -464,7 +464,7 @@ static int svc_setsockopt(struct socket *sock, int level, int optname, error = -EINVAL; goto out; } - if (copy_from_user(&vcc->sap, optval, optlen)) { + if (copy_from_sockptr(&vcc->sap, optval, optlen)) { error = -EFAULT; goto out; } @@ -475,7 +475,7 @@ static int svc_setsockopt(struct socket *sock, int level, int optname, error = -EINVAL; goto out; } - if (get_user(value, (int __user *)optval)) { + if (copy_from_sockptr(&value, optval, sizeof(int))) { error = -EFAULT; goto out; } diff --git a/net/ax25/Kconfig b/net/ax25/Kconfig index 97d686d115c0..d3a9843a043d 100644 --- a/net/ax25/Kconfig +++ b/net/ax25/Kconfig @@ -8,7 +8,7 @@ menuconfig HAMRADIO bool "Amateur Radio support" help If you want to connect your Linux box to an amateur radio, answer Y - here. You want to read <http://www.tapr.org/> + here. You want to read <https://www.tapr.org/> and more specifically about AX.25 on Linux <http://www.linux-ax25.org/>. @@ -39,11 +39,11 @@ config AX25 Information about where to get supporting software for Linux amateur radio as well as information about how to configure an AX.25 port is contained in the AX25-HOWTO, available from - <http://www.tldp.org/docs.html#howto>. You might also want to + <https://www.tldp.org/docs.html#howto>. You might also want to check out the file <file:Documentation/networking/ax25.rst> in the kernel source. More information about digital amateur radio in general is on the WWW at - <http://www.tapr.org/>. + <https://www.tapr.org/>. To compile this driver as a module, choose M here: the module will be called ax25. @@ -90,7 +90,7 @@ config NETROM <http://www.linux-ax25.org>. You also might want to check out the file <file:Documentation/networking/ax25.rst>. More information about digital amateur radio in general is on the WWW at - <http://www.tapr.org/>. + <https://www.tapr.org/>. To compile this driver as a module, choose M here: the module will be called netrom. @@ -109,7 +109,7 @@ config ROSE <http://www.linux-ax25.org>. You also might want to check out the file <file:Documentation/networking/ax25.rst>. More information about digital amateur radio in general is on the WWW at - <http://www.tapr.org/>. + <https://www.tapr.org/>. To compile this driver as a module, choose M here: the module will be called rose. 
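The batman-adv changes that follow (bat_iv_ogm.c, bat_v_ogm.c, hard-interface.c, netlink.c) add a per-hard-interface hop_penalty, exposed via BATADV_ATTR_HOP_PENALTY, which is applied on top of the node-wide penalty. A worked sketch of the new B.A.T.M.A.N. V penalty step with hypothetical values; BATADV_TQ_MAX_VALUE is 255 in the kernel sources:

#include <stdio.h>

int main(void)
{
	unsigned int hop_penalty_max = 255; /* BATADV_TQ_MAX_VALUE */
	unsigned int if_hop_penalty = 15;   /* hypothetical per-interface setting */
	unsigned int throughput = 100;      /* 10 Mbps, in 100 kbit/s units */

	/* Per-hardif penalty, applied before the node hop penalty
	 * (see batadv_v_forward_penalty() in the bat_v_ogm.c hunk).
	 */
	throughput = throughput * (hop_penalty_max - if_hop_penalty) /
		     hop_penalty_max;

	/* 100 * (255 - 15) / 255 = 94, i.e. roughly 9.4 Mbps */
	printf("penalised throughput: %u\n", throughput);
	return 0;
}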
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index dec3f35467c9..269ee89d2c2b 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -528,7 +528,7 @@ ax25_cb *ax25_create_cb(void) */ static int ax25_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; ax25_cb *ax25; @@ -543,7 +543,7 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(unsigned int)) return -EINVAL; - if (get_user(opt, (unsigned int __user *)optval)) + if (copy_from_sockptr(&opt, optval, sizeof(unsigned int))) return -EFAULT; lock_sock(sk); @@ -640,7 +640,7 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, memset(devname, 0, sizeof(devname)); - if (copy_from_user(devname, optval, optlen)) { + if (copy_from_sockptr(devname, optval, optlen)) { res = -EFAULT; break; } diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index e87f19c82e8d..a4faf5f904d9 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -134,7 +134,7 @@ static u8 batadv_ring_buffer_avg(const u8 lq_recv[]) * * Return: the originator object corresponding to the passed mac address or NULL * on failure. - * If the object does not exists it is created an initialised. + * If the object does not exist, it is created and initialised. */ static struct batadv_orig_node * batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) @@ -871,7 +871,7 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) } /** - * batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over iterface + * batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over interface * @orig_node: originator which reproadcasted the OGMs directly * @if_outgoing: interface which transmitted the original OGM and received the * direct rebroadcast @@ -1075,10 +1075,10 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, struct batadv_neigh_ifinfo *neigh_ifinfo; u8 total_count; u8 orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own; + unsigned int tq_iface_hop_penalty = BATADV_TQ_MAX_VALUE; unsigned int neigh_rq_inv_cube, neigh_rq_max_cube; unsigned int tq_asym_penalty, inv_asym_penalty; unsigned int combined_tq; - unsigned int tq_iface_penalty; bool ret = false; /* find corresponding one hop neighbor */ @@ -1157,31 +1157,32 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, inv_asym_penalty = BATADV_TQ_MAX_VALUE * neigh_rq_inv_cube; inv_asym_penalty /= neigh_rq_max_cube; tq_asym_penalty = BATADV_TQ_MAX_VALUE - inv_asym_penalty; + tq_iface_hop_penalty -= atomic_read(&if_incoming->hop_penalty); /* penalize if the OGM is forwarded on the same interface. WiFi * interfaces and other half duplex devices suffer from throughput * drops as they can't send and receive at the same time. 
*/ - tq_iface_penalty = BATADV_TQ_MAX_VALUE; if (if_outgoing && if_incoming == if_outgoing && batadv_is_wifi_hardif(if_outgoing)) - tq_iface_penalty = batadv_hop_penalty(BATADV_TQ_MAX_VALUE, - bat_priv); + tq_iface_hop_penalty = batadv_hop_penalty(tq_iface_hop_penalty, + bat_priv); combined_tq = batadv_ogm_packet->tq * tq_own * tq_asym_penalty * - tq_iface_penalty; + tq_iface_hop_penalty; combined_tq /= BATADV_TQ_MAX_VALUE * BATADV_TQ_MAX_VALUE * BATADV_TQ_MAX_VALUE; batadv_ogm_packet->tq = combined_tq; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "bidirectional: orig = %pM neigh = %pM => own_bcast = %2i, real recv = %2i, local tq: %3i, asym_penalty: %3i, iface_penalty: %3i, total tq: %3i, if_incoming = %s, if_outgoing = %s\n", + "bidirectional: orig = %pM neigh = %pM => own_bcast = %2i, real recv = %2i, local tq: %3i, asym_penalty: %3i, iface_hop_penalty: %3i, total tq: %3i, if_incoming = %s, if_outgoing = %s\n", orig_node->orig, orig_neigh_node->orig, total_count, - neigh_rq_count, tq_own, tq_asym_penalty, tq_iface_penalty, - batadv_ogm_packet->tq, if_incoming->net_dev->name, + neigh_rq_count, tq_own, tq_asym_penalty, + tq_iface_hop_penalty, batadv_ogm_packet->tq, + if_incoming->net_dev->name, if_outgoing ? if_outgoing->net_dev->name : "DEFAULT"); /* if link has the minimum required transmission quality @@ -1554,7 +1555,7 @@ static void batadv_iv_ogm_process_reply(struct batadv_ogm_packet *ogm_packet, * batadv_iv_ogm_process() - process an incoming batman iv OGM * @skb: the skb containing the OGM * @ogm_offset: offset to the OGM which should be processed (for aggregates) - * @if_incoming: the interface where this packet was receved + * @if_incoming: the interface where this packet was received */ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, struct batadv_hard_iface *if_incoming) @@ -2288,7 +2289,7 @@ batadv_iv_ogm_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq, * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information - * @single_hardif: Limit dump to this hard interfaace + * @single_hardif: Limit dump to this hard interface */ static void batadv_iv_ogm_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 0bdefa35da98..d35aca0e969a 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -60,7 +60,7 @@ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface) * @neigh: the neighbour for which the throughput has to be obtained * * Return: The throughput towards the given neighbour in multiples of 100kpbs - * (a value of '1' equals to 0.1Mbps, '10' equals 1Mbps, etc). + * (a value of '1' equals 0.1Mbps, '10' equals 1Mbps, etc). */ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) { @@ -183,8 +183,8 @@ void batadv_v_elp_throughput_metric_update(struct work_struct *work) * * Sends a predefined number of unicast wifi packets to a given neighbour in * order to trigger the throughput estimation on this link by the RC algorithm. - * Packets are sent only if there there is not enough payload unicast traffic - * towards this neighbour.. + * Packets are sent only if there is not enough payload unicast traffic towards + * this neighbour.. * * Return: True on success and false in case of error during skb preparation. 
*/ @@ -244,7 +244,7 @@ batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh) * batadv_v_elp_periodic_work() - ELP periodic task per interface * @work: work queue item * - * Emits broadcast ELP message in regular intervals. + * Emits broadcast ELP messages in regular intervals. */ static void batadv_v_elp_periodic_work(struct work_struct *work) { @@ -499,7 +499,7 @@ orig_free: * @skb: the received packet * @if_incoming: the interface this packet was received through * - * Return: NET_RX_SUCCESS and consumes the skb if the packet was peoperly + * Return: NET_RX_SUCCESS and consumes the skb if the packet was properly * processed or NET_RX_DROP in case of failure. */ int batadv_v_elp_packet_recv(struct sk_buff *skb, diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 18028b9f95f0..0f8495b9eeb1 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -47,9 +47,9 @@ * @bat_priv: the bat priv with all the soft interface information * @addr: the address of the originator * - * Return: the orig_node corresponding to the specified address. If such object - * does not exist it is allocated here. In case of allocation failure returns - * NULL. + * Return: the orig_node corresponding to the specified address. If such an + * object does not exist, it is allocated here. In case of allocation failure + * returns NULL. */ struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) @@ -172,7 +172,7 @@ static bool batadv_v_ogm_queue_left(struct sk_buff *skb, * batadv_v_ogm_aggr_list_free - free all elements in an aggregation queue * @hard_iface: the interface holding the aggregation queue * - * Empties the OGMv2 aggregation queue and frees all the skbs it contained. + * Empties the OGMv2 aggregation queue and frees all the skbs it contains. * * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. */ @@ -378,7 +378,7 @@ static void batadv_v_ogm_send(struct work_struct *work) * batadv_v_ogm_aggr_work() - OGM queue periodic task per interface * @work: work queue item * - * Emits aggregated OGM message in regular intervals. + * Emits aggregated OGM messages in regular intervals. */ void batadv_v_ogm_aggr_work(struct work_struct *work) { @@ -399,7 +399,7 @@ void batadv_v_ogm_aggr_work(struct work_struct *work) * batadv_v_ogm_iface_enable() - prepare an interface for B.A.T.M.A.N. V * @hard_iface: the interface to prepare * - * Takes care of scheduling own OGM sending routine for this interface. + * Takes care of scheduling its own OGM sending routine for this interface. * * Return: 0 on success or a negative error code otherwise */ @@ -455,15 +455,17 @@ unlock: * @throughput: the current throughput * * Apply a penalty on the current throughput metric value based on the - * characteristic of the interface where the OGM has been received. The return - * value is computed as follows: + * characteristic of the interface where the OGM has been received. + * + * Initially the per hardif hop penalty is applied to the throughput. After + * that the return value is then computed as follows: * - throughput * 50% if the incoming and outgoing interface are the * same WiFi interface and the throughput is above * 1MBit/s * - throughput if the outgoing interface is the default * interface (i.e. this OGM is processed for the * internal table and not forwarded) - * - throughput * hop penalty otherwise + * - throughput * node hop penalty otherwise * * Return: the penalised throughput metric. 
*/ @@ -472,9 +474,14 @@ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, u32 throughput) { + int if_hop_penalty = atomic_read(&if_incoming->hop_penalty); int hop_penalty = atomic_read(&bat_priv->hop_penalty); int hop_penalty_max = BATADV_TQ_MAX_VALUE; + /* Apply per hardif hop penalty */ + throughput = throughput * (hop_penalty_max - if_hop_penalty) / + hop_penalty_max; + /* Don't apply hop penalty in default originator table. */ if (if_outgoing == BATADV_IF_DEFAULT) return throughput; @@ -847,7 +854,7 @@ batadv_v_ogm_aggr_packet(int buff_pos, int packet_len, * batadv_v_ogm_process() - process an incoming batman v OGM * @skb: the skb containing the OGM * @ogm_offset: offset to the OGM which should be processed (for aggregates) - * @if_incoming: the interface where this packet was receved + * @if_incoming: the interface where this packet was received */ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset, struct batadv_hard_iface *if_incoming) diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 41cc87f06b14..91a04ca373dc 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -992,7 +992,7 @@ static bool batadv_handle_claim(struct batadv_priv *bat_priv, * @hw_dst: the Hardware destination in the ARP Header * @ethhdr: pointer to the Ethernet header of the claim frame * - * checks if it is a claim packet and if its on the same group. + * checks if it is a claim packet and if it's on the same group. * This function also applies the group ID of the sender * if it is in the same mesh. * @@ -1757,7 +1757,7 @@ void batadv_bla_free(struct batadv_priv *bat_priv) * @vid: the VLAN ID of the frame * * Checks if this packet is a loop detect frame which has been sent by us, - * throw an uevent and log the event if that is the case. + * throws an uevent and logs the event if that is the case. * * Return: true if it is a loop detect frame which is to be dropped, false * otherwise. @@ -1815,7 +1815,7 @@ batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb, * * we have to race for a claim * * if the frame is allowed on the LAN * - * in these cases, the skb is further handled by this function + * In these cases, the skb is further handled by this function * * Return: true if handled, otherwise it returns false and the caller shall * further process the skb. diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index b85da4b7a77b..0e6e53e9b5f3 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -666,7 +666,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst, * @vid: VLAN identifier * @packet_subtype: unicast4addr packet subtype to use * - * This function copies the skb with pskb_copy() and is sent as unicast packet + * This function copies the skb with pskb_copy() and is sent as a unicast packet * to each of the selected candidates. * * Return: true if the packet is sent to at least one candidate, false diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 7cad97644d05..9fdbe3068153 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -102,8 +102,8 @@ static int batadv_frag_size_limit(void) * * Caller must hold chain->lock. * - * Return: true if chain is empty and caller can just insert the new fragment - * without searching for the right position. 
+ * Return: true if chain is empty and the caller can just insert the new + * fragment without searching for the right position. */ static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain, u16 seqno) @@ -306,7 +306,7 @@ free: * set *skb to merged packet; 2) Packet is buffered: Return true and set *skb * to NULL; 3) Error: Return false and free skb. * - * Return: true when packet is merged or buffered, false when skb is not not + * Return: true when the packet is merged or buffered, false when skb is not not * used. */ bool batadv_frag_skb_buffer(struct sk_buff **skb, diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 3a256af92784..fa06b51c0144 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -138,10 +138,10 @@ static bool batadv_mutual_parents(const struct net_device *dev1, * @net_dev: the device to check * * If the user creates any virtual device on top of a batman-adv interface, it - * is important to prevent this new interface to be used to create a new mesh - * network (this behaviour would lead to a batman-over-batman configuration). - * This function recursively checks all the fathers of the device passed as - * argument looking for a batman-adv soft interface. + * is important to prevent this new interface from being used to create a new + * mesh network (this behaviour would lead to a batman-over-batman + * configuration). This function recursively checks all the fathers of the + * device passed as argument looking for a batman-adv soft interface. * * Return: true if the device is descendant of a batman-adv mesh interface (or * if it is a batman-adv interface itself), false otherwise @@ -680,8 +680,8 @@ batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface) * @slave: the interface enslaved in another master * @master: the master from which slave has to be removed * - * Invoke ndo_del_slave on master passing slave as argument. In this way slave - * is free'd and master can correctly change its internal state. + * Invoke ndo_del_slave on master passing slave as argument. In this way the + * slave is free'd and the master can correctly change its internal state. * * Return: 0 on success, a negative value representing the error otherwise */ @@ -818,7 +818,7 @@ err: * @soft_iface: soft interface to check * * This function is only using RCU for locking - the result can therefore be - * off when another functions is modifying the list at the same time. The + * off when another function is modifying the list at the same time. The * caller can use the rtnl_lock to make sure that the count is accurate. * * Return: number of connected/enslaved hard interfaces @@ -939,6 +939,8 @@ batadv_hardif_add_interface(struct net_device *net_dev) if (batadv_is_wifi_hardif(hard_iface)) hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS; + atomic_set(&hard_iface->hop_penalty, 0); + batadv_v_hardif_init(hard_iface); batadv_check_known_mac_addr(hard_iface->net_dev); diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index f9884dc56cf3..979864c0fa6b 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -69,7 +69,7 @@ int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) 
__printf(2, 3); /** - * _batadv_dbg() - Store debug output with(out) ratelimiting + * _batadv_dbg() - Store debug output with(out) rate limiting * @type: type of debug message * @bat_priv: the bat priv with all the soft interface information * @ratelimited: whether output should be rate limited @@ -95,7 +95,7 @@ static inline void _batadv_dbg(int type __always_unused, #endif /** - * batadv_dbg() - Store debug output without ratelimiting + * batadv_dbg() - Store debug output without rate limiting * @type: type of debug message * @bat_priv: the bat priv with all the soft interface information * @arg: format string and variable arguments @@ -104,7 +104,7 @@ static inline void _batadv_dbg(int type __always_unused, _batadv_dbg(type, bat_priv, 0, ## arg) /** - * batadv_dbg_ratelimited() - Store debug output with ratelimiting + * batadv_dbg_ratelimited() - Store debug output with rate limiting * @type: type of debug message * @bat_priv: the bat priv with all the soft interface information * @arg: format string and variable arguments diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index d8a255c85e77..519c08c2cfba 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -666,7 +666,7 @@ unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len) * @vid: the VLAN identifier for which the AP isolation attributed as to be * looked up * - * Return: true if AP isolation is on for the VLAN idenfied by vid, false + * Return: true if AP isolation is on for the VLAN identified by vid, false * otherwise */ bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid) diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 61d8dbe8c954..0393bb9ed3d0 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2020.2" +#define BATADV_SOURCE_VERSION "2020.3" #endif /* B.A.T.M.A.N. parameters */ @@ -308,7 +308,7 @@ static inline bool batadv_has_timed_out(unsigned long timestamp, * @y: value to compare @x against * * It handles overflows/underflows and can correctly check for a predecessor - * unless the variable sequence number has grown by more then + * unless the variable sequence number has grown by more than * 2**(bitwidth(x)-1)-1. * * This means that for a u8 with the maximum value 255, it would think: @@ -330,11 +330,11 @@ static inline bool batadv_has_timed_out(unsigned long timestamp, /** * batadv_seq_after() - Checks if a sequence number x is a successor of y - * @x: potential sucessor of @y + * @x: potential successor of @y * @y: value to compare @x against * * It handles overflows/underflows and can correctly check for a successor - * unless the variable sequence number has grown by more then + * unless the variable sequence number has grown by more than * 2**(bitwidth(x)-1)-1. * * This means that for a u8 with the maximum value 255, it would think: diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 9ebdc1e864b9..bdc4a1fba1c6 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -510,7 +510,7 @@ batadv_mcast_mla_softif_get_ipv6(struct net_device *dev, * the given mcast_list. In general, multicast listeners provided by * your multicast receiving applications run directly on this node. * - * If there is a bridge interface on top of dev, collects from that one + * If there is a bridge interface on top of dev, collect from that one * instead. 
Just like with IP addresses and routes, multicast listeners * will(/should) register to the bridge interface instead of an * enslaved bat0. @@ -832,8 +832,8 @@ batadv_mcast_bridge_log(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @flags: TVLV flags indicating the new multicast state * - * Whenever the multicast TVLV flags this nodes announces change this notifies - * userspace via the 'mcast' log level. + * Whenever the multicast TVLV flags this node announces change, this function + * should be used to notify userspace about the change. */ static void batadv_mcast_flags_log(struct batadv_priv *bat_priv, u8 flags) { @@ -1244,7 +1244,7 @@ batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv) * @ethhdr: an ethernet header to determine the protocol family from * * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 or - * BATADV_MCAST_WANT_ALL_IPV6 flag, depending on the provided ethhdr, set and + * BATADV_MCAST_WANT_ALL_IPV6 flag, depending on the provided ethhdr, sets and * increases its refcount. */ static struct batadv_orig_node * @@ -1693,7 +1693,7 @@ batadv_mcast_forw_want_rtr(struct batadv_priv *bat_priv, } /** - * batadv_mcast_forw_send() - send packet to any detected multicast recpient + * batadv_mcast_forw_send() - send packet to any detected multicast recipient * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier @@ -1742,7 +1742,8 @@ int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb, * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag of this originator, - * orig, has toggled then this method updates counter and list accordingly. + * orig, has toggled then this method updates the counter and the list + * accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ @@ -1787,7 +1788,7 @@ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv, * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_IPV4 flag of this originator, orig, has - * toggled then this method updates counter and list accordingly. + * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ @@ -1832,7 +1833,7 @@ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv, * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_IPV6 flag of this originator, orig, has - * toggled then this method updates counter and list accordingly. + * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ @@ -1877,7 +1878,7 @@ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_NO_RTR4 flag of this originator, orig, has - * toggled then this method updates counter and list accordingly. + * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ @@ -1922,7 +1923,7 @@ static void batadv_mcast_want_rtr4_update(struct batadv_priv *bat_priv, * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_NO_RTR6 flag of this originator, orig, has - * toggled then this method updates counter and list accordingly. 
+ * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 02ed073f95a9..dc193618a761 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -640,7 +640,7 @@ batadv_netlink_tp_meter_put(struct sk_buff *msg, u32 cookie) * @bat_priv: the bat priv with all the soft interface information * @dst: destination of tp_meter session * @result: reason for tp meter session stop - * @test_time: total time ot the tp_meter session + * @test_time: total time of the tp_meter session * @total_bytes: bytes acked to the receiver * @cookie: cookie of tp_meter session * @@ -826,6 +826,10 @@ static int batadv_netlink_hardif_fill(struct sk_buff *msg, goto nla_put_failure; } + if (nla_put_u8(msg, BATADV_ATTR_HOP_PENALTY, + atomic_read(&hard_iface->hop_penalty))) + goto nla_put_failure; + #ifdef CONFIG_BATMAN_ADV_BATMAN_V if (nla_put_u32(msg, BATADV_ATTR_ELP_INTERVAL, atomic_read(&hard_iface->bat_v.elp_interval))) @@ -920,9 +924,15 @@ static int batadv_netlink_set_hardif(struct sk_buff *skb, { struct batadv_hard_iface *hard_iface = info->user_ptr[1]; struct batadv_priv *bat_priv = info->user_ptr[0]; + struct nlattr *attr; + + if (info->attrs[BATADV_ATTR_HOP_PENALTY]) { + attr = info->attrs[BATADV_ATTR_HOP_PENALTY]; + + atomic_set(&hard_iface->hop_penalty, nla_get_u8(attr)); + } #ifdef CONFIG_BATMAN_ADV_BATMAN_V - struct nlattr *attr; if (info->attrs[BATADV_ATTR_ELP_INTERVAL]) { attr = info->attrs[BATADV_ATTR_ELP_INTERVAL]; diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index b0469d15da0e..48d707850f3e 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -134,7 +134,7 @@ static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, } /** - * batadv_nc_mesh_init() - initialise coding hash table and start house keeping + * batadv_nc_mesh_init() - initialise coding hash table and start housekeeping * @bat_priv: the bat priv with all the soft interface information * * Return: 0 on success or negative error number in case of failure @@ -700,7 +700,7 @@ batadv_nc_process_nc_paths(struct batadv_priv *bat_priv, } /** - * batadv_nc_worker() - periodic task for house keeping related to network + * batadv_nc_worker() - periodic task for housekeeping related to network * coding * @work: kernel work struct */ @@ -1316,7 +1316,7 @@ batadv_nc_path_search(struct batadv_priv *bat_priv, } /** - * batadv_nc_skb_src_search() - Loops through the list of neighoring nodes of + * batadv_nc_skb_src_search() - Loops through the list of neighboring nodes of * the skb's sender (may be equal to the originator). * @bat_priv: the bat priv with all the soft interface information * @skb: data skb to forward @@ -1402,10 +1402,10 @@ static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv, * @neigh_node: next hop to forward packet to * @ethhdr: pointer to the ethernet header inside the skb * - * Loops through list of neighboring nodes the next hop has a good connection to - * (receives OGMs with a sufficient quality). We need to find a neighbor of our - * next hop that potentially sent a packet which our next hop also received - * (overheard) and has stored for later decoding. + * Loops through the list of neighboring nodes the next hop has a good + * connection to (receives OGMs with a sufficient quality). 
We need to find a + * neighbor of our next hop that potentially sent a packet which our next hop + * also received (overheard) and has stored for later decoding. * * Return: true if the skb was consumed (encoded packet sent) or false otherwise */ diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 5b0c2fffc214..805d8969bdfb 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -325,7 +325,7 @@ void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node) * @if_outgoing: the interface where the payload packet has been received or * the OGM should be sent to * - * Return: the neighbor which should be router for this orig_node/iface. + * Return: the neighbor which should be the router for this orig_node/iface. * * The object is returned with refcounter increased by 1. */ @@ -515,7 +515,7 @@ out: * Looks for and possibly returns a neighbour belonging to this originator list * which is connected through the provided hard interface. * - * Return: neighbor when found. Othwerwise NULL + * Return: neighbor when found. Otherwise NULL */ static struct batadv_neigh_node * batadv_neigh_node_get(const struct batadv_orig_node *orig_node, @@ -620,7 +620,7 @@ batadv_hardif_neigh_get_or_create(struct batadv_hard_iface *hard_iface, * * Looks for and possibly returns a neighbour belonging to this hard interface. * - * Return: neighbor when found. Othwerwise NULL + * Return: neighbor when found. Otherwise NULL */ struct batadv_hardif_neigh_node * batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, @@ -999,7 +999,7 @@ void batadv_originator_free(struct batadv_priv *bat_priv) * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the originator * - * Creates a new originator object and initialise all the generic fields. + * Creates a new originator object and initialises all the generic fields. * The new object is not added to the originator list. * * Return: the newly created object or NULL on failure. diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index d343382e9664..27cdf5e4349a 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -449,7 +449,7 @@ free_skb: * @skb: packet to check * @hdr_size: size of header to pull * - * Check for short header and bad addresses in given packet. + * Checks for short header and bad addresses in the given packet. * * Return: negative value when check fails and 0 otherwise. The negative value * depends on the reason: -ENODATA for bad header, -EBADR for broadcast @@ -1113,7 +1113,7 @@ free_skb: * @recv_if: interface that the skb is received on * * This function does one of the three following things: 1) Forward fragment, if - * the assembled packet will exceed our MTU; 2) Buffer fragment, if we till + * the assembled packet will exceed our MTU; 2) Buffer fragment, if we still * lack further fragments; 3) Merge fragments, if we have all needed parts. * * Return: NET_RX_DROP if the skb is not consumed, NET_RX_SUCCESS otherwise. diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 7f8ade04e08e..d267b94800d6 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -605,8 +605,8 @@ bool batadv_forw_packet_steal(struct batadv_forw_packet *forw_packet, * given hard_iface. If hard_iface is NULL forwarding packets on all hard * interfaces will be claimed. * - * The packets are being moved from the forw_list to the cleanup_list and - * by that allows already running threads to notice the claiming. 
+ * The packets are being moved from the forw_list to the cleanup_list. This + * makes it possible for already running threads to notice the claim. */ static void batadv_forw_packet_list_steal(struct hlist_head *forw_list, diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index f1f1c86f3419..23833a0ba5e6 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -406,7 +406,7 @@ end: * @hdr_size: size of already parsed batman-adv header * @orig_node: originator from which the batman-adv packet was sent * - * Sends a ethernet frame to the receive path of the local @soft_iface. + * Sends an ethernet frame to the receive path of the local @soft_iface. * skb->data has still point to the batman-adv header with the size @hdr_size. * The caller has to have parsed this header already and made sure that at least * @hdr_size bytes are still available for pull in @skb. diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index bd2ac570c42c..db7e3774825b 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -66,7 +66,7 @@ /** * BATADV_TP_MAX_RTO - Maximum sender timeout. If the sender RTO gets beyond - * such amound of milliseconds, the receiver is considered unreachable and the + * such amount of milliseconds, the receiver is considered unreachable and the * connection is killed */ #define BATADV_TP_MAX_RTO 30000 @@ -108,10 +108,10 @@ static u32 batadv_tp_session_cookie(const u8 session[2], u8 icmp_uid) * batadv_tp_cwnd() - compute the new cwnd size * @base: base cwnd size value * @increment: the value to add to base to get the new size - * @min: minumim cwnd value (usually MSS) + * @min: minimum cwnd value (usually MSS) * - * Return the new cwnd size and ensures it does not exceed the Advertised - * Receiver Window size. It is wrap around safe. + * Return the new cwnd size and ensure it does not exceed the Advertised + * Receiver Window size. It is wrapped around safely. * For details refer to Section 3.1 of RFC5681 * * Return: new congestion window size in bytes @@ -254,7 +254,7 @@ static void batadv_tp_batctl_error_notify(enum batadv_tp_meter_reason reason, * @dst: the other endpoint MAC address to look for * * Look for a tp_vars object matching dst as end_point and return it after - * having incremented the refcounter. Return NULL is not found + * having increment the refcounter. Return NULL is not found * * Return: matching tp_vars or NULL when no tp_vars with @dst was found */ @@ -291,7 +291,7 @@ static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv, * @session: session identifier * * Look for a tp_vars object matching dst as end_point, session as tp meter - * session and return it after having incremented the refcounter. Return NULL + * session and return it after having increment the refcounter. Return NULL * is not found * * Return: matching tp_vars or NULL when no tp_vars was found diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index a9635c882fe0..98a0aaaf0d50 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -301,7 +301,7 @@ void batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry) * @vid: VLAN identifier * * Return: the number of originators advertising the given address/data - * (excluding ourself). + * (excluding our self). */ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) @@ -842,7 +842,7 @@ out: * table. 
In case of success the value is updated with the real amount of * reserved bytes * Allocate the needed amount of memory for the entire TT TVLV and write its - * header made up by one tvlv_tt_data object and a series of tvlv_tt_vlan_data + * header made up of one tvlv_tt_data object and a series of tvlv_tt_vlan_data * objects, one per active VLAN served by the originator node. * * Return: the size of the allocated buffer or 0 in case of failure. @@ -1674,7 +1674,7 @@ out: * the function argument. * If a TT local entry exists for this non-mesh client remove it. * - * The caller must hold orig_node refcount. + * The caller must hold the orig_node refcount. * * Return: true if the new entry has been added, false otherwise */ @@ -1839,7 +1839,7 @@ out: * @bat_priv: the bat priv with all the soft interface information * @tt_global_entry: global translation table entry to be analyzed * - * This functon assumes the caller holds rcu_read_lock(). + * This function assumes the caller holds rcu_read_lock(). * Return: best originator list entry or NULL on errors. */ static struct batadv_tt_orig_list_entry * @@ -1887,7 +1887,7 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv, * @tt_global_entry: global translation table entry to be printed * @seq: debugfs table seq_file struct * - * This functon assumes the caller holds rcu_read_lock(). + * This function assumes the caller holds rcu_read_lock(). */ static void batadv_tt_global_print_entry(struct batadv_priv *bat_priv, diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 0963a43ad996..6a23a566cde1 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -353,8 +353,8 @@ end: * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length * - * Return: success if handler was not found or the return value of the handler - * callback. + * Return: success if the handler was not found or the return value of the + * handler callback. */ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, struct batadv_tvlv_handler *tvlv_handler, diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index d152b8e81f61..ed519efa3c36 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -208,6 +208,12 @@ struct batadv_hard_iface { /** @rcu: struct used for freeing in an RCU-safe manner */ struct rcu_head rcu; + /** + * @hop_penalty: penalty which will be applied to the tq-field + * of an OGM received via this interface + */ + atomic_t hop_penalty; + /** @bat_iv: per hard-interface B.A.T.M.A.N. IV data */ struct batadv_hard_iface_bat_iv bat_iv; @@ -455,8 +461,8 @@ struct batadv_orig_node { spinlock_t tt_buff_lock; /** - * @tt_lock: prevents from updating the table while reading it. Table - * update is made up by two operations (data structure update and + * @tt_lock: avoids concurrent read from and write to the table. Table + * update is made up of two operations (data structure update and * metadata -CRC/TTVN-recalculation) and they have to be executed * atomically in order to avoid another thread to read the * table/metadata between those. 
@@ -748,7 +754,7 @@ struct batadv_neigh_ifinfo { * struct batadv_bcast_duplist_entry - structure for LAN broadcast suppression */ struct batadv_bcast_duplist_entry { - /** @orig: mac address of orig node orginating the broadcast */ + /** @orig: mac address of orig node originating the broadcast */ u8 orig[ETH_ALEN]; /** @crc: crc32 checksum of broadcast payload */ @@ -1010,7 +1016,7 @@ struct batadv_priv_tt { /** * @commit_lock: prevents from executing a local TT commit while reading - * the local table. The local TT commit is made up by two operations + * the local table. The local TT commit is made up of two operations * (data structure update and metadata -CRC/TTVN- recalculation) and * they have to be executed atomically in order to avoid another thread * to read the table/metadata between those. @@ -1024,7 +1030,7 @@ struct batadv_priv_tt { #ifdef CONFIG_BATMAN_ADV_BLA /** - * struct batadv_priv_bla - per mesh interface bridge loope avoidance data + * struct batadv_priv_bla - per mesh interface bridge loop avoidance data */ struct batadv_priv_bla { /** @num_requests: number of bla requests in flight */ @@ -1718,7 +1724,7 @@ struct batadv_priv { spinlock_t softif_vlan_list_lock; #ifdef CONFIG_BATMAN_ADV_BLA - /** @bla: bridge loope avoidance data */ + /** @bla: bridge loop avoidance data */ struct batadv_priv_bla bla; #endif diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index bb55d92691b0..cff4944d5b66 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -50,6 +50,7 @@ static bool enable_6lowpan; /* We are listening incoming connections via this channel */ static struct l2cap_chan *listen_chan; +static DEFINE_MUTEX(set_lock); struct lowpan_peer { struct list_head list; @@ -1078,12 +1079,14 @@ static void do_enable_set(struct work_struct *work) enable_6lowpan = set_enable->flag; + mutex_lock(&set_lock); if (listen_chan) { l2cap_chan_close(listen_chan, 0); l2cap_chan_put(listen_chan); } listen_chan = bt_6lowpan_listen(); + mutex_unlock(&set_lock); kfree(set_enable); } @@ -1135,11 +1138,13 @@ static ssize_t lowpan_control_write(struct file *fp, if (ret == -EINVAL) return ret; + mutex_lock(&set_lock); if (listen_chan) { l2cap_chan_close(listen_chan, 0); l2cap_chan_put(listen_chan); listen_chan = NULL; } + mutex_unlock(&set_lock); if (conn) { struct lowpan_peer *peer; diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index 1d6d243cdde9..e2497d764e97 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -21,7 +21,7 @@ menuconfig BT It was designed as a replacement for cables and other short-range technologies like IrDA. Bluetooth operates in personal area range that typically extends up to 10 meters. More information about - Bluetooth can be found at <http://www.bluetooth.com/>. + Bluetooth can be found at <https://www.bluetooth.com/>. 
Linux Bluetooth subsystem consist of several layers: Bluetooth Core diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile index 41dd541a44a5..1c645fba8c49 100644 --- a/net/bluetooth/Makefile +++ b/net/bluetooth/Makefile @@ -14,7 +14,7 @@ bluetooth_6lowpan-y := 6lowpan.o bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \ hci_sock.o hci_sysfs.o l2cap_core.o l2cap_sock.o smp.o lib.o \ - ecdh_helper.o hci_request.o mgmt_util.o + ecdh_helper.o hci_request.o mgmt_util.o mgmt_config.o bluetooth-$(CONFIG_BT_BREDR) += sco.o bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 3fd124927d4d..4ef6a54403aa 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -286,6 +286,9 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (msg->msg_name && bt_sk(sk)->skb_msg_name) bt_sk(sk)->skb_msg_name(skb, msg->msg_name, &msg->msg_namelen); + + if (bt_sk(sk)->skb_put_cmsg) + bt_sk(sk)->skb_put_cmsg(skb, msg, sk); } skb_free_datagram(sk, skb); @@ -453,8 +456,6 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk; __poll_t mask = 0; - BT_DBG("sock %p, sk %p", sock, sk); - poll_wait(file, sk_sleep(sk), wait); if (sk->sk_state == BT_LISTEN) diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c index cfd83c5521ae..d515571b2afb 100644 --- a/net/bluetooth/bnep/sock.c +++ b/net/bluetooth/bnep/sock.c @@ -182,8 +182,6 @@ static const struct proto_ops bnep_sock_ops = { .recvmsg = sock_no_recvmsg, .listen = sock_no_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c index defdd4871919..96d49d9fae96 100644 --- a/net/bluetooth/cmtp/sock.c +++ b/net/bluetooth/cmtp/sock.c @@ -185,8 +185,6 @@ static const struct proto_ops cmtp_sock_ops = { .recvmsg = sock_no_recvmsg, .listen = sock_no_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 307800fd18e6..9832f8445d43 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -789,11 +789,8 @@ static void set_ext_conn_params(struct hci_conn *conn, memset(p, 0, sizeof(*p)); - /* Set window to be the same value as the interval to - * enable continuous scanning. - */ - p->scan_interval = cpu_to_le16(hdev->le_scan_interval); - p->scan_window = p->scan_interval; + p->scan_interval = cpu_to_le16(hdev->le_scan_int_connect); + p->scan_window = cpu_to_le16(hdev->le_scan_window_connect); p->conn_interval_min = cpu_to_le16(conn->le_conn_min_interval); p->conn_interval_max = cpu_to_le16(conn->le_conn_max_interval); p->conn_latency = cpu_to_le16(conn->le_conn_latency); @@ -875,11 +872,8 @@ static void hci_req_add_le_create_conn(struct hci_request *req, memset(&cp, 0, sizeof(cp)); - /* Set window to be the same value as the interval to enable - * continuous scanning. 
- */ - cp.scan_interval = cpu_to_le16(hdev->le_scan_interval); - cp.scan_window = cp.scan_interval; + cp.scan_interval = cpu_to_le16(hdev->le_scan_int_connect); + cp.scan_window = cpu_to_le16(hdev->le_scan_window_connect); bacpy(&cp.peer_addr, &conn->dst); cp.peer_addr_type = conn->dst_type; @@ -937,7 +931,7 @@ static void hci_req_directed_advertising(struct hci_request *req, * So it is required to remove adv set for handle 0x00. since we use * instance 0 for directed adv. */ - hci_req_add(req, HCI_OP_LE_REMOVE_ADV_SET, sizeof(cp.handle), &cp.handle); + __hci_req_remove_ext_adv_instance(req, cp.handle); hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp); @@ -1009,6 +1003,11 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, struct hci_request req; int err; + /* This ensures that during disable le_scan address resolution + * will not be disabled if it is followed by le_create_conn + */ + bool rpa_le_conn = true; + /* Let's make sure that le is enabled.*/ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { if (lmp_le_capable(hdev)) @@ -1109,7 +1108,7 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, * state. */ if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { - hci_req_add_le_scan_disable(&req); + hci_req_add_le_scan_disable(&req, rpa_le_conn); hci_dev_set_flag(hdev, HCI_LE_SCAN_INTERRUPTED); } @@ -1180,7 +1179,8 @@ static int hci_explicit_conn_params_set(struct hci_dev *hdev, /* This function requires the caller holds hdev->lock */ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, u8 dst_type, u8 sec_level, - u16 conn_timeout) + u16 conn_timeout, + enum conn_reasons conn_reason) { struct hci_conn *conn; @@ -1225,6 +1225,7 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, conn->sec_level = BT_SECURITY_LOW; conn->pending_sec_level = sec_level; conn->conn_timeout = conn_timeout; + conn->conn_reason = conn_reason; hci_update_background_scan(hdev); @@ -1234,7 +1235,8 @@ done: } struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, - u8 sec_level, u8 auth_type) + u8 sec_level, u8 auth_type, + enum conn_reasons conn_reason) { struct hci_conn *acl; @@ -1254,6 +1256,7 @@ struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, hci_conn_hold(acl); + acl->conn_reason = conn_reason; if (acl->state == BT_OPEN || acl->state == BT_CLOSED) { acl->sec_level = BT_SECURITY_LOW; acl->pending_sec_level = sec_level; @@ -1270,7 +1273,8 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, struct hci_conn *acl; struct hci_conn *sco; - acl = hci_connect_acl(hdev, dst, BT_SECURITY_LOW, HCI_AT_NO_BONDING); + acl = hci_connect_acl(hdev, dst, BT_SECURITY_LOW, HCI_AT_NO_BONDING, + CONN_REASON_SCO_CONNECT); if (IS_ERR(acl)) return acl; @@ -1323,6 +1327,23 @@ int hci_conn_check_link_mode(struct hci_conn *conn) return 0; } + /* AES encryption is required for Level 4: + * + * BLUETOOTH CORE SPECIFICATION Version 5.2 | Vol 3, Part C + * page 1319: + * + * 128-bit equivalent strength for link and encryption keys + * required using FIPS approved algorithms (E0 not allowed, + * SAFER+ not allowed, and P-192 not allowed; encryption key + * not shortened) + */ + if (conn->sec_level == BT_SECURITY_FIPS && + !test_bit(HCI_CONN_AES_CCM, &conn->flags)) { + bt_dev_err(conn->hdev, + "Invalid security: Missing AES-CCM usage"); + return 0; + } + if (hci_conn_ssp_enabled(conn) && !test_bit(HCI_CONN_ENCRYPT, &conn->flags)) return 0; diff --git a/net/bluetooth/hci_core.c 
b/net/bluetooth/hci_core.c index dbe2d79f233f..68bfe57b6625 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -26,7 +26,6 @@ /* Bluetooth HCI core. */ #include <linux/export.h> -#include <linux/idr.h> #include <linux/rfkill.h> #include <linux/debugfs.h> #include <linux/crypto.h> @@ -606,7 +605,8 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) if (hdev->commands[8] & 0x01) hci_req_add(req, HCI_OP_READ_PAGE_SCAN_ACTIVITY, 0, NULL); - if (hdev->commands[18] & 0x04) + if (hdev->commands[18] & 0x04 && + !test_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks)) hci_req_add(req, HCI_OP_READ_DEF_ERR_DATA_REPORTING, 0, NULL); /* Some older Broadcom based Bluetooth 1.2 controllers do not @@ -763,6 +763,14 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) hci_req_add(req, HCI_OP_LE_CLEAR_RESOLV_LIST, 0, NULL); } + if (hdev->commands[35] & 0x40) { + __le16 rpa_timeout = cpu_to_le16(hdev->rpa_timeout); + + /* Set RPA timeout */ + hci_req_add(req, HCI_OP_LE_SET_RPA_TIMEOUT, 2, + &rpa_timeout); + } + if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) { /* Read LE Maximum Data Length */ hci_req_add(req, HCI_OP_LE_READ_MAX_DATA_LEN, 0, NULL); @@ -851,7 +859,8 @@ static int hci_init4_req(struct hci_request *req, unsigned long opt) /* Set erroneous data reporting if supported to the wideband speech * setting value */ - if (hdev->commands[18] & 0x08) { + if (hdev->commands[18] & 0x08 && + !test_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks)) { bool enabled = hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED); @@ -2982,7 +2991,7 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, adv_instance->remaining_time = timeout; if (duration == 0) - adv_instance->duration = HCI_DEFAULT_ADV_DURATION; + adv_instance->duration = hdev->def_multi_adv_rotation_duration; else adv_instance->duration = duration; @@ -2996,6 +3005,94 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, return 0; } +/* This function requires the caller holds hdev->lock */ +void hci_adv_monitors_clear(struct hci_dev *hdev) +{ + struct adv_monitor *monitor; + int handle; + + idr_for_each_entry(&hdev->adv_monitors_idr, monitor, handle) + hci_free_adv_monitor(monitor); + + idr_destroy(&hdev->adv_monitors_idr); +} + +void hci_free_adv_monitor(struct adv_monitor *monitor) +{ + struct adv_pattern *pattern; + struct adv_pattern *tmp; + + if (!monitor) + return; + + list_for_each_entry_safe(pattern, tmp, &monitor->patterns, list) + kfree(pattern); + + kfree(monitor); +} + +/* This function requires the caller holds hdev->lock */ +int hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) +{ + int min, max, handle; + + if (!monitor) + return -EINVAL; + + min = HCI_MIN_ADV_MONITOR_HANDLE; + max = HCI_MIN_ADV_MONITOR_HANDLE + HCI_MAX_ADV_MONITOR_NUM_HANDLES; + handle = idr_alloc(&hdev->adv_monitors_idr, monitor, min, max, + GFP_KERNEL); + if (handle < 0) + return handle; + + hdev->adv_monitors_cnt++; + monitor->handle = handle; + + hci_update_background_scan(hdev); + + return 0; +} + +static int free_adv_monitor(int id, void *ptr, void *data) +{ + struct hci_dev *hdev = data; + struct adv_monitor *monitor = ptr; + + idr_remove(&hdev->adv_monitors_idr, monitor->handle); + hci_free_adv_monitor(monitor); + + return 0; +} + +/* This function requires the caller holds hdev->lock */ +int hci_remove_adv_monitor(struct hci_dev *hdev, u16 handle) +{ + struct adv_monitor *monitor; + + if (handle) { + monitor = 
idr_find(&hdev->adv_monitors_idr, handle); + if (!monitor) + return -ENOENT; + + idr_remove(&hdev->adv_monitors_idr, monitor->handle); + hci_free_adv_monitor(monitor); + } else { + /* Remove all monitors if handle is 0. */ + idr_for_each(&hdev->adv_monitors_idr, &free_adv_monitor, hdev); + } + + hci_update_background_scan(hdev); + + return 0; +} + +/* This function requires the caller holds hdev->lock */ +bool hci_is_adv_monitoring(struct hci_dev *hdev) +{ + return !idr_is_empty(&hdev->adv_monitors_idr); +} + struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *bdaddr_list, bdaddr_t *bdaddr, u8 type) { @@ -3023,6 +3120,20 @@ struct bdaddr_list_with_irk *hci_bdaddr_list_lookup_with_irk( return NULL; } +struct bdaddr_list_with_flags * +hci_bdaddr_list_lookup_with_flags(struct list_head *bdaddr_list, + bdaddr_t *bdaddr, u8 type) +{ + struct bdaddr_list_with_flags *b; + + list_for_each_entry(b, bdaddr_list, list) { + if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type) + return b; + } + + return NULL; +} + void hci_bdaddr_list_clear(struct list_head *bdaddr_list) { struct bdaddr_list *b, *n; @@ -3084,6 +3195,30 @@ int hci_bdaddr_list_add_with_irk(struct list_head *list, bdaddr_t *bdaddr, return 0; } +int hci_bdaddr_list_add_with_flags(struct list_head *list, bdaddr_t *bdaddr, + u8 type, u32 flags) +{ + struct bdaddr_list_with_flags *entry; + + if (!bacmp(bdaddr, BDADDR_ANY)) + return -EBADF; + + if (hci_bdaddr_list_lookup(list, bdaddr, type)) + return -EEXIST; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + bacpy(&entry->bdaddr, bdaddr); + entry->bdaddr_type = type; + entry->current_flags = flags; + + list_add(&entry->list, list); + + return 0; +} + int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list *entry; @@ -3123,6 +3258,26 @@ int hci_bdaddr_list_del_with_irk(struct list_head *list, bdaddr_t *bdaddr, return 0; } +int hci_bdaddr_list_del_with_flags(struct list_head *list, bdaddr_t *bdaddr, + u8 type) +{ + struct bdaddr_list_with_flags *entry; + + if (!bacmp(bdaddr, BDADDR_ANY)) { + hci_bdaddr_list_clear(list); + return 0; + } + + entry = hci_bdaddr_list_lookup_with_flags(list, bdaddr, type); + if (!entry) + return -ENOENT; + + list_del(&entry->list); + kfree(entry); + + return 0; +} + /* This function requires the caller holds hdev->lock */ struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) @@ -3145,6 +3300,15 @@ struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, { struct hci_conn_params *param; + switch (addr_type) { + case ADDR_LE_DEV_PUBLIC_RESOLVED: + addr_type = ADDR_LE_DEV_PUBLIC; + break; + case ADDR_LE_DEV_RANDOM_RESOLVED: + addr_type = ADDR_LE_DEV_RANDOM; + break; + } + list_for_each_entry(param, list, action) { if (bacmp(&param->addr, addr) == 0 && param->addr_type == addr_type) @@ -3289,10 +3453,10 @@ static int hci_suspend_wait_event(struct hci_dev *hdev) WAKE_COND, SUSPEND_NOTIFIER_TIMEOUT); if (ret == 0) { - bt_dev_dbg(hdev, "Timed out waiting for suspend"); + bt_dev_err(hdev, "Timed out waiting for suspend events"); for (i = 0; i < __SUSPEND_NUM_TASKS; ++i) { if (test_bit(i, hdev->suspend_tasks)) - bt_dev_dbg(hdev, "Bit %d is set", i); + bt_dev_err(hdev, "Suspend timeout bit: %d", i); clear_bit(i, hdev->suspend_tasks); } @@ -3360,12 +3524,15 @@ static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action, ret = hci_change_suspend_state(hdev, BT_RUNNING); } - /* If suspend failed, restore it to 
running */ - if (ret && action == PM_SUSPEND_PREPARE) - hci_change_suspend_state(hdev, BT_RUNNING); - done: - return ret ? notifier_from_errno(-EBUSY) : NOTIFY_STOP; + /* We always allow suspend even if suspend preparation failed and + * attempt to recover in resume. + */ + if (ret) + bt_dev_err(hdev, "Suspend notifier action (%lu) failed: %d", + action, ret); + + return NOTIFY_DONE; } /* Alloc HCI device */ @@ -3397,6 +3564,12 @@ struct hci_dev *hci_alloc_dev(void) hdev->le_adv_max_interval = 0x0800; hdev->le_scan_interval = 0x0060; hdev->le_scan_window = 0x0030; + hdev->le_scan_int_suspend = 0x0400; + hdev->le_scan_window_suspend = 0x0012; + hdev->le_scan_int_discovery = DISCOV_LE_SCAN_INT; + hdev->le_scan_window_discovery = DISCOV_LE_SCAN_WIN; + hdev->le_scan_int_connect = 0x0060; + hdev->le_scan_window_connect = 0x0060; hdev->le_conn_min_interval = 0x0018; hdev->le_conn_max_interval = 0x0028; hdev->le_conn_latency = 0x0000; @@ -3412,6 +3585,8 @@ struct hci_dev *hci_alloc_dev(void) hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M; hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M; hdev->le_num_of_adv_sets = HCI_MAX_ADV_INSTANCES; + hdev->def_multi_adv_rotation_duration = HCI_DEFAULT_ADV_DURATION; + hdev->def_le_autoconnect_timeout = HCI_LE_AUTOCONN_TIMEOUT; hdev->rpa_timeout = HCI_DEFAULT_RPA_TIMEOUT; hdev->discov_interleaved_timeout = DISCOV_INTERLEAVED_TIMEOUT; @@ -3420,13 +3595,17 @@ struct hci_dev *hci_alloc_dev(void) hdev->auth_payload_timeout = DEFAULT_AUTH_PAYLOAD_TIMEOUT; hdev->min_enc_key_size = HCI_MIN_ENC_KEY_SIZE; + /* default 1.28 sec page scan */ + hdev->def_page_scan_type = PAGE_SCAN_TYPE_STANDARD; + hdev->def_page_scan_int = 0x0800; + hdev->def_page_scan_window = 0x0012; + mutex_init(&hdev->lock); mutex_init(&hdev->req_lock); INIT_LIST_HEAD(&hdev->mgmt_pending); INIT_LIST_HEAD(&hdev->blacklist); INIT_LIST_HEAD(&hdev->whitelist); - INIT_LIST_HEAD(&hdev->wakeable); INIT_LIST_HEAD(&hdev->uuids); INIT_LIST_HEAD(&hdev->link_keys); INIT_LIST_HEAD(&hdev->long_term_keys); @@ -3574,6 +3753,8 @@ int hci_register_dev(struct hci_dev *hdev) queue_work(hdev->req_workqueue, &hdev->power_on); + idr_init(&hdev->adv_monitors_idr); + return id; err_wqueue: @@ -3603,9 +3784,10 @@ void hci_unregister_dev(struct hci_dev *hdev) cancel_work_sync(&hdev->power_on); - hci_dev_do_close(hdev); - unregister_pm_notifier(&hdev->suspend_notifier); + cancel_work_sync(&hdev->suspend_prepare); + + hci_dev_do_close(hdev); if (!test_bit(HCI_INIT, &hdev->flags) && !hci_dev_test_flag(hdev, HCI_SETUP) && @@ -3644,6 +3826,7 @@ void hci_unregister_dev(struct hci_dev *hdev) hci_smp_irks_clear(hdev); hci_remote_oob_data_clear(hdev); hci_adv_instances_clear(hdev); + hci_adv_monitors_clear(hdev); hci_bdaddr_list_clear(&hdev->le_white_list); hci_bdaddr_list_clear(&hdev->le_resolv_list); hci_conn_params_clear_all(hdev); @@ -4551,6 +4734,7 @@ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb) if (conn) { /* Send to upper protocol */ + bt_cb(skb)->sco.pkt_status = flags & 0x03; sco_recv_scodata(conn, skb); return; } else { diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index af9d7f2ff8ba..4b7fc430793c 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2299,6 +2299,22 @@ static void cs_le_create_conn(struct hci_dev *hdev, bdaddr_t *peer_addr, if (!conn) return; + /* When using controller based address resolution, then the new + * address types 0x02 and 0x03 are used. 
These types need to be + * converted back into either public address or random address type + */ + if (use_ll_privacy(hdev) && + hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION)) { + switch (own_address_type) { + case ADDR_LE_DEV_PUBLIC_RESOLVED: + own_address_type = ADDR_LE_DEV_PUBLIC; + break; + case ADDR_LE_DEV_RANDOM_RESOLVED: + own_address_type = ADDR_LE_DEV_RANDOM; + break; + } + } + /* Store the initiator and responder address information which * is needed for SMP. These values will not change during the * lifetime of the connection. @@ -2520,7 +2536,7 @@ static void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb) BT_DBG("%s num_rsp %d", hdev->name, num_rsp); - if (!num_rsp) + if (!num_rsp || skb->len < num_rsp * sizeof(*info) + 1) return; if (hci_dev_test_flag(hdev, HCI_PERIODIC_INQ)) @@ -2700,10 +2716,10 @@ static void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb) */ if (hci_dev_test_flag(hdev, HCI_MGMT) && !hci_dev_test_flag(hdev, HCI_CONNECTABLE) && - !hci_bdaddr_list_lookup(&hdev->whitelist, &ev->bdaddr, - BDADDR_BREDR)) { - hci_reject_conn(hdev, &ev->bdaddr); - return; + !hci_bdaddr_list_lookup_with_flags(&hdev->whitelist, &ev->bdaddr, + BDADDR_BREDR)) { + hci_reject_conn(hdev, &ev->bdaddr); + return; } /* Connection accepted */ @@ -2828,7 +2844,7 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) case HCI_AUTO_CONN_LINK_LOSS: if (ev->reason != HCI_ERROR_CONNECTION_TIMEOUT) break; - /* Fall through */ + fallthrough; case HCI_AUTO_CONN_DIRECT: case HCI_AUTO_CONN_ALWAYS: @@ -3068,27 +3084,23 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); + /* Check link security requirements are met */ + if (!hci_conn_check_link_mode(conn)) + ev->status = HCI_ERROR_AUTH_FAILURE; + if (ev->status && conn->state == BT_CONNECTED) { if (ev->status == HCI_ERROR_PIN_OR_KEY_MISSING) set_bit(HCI_CONN_AUTH_FAILURE, &conn->flags); + /* Notify upper layers so they can cleanup before + * disconnecting. + */ + hci_encrypt_cfm(conn, ev->status); hci_disconnect(conn, HCI_ERROR_AUTH_FAILURE); hci_conn_drop(conn); goto unlock; } - /* In Secure Connections Only mode, do not allow any connections - * that are not encrypted with AES-CCM using a P-256 authenticated - * combination key. 
- */ - if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && - (!test_bit(HCI_CONN_AES_CCM, &conn->flags) || - conn->key_type != HCI_LK_AUTH_COMBINATION_P256)) { - hci_connect_cfm(conn, HCI_ERROR_AUTH_FAILURE); - hci_conn_drop(conn); - goto unlock; - } - /* Try reading the encryption key size for encrypted ACL links */ if (!ev->status && ev->encrypt && conn->type == ACL_LINK) { struct hci_cp_read_enc_key_size cp; @@ -4166,6 +4178,9 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, struct inquiry_info_with_rssi_and_pscan_mode *info; info = (void *) (skb->data + 1); + if (skb->len < num_rsp * sizeof(*info) + 1) + goto unlock; + for (; num_rsp; num_rsp--, info++) { u32 flags; @@ -4187,6 +4202,9 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, } else { struct inquiry_info_with_rssi *info = (void *) (skb->data + 1); + if (skb->len < num_rsp * sizeof(*info) + 1) + goto unlock; + for (; num_rsp; num_rsp--, info++) { u32 flags; @@ -4207,6 +4225,7 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, } } +unlock: hci_dev_unlock(hdev); } @@ -4327,7 +4346,7 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, if (hci_setup_sync(conn, conn->link->handle)) goto unlock; } - /* fall through */ + fallthrough; default: conn->state = BT_CLOSED; @@ -4382,7 +4401,7 @@ static void hci_extended_inquiry_result_evt(struct hci_dev *hdev, BT_DBG("%s num_rsp %d", hdev->name, num_rsp); - if (!num_rsp) + if (!num_rsp || skb->len < num_rsp * sizeof(*info) + 1) return; if (hci_dev_test_flag(hdev, HCI_PERIODIC_INQ)) @@ -5212,6 +5231,11 @@ static void hci_le_enh_conn_complete_evt(struct hci_dev *hdev, le16_to_cpu(ev->interval), le16_to_cpu(ev->latency), le16_to_cpu(ev->supervision_timeout)); + + if (use_ll_privacy(hdev) && + hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY) && + hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION)) + hci_req_disable_address_resolution(hdev); } static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, struct sk_buff *skb) @@ -5322,7 +5346,7 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, } conn = hci_connect_le(hdev, addr, addr_type, BT_SECURITY_LOW, - HCI_LE_AUTOCONN_TIMEOUT, HCI_ROLE_MASTER, + hdev->def_le_autoconnect_timeout, HCI_ROLE_MASTER, direct_rpa); if (!IS_ERR(conn)) { /* If HCI_AUTO_CONN_EXPLICIT is set, conn is already owned @@ -5456,14 +5480,15 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, /* Passive scanning shouldn't trigger any device found events, * except for devices marked as CONN_REPORT for which we do send - * device found events. + * device found events, or advertisement monitoring requested. 
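 * While any advertisement monitor is registered, every report must
 * reach the host so that the (currently host-based) pattern matching
 * can run; only once the monitor IDR is empty again does the
 * CONN_REPORT-only filtering below take effect.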
*/ if (hdev->le_scan_type == LE_SCAN_PASSIVE) { if (type == LE_ADV_DIRECT_IND) return; if (!hci_pend_le_action_lookup(&hdev->pend_le_reports, - bdaddr, bdaddr_type)) + bdaddr, bdaddr_type) && + idr_is_empty(&hdev->adv_monitors_idr)) return; if (type == LE_ADV_NONCONN_IND || type == LE_ADV_SCAN_IND) diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 1fc55685da62..e0269192f2e5 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -34,9 +34,6 @@ #define HCI_REQ_PEND 1 #define HCI_REQ_CANCELED 2 -#define LE_SUSPEND_SCAN_WINDOW 0x0012 -#define LE_SUSPEND_SCAN_INTERVAL 0x0400 - void hci_req_init(struct hci_request *req, struct hci_dev *hdev) { skb_queue_head_init(&req->cmd_q); @@ -366,13 +363,11 @@ void __hci_req_write_fast_connectable(struct hci_request *req, bool enable) /* 160 msec page scan interval */ acp.interval = cpu_to_le16(0x0100); } else { - type = PAGE_SCAN_TYPE_STANDARD; /* default */ - - /* default 1.28 sec page scan */ - acp.interval = cpu_to_le16(0x0800); + type = hdev->def_page_scan_type; + acp.interval = cpu_to_le16(hdev->def_page_scan_int); } - acp.window = cpu_to_le16(0x0012); + acp.window = cpu_to_le16(hdev->def_page_scan_window); if (__cpu_to_le16(hdev->page_scan_interval) != acp.interval || __cpu_to_le16(hdev->page_scan_window) != acp.window) @@ -418,18 +413,22 @@ static void __hci_update_background_scan(struct hci_request *req) */ hci_discovery_filter_clear(hdev); + BT_DBG("%s ADV monitoring is %s", hdev->name, + hci_is_adv_monitoring(hdev) ? "on" : "off"); + if (list_empty(&hdev->pend_le_conns) && - list_empty(&hdev->pend_le_reports)) { + list_empty(&hdev->pend_le_reports) && + !hci_is_adv_monitoring(hdev)) { /* If there is no pending LE connections or devices - * to be scanned for, we should stop the background - * scanning. + * to be scanned for or no ADV monitors, we should stop the + * background scanning. */ /* If controller is not scanning we are done. */ if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) return; - hci_req_add_le_scan_disable(req); + hci_req_add_le_scan_disable(req, false); BT_DBG("%s stopping background scanning", hdev->name); } else { @@ -448,7 +447,7 @@ static void __hci_update_background_scan(struct hci_request *req) * don't miss any advertising (due to duplicates filter). 
*/ if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) - hci_req_add_le_scan_disable(req); + hci_req_add_le_scan_disable(req, false); hci_req_add_le_passive_scan(req); @@ -653,7 +652,7 @@ void __hci_req_update_eir(struct hci_request *req) hci_req_add(req, HCI_OP_WRITE_EIR, sizeof(cp), &cp); } -void hci_req_add_le_scan_disable(struct hci_request *req) +void hci_req_add_le_scan_disable(struct hci_request *req, bool rpa_le_conn) { struct hci_dev *hdev = req->hdev; @@ -676,6 +675,15 @@ void hci_req_add_le_scan_disable(struct hci_request *req) cp.enable = LE_SCAN_DISABLE; hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp); } + + /* Disable address resolution */ + if (use_ll_privacy(hdev) && + hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY) && + hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION) && !rpa_le_conn) { + __u8 enable = 0x00; + + hci_req_add(req, HCI_OP_LE_SET_ADDR_RESOLV_ENABLE, 1, &enable); + } } static void del_from_white_list(struct hci_request *req, bdaddr_t *bdaddr, @@ -689,6 +697,21 @@ static void del_from_white_list(struct hci_request *req, bdaddr_t *bdaddr, bt_dev_dbg(req->hdev, "Remove %pMR (0x%x) from whitelist", &cp.bdaddr, cp.bdaddr_type); hci_req_add(req, HCI_OP_LE_DEL_FROM_WHITE_LIST, sizeof(cp), &cp); + + if (use_ll_privacy(req->hdev)) { + struct smp_irk *irk; + + irk = hci_find_irk_by_addr(req->hdev, bdaddr, bdaddr_type); + if (irk) { + struct hci_cp_le_del_from_resolv_list cp; + + cp.bdaddr_type = bdaddr_type; + bacpy(&cp.bdaddr, bdaddr); + + hci_req_add(req, HCI_OP_LE_DEL_FROM_RESOLV_LIST, + sizeof(cp), &cp); + } + } } /* Adds connection to white list if needed. On error, returns -1. */ @@ -709,13 +732,14 @@ static int add_to_white_list(struct hci_request *req, return -1; /* White list can not be used with RPAs */ - if (!allow_rpa && + if (!allow_rpa && !use_ll_privacy(hdev) && hci_find_irk_by_addr(hdev, &params->addr, params->addr_type)) { return -1; } /* During suspend, only wakeable devices can be in whitelist */ - if (hdev->suspended && !params->wakeable) + if (hdev->suspended && !hci_conn_test_flag(HCI_CONN_FLAG_REMOTE_WAKEUP, + params->current_flags)) return 0; *num_entries += 1; @@ -726,6 +750,28 @@ static int add_to_white_list(struct hci_request *req, cp.bdaddr_type); hci_req_add(req, HCI_OP_LE_ADD_TO_WHITE_LIST, sizeof(cp), &cp); + if (use_ll_privacy(hdev)) { + struct smp_irk *irk; + + irk = hci_find_irk_by_addr(hdev, &params->addr, + params->addr_type); + if (irk) { + struct hci_cp_le_add_to_resolv_list cp; + + cp.bdaddr_type = params->addr_type; + bacpy(&cp.bdaddr, &params->addr); + memcpy(cp.peer_irk, irk->val, 16); + + if (hci_dev_test_flag(hdev, HCI_PRIVACY)) + memcpy(cp.local_irk, hdev->irk, 16); + else + memset(cp.local_irk, 0, 16); + + hci_req_add(req, HCI_OP_LE_ADD_TO_RESOLV_LIST, + sizeof(cp), &cp); + } + } + return 0; } @@ -766,7 +812,7 @@ static u8 update_white_list(struct hci_request *req) } /* White list can not be used with RPAs */ - if (!allow_rpa && + if (!allow_rpa && !use_ll_privacy(hdev) && hci_find_irk_by_addr(hdev, &b->bdaddr, b->bdaddr_type)) { return 0x00; } @@ -798,6 +844,14 @@ static u8 update_white_list(struct hci_request *req) return 0x00; } + /* Once controller offloading of advertisement monitors is in place, + * this condition should also check for MSFT extension support. If + * suspend is ongoing, the whitelist should remain the default so that + * random advertisements cannot wake the host. 
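+ *
+ * In short, update_white_list() computes the scan filter_policy: it
+ * returns 0x00 (accept all, let the host filter) as soon as any
+ * required peer cannot live in the whitelist, e.g. an RPA device
+ * without controller-based resolution, a full whitelist, or (below) a
+ * registered advertisement monitor outside of suspend; it returns
+ * 0x01 (whitelist only) otherwise.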
+ */ + if (!idr_is_empty(&hdev->adv_monitors_idr) && !hdev->suspended) + return 0x00; + /* Select filter policy to use white list */ return 0x01; } @@ -808,10 +862,24 @@ static bool scan_use_rpa(struct hci_dev *hdev) } static void hci_req_start_scan(struct hci_request *req, u8 type, u16 interval, - u16 window, u8 own_addr_type, u8 filter_policy) + u16 window, u8 own_addr_type, u8 filter_policy, + bool addr_resolv) { struct hci_dev *hdev = req->hdev; + if (hdev->scanning_paused) { + bt_dev_dbg(hdev, "Scanning is paused for suspend"); + return; + } + + if (use_ll_privacy(hdev) && + hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY) && + addr_resolv) { + u8 enable = 0x01; + + hci_req_add(req, HCI_OP_LE_SET_ADDR_RESOLV_ENABLE, 1, &enable); + } + /* Use ext scanning if set ext scan param and ext scan enable is * supported */ @@ -885,12 +953,39 @@ static void hci_req_start_scan(struct hci_request *req, u8 type, u16 interval, } } +/* Returns true if an le connection is in the scanning state */ +static inline bool hci_is_le_conn_scanning(struct hci_dev *hdev) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type == LE_LINK && c->state == BT_CONNECT && + test_bit(HCI_CONN_SCANNING, &c->flags)) { + rcu_read_unlock(); + return true; + } + } + + rcu_read_unlock(); + + return false; +} + +/* Ensure to call hci_req_add_le_scan_disable() first to disable the + * controller based address resolution to be able to reconfigure + * resolving list. + */ void hci_req_add_le_passive_scan(struct hci_request *req) { struct hci_dev *hdev = req->hdev; u8 own_addr_type; u8 filter_policy; u16 window, interval; + /* Background scanning should run with address resolution */ + bool addr_resolv = true; if (hdev->scanning_paused) { bt_dev_dbg(hdev, "Scanning is paused for suspend"); @@ -927,8 +1022,11 @@ void hci_req_add_le_passive_scan(struct hci_request *req) filter_policy |= 0x02; if (hdev->suspended) { - window = LE_SUSPEND_SCAN_WINDOW; - interval = LE_SUSPEND_SCAN_INTERVAL; + window = hdev->le_scan_window_suspend; + interval = hdev->le_scan_int_suspend; + } else if (hci_is_le_conn_scanning(hdev)) { + window = hdev->le_scan_window_connect; + interval = hdev->le_scan_int_connect; } else { window = hdev->le_scan_window; interval = hdev->le_scan_interval; @@ -936,7 +1034,7 @@ void hci_req_add_le_passive_scan(struct hci_request *req) bt_dev_dbg(hdev, "LE passive scan with whitelist = %d", filter_policy); hci_req_start_scan(req, LE_SCAN_PASSIVE, interval, window, - own_addr_type, filter_policy); + own_addr_type, filter_policy, addr_resolv); } static u8 get_adv_instance_scan_rsp_len(struct hci_dev *hdev, u8 instance) @@ -973,15 +1071,19 @@ static void hci_req_clear_event_filter(struct hci_request *req) static void hci_req_set_event_filter(struct hci_request *req) { - struct bdaddr_list *b; + struct bdaddr_list_with_flags *b; struct hci_cp_set_event_filter f; struct hci_dev *hdev = req->hdev; - u8 scan; + u8 scan = SCAN_DISABLED; /* Always clear event filter when starting */ hci_req_clear_event_filter(req); - list_for_each_entry(b, &hdev->wakeable, list) { + list_for_each_entry(b, &hdev->whitelist, list) { + if (!hci_conn_test_flag(HCI_CONN_FLAG_REMOTE_WAKEUP, + b->current_flags)) + continue; + memset(&f, 0, sizeof(f)); bacpy(&f.addr_conn_flt.bdaddr, &b->bdaddr); f.flt_type = HCI_FLT_CONN_SETUP; @@ -990,16 +1092,17 @@ static void hci_req_set_event_filter(struct hci_request *req) bt_dev_dbg(hdev, "Adding event filters for %pMR", 
&b->bdaddr); hci_req_add(req, HCI_OP_SET_EVENT_FLT, sizeof(f), &f); + scan = SCAN_PAGE; } - scan = !list_empty(&hdev->wakeable) ? SCAN_PAGE : SCAN_DISABLED; hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); } static void hci_req_config_le_suspend_scan(struct hci_request *req) { - /* Can't change params without disabling first */ - hci_req_add_le_scan_disable(req); + /* Before changing params disable scan if enabled */ + if (hci_dev_test_flag(req->hdev, HCI_LE_SCAN)) + hci_req_add_le_scan_disable(req, false); /* Configure params and enable scanning */ hci_req_add_le_passive_scan(req); @@ -1065,8 +1168,9 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next) page_scan = SCAN_DISABLED; hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &page_scan); - /* Disable LE passive scan */ - hci_req_add_le_scan_disable(&req); + /* Disable LE passive scan if enabled */ + if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) + hci_req_add_le_scan_disable(&req, false); /* Mark task needing completion */ set_bit(SUSPEND_SCAN_DISABLE, hdev->suspend_tasks); @@ -1160,13 +1264,8 @@ static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev) void __hci_req_disable_advertising(struct hci_request *req) { if (ext_adv_capable(req->hdev)) { - struct hci_cp_le_set_ext_adv_enable cp; - - cp.enable = 0x00; - /* Disable all sets since we only support one set at the moment */ - cp.num_of_sets = 0x00; + __hci_req_disable_ext_adv_instance(req, 0x00); - hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_ENABLE, sizeof(cp), &cp); } else { u8 enable = 0x00; @@ -1627,6 +1726,28 @@ int hci_req_update_adv_data(struct hci_dev *hdev, u8 instance) return hci_req_run(&req, NULL); } +static void enable_addr_resolution_complete(struct hci_dev *hdev, u8 status, + u16 opcode) +{ + BT_DBG("%s status %u", hdev->name, status); +} + +void hci_req_disable_address_resolution(struct hci_dev *hdev) +{ + struct hci_request req; + __u8 enable = 0x00; + + if (!use_ll_privacy(hdev) && + !hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION)) + return; + + hci_req_init(&req, hdev); + + hci_req_add(&req, HCI_OP_LE_SET_ADDR_RESOLV_ENABLE, 1, &enable); + + hci_req_run(&req, enable_addr_resolution_complete); +} + static void adv_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode) { BT_DBG("%s status %u", hdev->name, status); @@ -1786,8 +1907,6 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) int err; struct adv_info *adv_instance; bool secondary_adv; - /* In ext adv set param interval is 3 octets */ - const u8 adv_interval[3] = { 0x00, 0x08, 0x00 }; if (instance > 0) { adv_instance = hci_find_adv_instance(hdev, instance); @@ -1820,8 +1939,9 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) memset(&cp, 0, sizeof(cp)); - memcpy(cp.min_interval, adv_interval, sizeof(cp.min_interval)); - memcpy(cp.max_interval, adv_interval, sizeof(cp.max_interval)); + /* In ext adv set param interval is 3 octets */ + hci_cpu_to_le24(hdev->le_adv_min_interval, cp.min_interval); + hci_cpu_to_le24(hdev->le_adv_max_interval, cp.max_interval); secondary_adv = (flags & MGMT_ADV_FLAG_SEC_MASK); @@ -1932,13 +2052,59 @@ int __hci_req_enable_ext_advertising(struct hci_request *req, u8 instance) return 0; } +int __hci_req_disable_ext_adv_instance(struct hci_request *req, u8 instance) +{ + struct hci_dev *hdev = req->hdev; + struct hci_cp_le_set_ext_adv_enable *cp; + struct hci_cp_ext_adv_set *adv_set; + u8 data[sizeof(*cp) + sizeof(*adv_set) * 1]; + u8 req_size; + + /* If request specifies an instance that doesn't 
exist, fail */ + if (instance > 0 && !hci_find_adv_instance(hdev, instance)) + return -EINVAL; + + memset(data, 0, sizeof(data)); + + cp = (void *)data; + adv_set = (void *)cp->data; + + /* Instance 0x00 indicates all advertising instances will be disabled */ + cp->num_of_sets = !!instance; + cp->enable = 0x00; + + adv_set->handle = instance; + + req_size = sizeof(*cp) + sizeof(*adv_set) * cp->num_of_sets; + hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_ENABLE, req_size, data); + + return 0; +} + +int __hci_req_remove_ext_adv_instance(struct hci_request *req, u8 instance) +{ + struct hci_dev *hdev = req->hdev; + + /* If request specifies an instance that doesn't exist, fail */ + if (instance > 0 && !hci_find_adv_instance(hdev, instance)) + return -EINVAL; + + hci_req_add(req, HCI_OP_LE_REMOVE_ADV_SET, sizeof(instance), &instance); + + return 0; +} + int __hci_req_start_ext_adv(struct hci_request *req, u8 instance) { struct hci_dev *hdev = req->hdev; + struct adv_info *adv_instance = hci_find_adv_instance(hdev, instance); int err; - if (hci_dev_test_flag(hdev, HCI_LE_ADV)) - __hci_req_disable_advertising(req); + /* If instance isn't pending, the chip knows about it, and it's safe to + * disable + */ + if (adv_instance && !adv_instance->pending) + __hci_req_disable_ext_adv_instance(req, instance); err = __hci_req_setup_ext_adv_instance(req, instance); if (err < 0) @@ -2086,7 +2252,7 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk, hci_dev_test_flag(hdev, HCI_ADVERTISING)) return; - if (next_instance) + if (next_instance && !ext_adv_capable(hdev)) __hci_req_schedule_adv_instance(req, next_instance->instance, false); } @@ -2128,7 +2294,13 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy, if (use_rpa) { int to; - *own_addr_type = ADDR_LE_DEV_RANDOM; + /* If Controller supports LL Privacy use own address type is + * 0x03 + */ + if (use_ll_privacy(hdev)) + *own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED; + else + *own_addr_type = ADDR_LE_DEV_RANDOM; if (!hci_dev_test_and_clear_flag(hdev, HCI_RPA_EXPIRED) && !bacmp(&hdev->random_addr, &hdev->rpa)) @@ -2547,7 +2719,7 @@ static void bg_scan_update(struct work_struct *work) static int le_scan_disable(struct hci_request *req, unsigned long opt) { - hci_req_add_le_scan_disable(req); + hci_req_add_le_scan_disable(req, false); return 0; } @@ -2645,7 +2817,12 @@ static int le_scan_restart(struct hci_request *req, unsigned long opt) if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) return 0; - hci_req_add_le_scan_disable(req); + if (hdev->scanning_paused) { + bt_dev_dbg(hdev, "Scanning is paused for suspend"); + return 0; + } + + hci_req_add_le_scan_disable(req, false); if (use_ext_scan(hdev)) { struct hci_cp_le_set_ext_scan_enable ext_enable_cp; @@ -2725,6 +2902,8 @@ static int active_scan(struct hci_request *req, unsigned long opt) u8 own_addr_type; /* White list is not used for discovery */ u8 filter_policy = 0x00; + /* Discovery doesn't require controller address resolution */ + bool addr_resolv = false; int err; BT_DBG("%s", hdev->name); @@ -2734,7 +2913,7 @@ static int active_scan(struct hci_request *req, unsigned long opt) * discovery scanning parameters. 
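 * The restart then applies hdev->le_scan_int_discovery and
 * le_scan_window_discovery (see the hci_req_start_scan() call below),
 * so the denser discovery duty cycle is only paid while discovery is
 * actually running.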
*/ if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) - hci_req_add_le_scan_disable(req); + hci_req_add_le_scan_disable(req, false); /* All active scans will be done with either a resolvable private * address (when privacy feature has been enabled) or non-resolvable @@ -2745,8 +2924,9 @@ static int active_scan(struct hci_request *req, unsigned long opt) if (err < 0) own_addr_type = ADDR_LE_DEV_PUBLIC; - hci_req_start_scan(req, LE_SCAN_ACTIVE, interval, DISCOV_LE_SCAN_WIN, - own_addr_type, filter_policy); + hci_req_start_scan(req, LE_SCAN_ACTIVE, interval, + hdev->le_scan_window_discovery, own_addr_type, + filter_policy, addr_resolv); return 0; } @@ -2793,18 +2973,18 @@ static void start_discovery(struct hci_dev *hdev, u8 *status) * to do BR/EDR inquiry. */ hci_req_sync(hdev, interleaved_discov, - DISCOV_LE_SCAN_INT * 2, HCI_CMD_TIMEOUT, + hdev->le_scan_int_discovery * 2, HCI_CMD_TIMEOUT, status); break; } timeout = msecs_to_jiffies(hdev->discov_interleaved_timeout); - hci_req_sync(hdev, active_scan, DISCOV_LE_SCAN_INT, + hci_req_sync(hdev, active_scan, hdev->le_scan_int_discovery, HCI_CMD_TIMEOUT, status); break; case DISCOV_TYPE_LE: timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT); - hci_req_sync(hdev, active_scan, DISCOV_LE_SCAN_INT, + hci_req_sync(hdev, active_scan, hdev->le_scan_int_discovery, HCI_CMD_TIMEOUT, status); break; default: @@ -2848,14 +3028,14 @@ bool hci_req_stop_discovery(struct hci_request *req) if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { cancel_delayed_work(&hdev->le_scan_disable); - hci_req_add_le_scan_disable(req); + hci_req_add_le_scan_disable(req, false); } ret = true; } else { /* Passive scanning */ if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { - hci_req_add_le_scan_disable(req); + hci_req_add_le_scan_disable(req, false); ret = true; } } diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index 0e81614d235e..6a12e84c66c4 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -65,11 +65,12 @@ void __hci_req_write_fast_connectable(struct hci_request *req, bool enable); void __hci_req_update_name(struct hci_request *req); void __hci_req_update_eir(struct hci_request *req); -void hci_req_add_le_scan_disable(struct hci_request *req); +void hci_req_add_le_scan_disable(struct hci_request *req, bool rpa_le_conn); void hci_req_add_le_passive_scan(struct hci_request *req); void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next); +void hci_req_disable_address_resolution(struct hci_dev *hdev); void hci_req_reenable_advertising(struct hci_dev *hdev); void __hci_req_enable_advertising(struct hci_request *req); void __hci_req_disable_advertising(struct hci_request *req); @@ -86,6 +87,8 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk, int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance); int __hci_req_start_ext_adv(struct hci_request *req, u8 instance); int __hci_req_enable_ext_advertising(struct hci_request *req, u8 instance); +int __hci_req_disable_ext_adv_instance(struct hci_request *req, u8 instance); +int __hci_req_remove_ext_adv_instance(struct hci_request *req, u8 instance); void __hci_req_clear_ext_adv_sets(struct hci_request *req); int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, bool use_rpa, struct adv_info *adv_instance, diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index caf38a8ea6a8..251b9128f530 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -52,7 +52,7 @@ struct hci_pinfo { struct bt_sock bt; struct 
hci_dev *hdev; struct hci_filter filter; - __u32 cmsg_mask; + __u8 cmsg_mask; unsigned short channel; unsigned long flags; __u32 cookie; @@ -443,8 +443,7 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) case HCI_DEV_SETUP: if (hdev->manufacturer == 0xffff) return NULL; - - /* fall through */ + fallthrough; case HCI_DEV_UP: skb = bt_skb_alloc(HCI_MON_INDEX_INFO_SIZE, GFP_ATOMIC); @@ -1399,7 +1398,7 @@ done: static void hci_sock_cmsg(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { - __u32 mask = hci_pi(sk)->cmsg_mask; + __u8 mask = hci_pi(sk)->cmsg_mask; if (mask & HCI_CMSG_DIR) { int incoming = bt_cb(skb)->incoming; @@ -1842,7 +1841,7 @@ drop: } static int hci_sock_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int len) + sockptr_t optval, unsigned int len) { struct hci_ufilter uf = { .opcode = 0 }; struct sock *sk = sock->sk; @@ -1862,7 +1861,7 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname, switch (optname) { case HCI_DATA_DIR: - if (get_user(opt, (int __user *)optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(opt))) { err = -EFAULT; break; } @@ -1874,7 +1873,7 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname, break; case HCI_TIME_STAMP: - if (get_user(opt, (int __user *)optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(opt))) { err = -EFAULT; break; } @@ -1896,7 +1895,7 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname, } len = min_t(unsigned int, len, sizeof(uf)); - if (copy_from_user(&uf, optval, len)) { + if (copy_from_sockptr(&uf, optval, len)) { err = -EFAULT; break; } diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index 03be6a4baef3..595fb3c9d6c3 100644 --- a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -233,8 +233,6 @@ static const struct proto_ops hidp_sock_ops = { .recvmsg = sock_no_recvmsg, .listen = sock_no_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index fe913a5c754a..ade83e224567 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -666,8 +666,7 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err) l2cap_seq_list_free(&chan->srej_list); l2cap_seq_list_free(&chan->retrans_list); - - /* fall through */ + fallthrough; case L2CAP_MODE_STREAMING: skb_queue_purge(&chan->tx_q); @@ -872,7 +871,8 @@ static inline u8 l2cap_get_auth_type(struct l2cap_chan *chan) else return HCI_AT_NO_BONDING; } - /* fall through */ + fallthrough; + default: switch (chan->sec_level) { case BT_SECURITY_HIGH: @@ -2983,8 +2983,7 @@ static void l2cap_tx_state_wait_f(struct l2cap_chan *chan, break; case L2CAP_EV_RECV_REQSEQ_AND_FBIT: l2cap_process_reqseq(chan, control->reqseq); - - /* Fall through */ + fallthrough; case L2CAP_EV_RECV_FBIT: if (control && control->final) { @@ -3311,7 +3310,7 @@ static inline __u8 l2cap_select_mode(__u8 mode, __u16 remote_feat_mask) case L2CAP_MODE_ERTM: if (l2cap_mode_supported(mode, remote_feat_mask)) return mode; - /* fall through */ + fallthrough; default: return L2CAP_MODE_BASIC; } @@ -3447,7 +3446,7 @@ static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data, size_t data if (__l2cap_efs_supported(chan->conn)) set_bit(FLAG_EFS_ENABLE, &chan->flags); - /* fall through */ + fallthrough; default: 
chan->mode = l2cap_select_mode(rfc.mode, chan->conn->feat_mask); break; @@ -4539,7 +4538,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, goto done; break; } - /* fall through */ + fallthrough; default: l2cap_chan_set_err(chan, ECONNRESET); @@ -7719,7 +7718,7 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) conn->mtu = hcon->hdev->le_mtu; break; } - /* fall through */ + fallthrough; default: conn->mtu = hcon->hdev->acl_mtu; break; @@ -7841,7 +7840,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, case L2CAP_MODE_STREAMING: if (!disable_ertm) break; - /* fall through */ + fallthrough; default: err = -EOPNOTSUPP; goto done; @@ -7893,11 +7892,13 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, else hcon = hci_connect_le_scan(hdev, dst, dst_type, chan->sec_level, - HCI_LE_CONN_TIMEOUT); + HCI_LE_CONN_TIMEOUT, + CONN_REASON_L2CAP_CHAN); } else { u8 auth_type = l2cap_get_auth_type(chan); - hcon = hci_connect_acl(hdev, dst, chan->sec_level, auth_type); + hcon = hci_connect_acl(hdev, dst, chan->sec_level, auth_type, + CONN_REASON_L2CAP_CHAN); } if (IS_ERR(hcon)) { diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index a995d2c51fa7..e1a3e66b1754 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -284,7 +284,7 @@ static int l2cap_sock_listen(struct socket *sock, int backlog) case L2CAP_MODE_STREAMING: if (!disable_ertm) break; - /* fall through */ + fallthrough; default: err = -EOPNOTSUPP; goto done; @@ -703,7 +703,7 @@ static bool l2cap_valid_mtu(struct l2cap_chan *chan, u16 mtu) } static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct l2cap_chan *chan = l2cap_pi(sk)->chan; @@ -736,7 +736,7 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, opts.txwin_size = chan->tx_win; len = min_t(unsigned int, sizeof(opts), optlen); - if (copy_from_user((char *) &opts, optval, len)) { + if (copy_from_sockptr(&opts, optval, len)) { err = -EFAULT; break; } @@ -760,7 +760,7 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, case L2CAP_MODE_STREAMING: if (!disable_ertm) break; - /* fall through */ + fallthrough; default: err = -EINVAL; break; @@ -782,7 +782,7 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, break; case L2CAP_LM: - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } @@ -859,7 +859,7 @@ static int l2cap_set_mode(struct l2cap_chan *chan, u8 mode) } static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct l2cap_chan *chan = l2cap_pi(sk)->chan; @@ -891,7 +891,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, sec.level = BT_SECURITY_LOW; len = min_t(unsigned int, sizeof(sec), optlen); - if (copy_from_user((char *) &sec, optval, len)) { + if (copy_from_sockptr(&sec, optval, len)) { err = -EFAULT; break; } @@ -939,7 +939,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } @@ -954,7 +954,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; 
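	/* The conversions in this function all follow one pattern: a
	 * sockptr_t carries either a kernel or a user pointer, and
	 * copy_from_sockptr() dispatches on that internally. A minimal
	 * sketch of the helper (the in-tree version lives in
	 * include/linux/sockptr.h):
	 *
	 *	static inline int copy_from_sockptr(void *dst, sockptr_t src,
	 *					    size_t size)
	 *	{
	 *		if (sockptr_is_kernel(src)) {
	 *			memcpy(dst, src.kernel, size);
	 *			return 0;
	 *		}
	 *		return copy_from_user(dst, src.user, size) ? -EFAULT : 0;
	 *	}
	 *
	 * so per-protocol setsockopt code no longer needs raw __user casts
	 * and works unchanged when called with kernel buffers.
	 */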
case BT_FLUSHABLE: - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } @@ -990,7 +990,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, pwr.force_active = BT_POWER_FORCE_ACTIVE_ON; len = min_t(unsigned int, sizeof(pwr), optlen); - if (copy_from_user((char *) &pwr, optval, len)) { + if (copy_from_sockptr(&pwr, optval, len)) { err = -EFAULT; break; } @@ -1002,7 +1002,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; case BT_CHANNEL_POLICY: - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } @@ -1050,7 +1050,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (get_user(opt, (u16 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u16))) { err = -EFAULT; break; } @@ -1081,7 +1081,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (get_user(opt, (u8 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u8))) { err = -EFAULT; break; } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 9e8a3cccc6ca..5bbe71002fb9 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -36,9 +36,11 @@ #include "hci_request.h" #include "smp.h" #include "mgmt_util.h" +#include "mgmt_config.h" +#include "msft.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 17 +#define MGMT_REVISION 18 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, @@ -111,6 +113,15 @@ static const u16 mgmt_commands[] = { MGMT_OP_READ_SECURITY_INFO, MGMT_OP_READ_EXP_FEATURES_INFO, MGMT_OP_SET_EXP_FEATURE, + MGMT_OP_READ_DEF_SYSTEM_CONFIG, + MGMT_OP_SET_DEF_SYSTEM_CONFIG, + MGMT_OP_READ_DEF_RUNTIME_CONFIG, + MGMT_OP_SET_DEF_RUNTIME_CONFIG, + MGMT_OP_GET_DEVICE_FLAGS, + MGMT_OP_SET_DEVICE_FLAGS, + MGMT_OP_READ_ADV_MONITOR_FEATURES, + MGMT_OP_ADD_ADV_PATTERNS_MONITOR, + MGMT_OP_REMOVE_ADV_MONITOR, }; static const u16 mgmt_events[] = { @@ -151,6 +162,7 @@ static const u16 mgmt_events[] = { MGMT_EV_EXT_INFO_CHANGED, MGMT_EV_PHY_CONFIGURATION_CHANGED, MGMT_EV_EXP_FEATURE_CHANGED, + MGMT_EV_DEVICE_FLAGS_CHANGED, }; static const u16 mgmt_untrusted_commands[] = { @@ -162,6 +174,8 @@ static const u16 mgmt_untrusted_commands[] = { MGMT_OP_READ_EXT_INFO, MGMT_OP_READ_SECURITY_INFO, MGMT_OP_READ_EXP_FEATURES_INFO, + MGMT_OP_READ_DEF_SYSTEM_CONFIG, + MGMT_OP_READ_DEF_RUNTIME_CONFIG, }; static const u16 mgmt_untrusted_events[] = { @@ -177,6 +191,8 @@ static const u16 mgmt_untrusted_events[] = { MGMT_EV_EXT_INDEX_REMOVED, MGMT_EV_EXT_INFO_CHANGED, MGMT_EV_EXP_FEATURE_CHANGED, + MGMT_EV_ADV_MONITOR_ADDED, + MGMT_EV_ADV_MONITOR_REMOVED, }; #define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000) @@ -779,10 +795,15 @@ static u32 get_supported_settings(struct hci_dev *hdev) if (lmp_le_capable(hdev)) { settings |= MGMT_SETTING_LE; - settings |= MGMT_SETTING_ADVERTISING; settings |= MGMT_SETTING_SECURE_CONN; settings |= MGMT_SETTING_PRIVACY; settings |= MGMT_SETTING_STATIC_ADDRESS; + + /* When the experimental feature for LL Privacy support is + * enabled, then advertising is no longer supported. 
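+ * Controller-based address resolution cannot currently coexist with
+ * the existing advertising code paths, so the setting stays hidden
+ * until the experimental flag is cleared again.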
+ */ + if (!hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) + settings |= MGMT_SETTING_ADVERTISING; } if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) || @@ -2915,7 +2936,7 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, if (cp->addr.type == BDADDR_BREDR) { conn = hci_connect_acl(hdev, &cp->addr.bdaddr, sec_level, - auth_type); + auth_type, CONN_REASON_PAIR_DEVICE); } else { u8 addr_type = le_addr_type(cp->addr.type); struct hci_conn_params *p; @@ -2934,9 +2955,9 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, if (p->auto_connect == HCI_AUTO_CONN_EXPLICIT) p->auto_connect = HCI_AUTO_CONN_DISABLED; - conn = hci_connect_le_scan(hdev, &cp->addr.bdaddr, - addr_type, sec_level, - HCI_LE_CONN_TIMEOUT); + conn = hci_connect_le_scan(hdev, &cp->addr.bdaddr, addr_type, + sec_level, HCI_LE_CONN_TIMEOUT, + CONN_REASON_PAIR_DEVICE); } if (IS_ERR(conn)) { @@ -3037,6 +3058,20 @@ static int cancel_pair_device(struct sock *sk, struct hci_dev *hdev, void *data, err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_CANCEL_PAIR_DEVICE, 0, addr, sizeof(*addr)); + + /* Since user doesn't want to proceed with the connection, abort any + * ongoing pairing and then terminate the link if it was created + * because of the pair device action. + */ + if (addr->type == BDADDR_BREDR) + hci_remove_link_key(hdev, &addr->bdaddr); + else + smp_cancel_and_remove_pairing(hdev, &addr->bdaddr, + le_addr_type(addr->type)); + + if (conn->conn_reason == CONN_REASON_PAIR_DEVICE) + hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM); + unlock: hci_dev_unlock(hdev); return err; @@ -3723,12 +3758,25 @@ static const u8 debug_uuid[16] = { }; #endif +/* 671b10b5-42c0-4696-9227-eb28d1b049d6 */ +static const u8 simult_central_periph_uuid[16] = { + 0xd6, 0x49, 0xb0, 0xd1, 0x28, 0xeb, 0x27, 0x92, + 0x96, 0x46, 0xc0, 0x42, 0xb5, 0x10, 0x1b, 0x67, +}; + +/* 15c0a148-c273-11ea-b3de-0242ac130004 */ +static const u8 rpa_resolution_uuid[16] = { + 0x04, 0x00, 0x13, 0xac, 0x42, 0x02, 0xde, 0xb3, + 0xea, 0x11, 0x73, 0xc2, 0x48, 0xa1, 0xc0, 0x15, +}; + static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { - char buf[42]; + char buf[62]; /* Enough space for 3 features */ struct mgmt_rp_read_exp_features_info *rp = (void *)buf; u16 idx = 0; + u32 flags; bt_dev_dbg(hdev, "sock %p", sk); @@ -3736,7 +3784,7 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, #ifdef CONFIG_BT_FEATURE_DEBUG if (!hdev) { - u32 flags = bt_dbg_get() ? BIT(0) : 0; + flags = bt_dbg_get() ? 
BIT(0) : 0; memcpy(rp->features[idx].uuid, debug_uuid, 16); rp->features[idx].flags = cpu_to_le32(flags); @@ -3744,6 +3792,31 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, } #endif + if (hdev) { + if (test_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks) && + (hdev->le_states[4] & 0x08) && /* Central */ + (hdev->le_states[4] & 0x40) && /* Peripheral */ + (hdev->le_states[3] & 0x10)) /* Simultaneous */ + flags = BIT(0); + else + flags = 0; + + memcpy(rp->features[idx].uuid, simult_central_periph_uuid, 16); + rp->features[idx].flags = cpu_to_le32(flags); + idx++; + } + + if (hdev && use_ll_privacy(hdev)) { + if (hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) + flags = BIT(0) | BIT(1); + else + flags = BIT(1); + + memcpy(rp->features[idx].uuid, rpa_resolution_uuid, 16); + rp->features[idx].flags = cpu_to_le32(flags); + idx++; + } + rp->feature_count = cpu_to_le16(idx); /* After reading the experimental features information, enable @@ -3756,6 +3829,21 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, 0, rp, sizeof(*rp) + (20 * idx)); } +static int exp_ll_privacy_feature_changed(bool enabled, struct hci_dev *hdev, + struct sock *skip) +{ + struct mgmt_ev_exp_feature_changed ev; + + memset(&ev, 0, sizeof(ev)); + memcpy(ev.uuid, rpa_resolution_uuid, 16); + ev.flags = cpu_to_le32((enabled ? BIT(0) : 0) | BIT(1)); + + return mgmt_limited_event(MGMT_EV_EXP_FEATURE_CHANGED, hdev, + &ev, sizeof(ev), + HCI_MGMT_EXP_FEATURE_EVENTS, skip); + +} + #ifdef CONFIG_BT_FEATURE_DEBUG static int exp_debug_feature_changed(bool enabled, struct sock *skip) { @@ -3794,6 +3882,16 @@ static int set_exp_feature(struct sock *sk, struct hci_dev *hdev, } #endif + if (hdev && use_ll_privacy(hdev) && !hdev_is_powered(hdev)) { + bool changed = hci_dev_test_flag(hdev, + HCI_ENABLE_LL_PRIVACY); + + hci_dev_clear_flag(hdev, HCI_ENABLE_LL_PRIVACY); + + if (changed) + exp_ll_privacy_feature_changed(false, hdev, sk); + } + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); return mgmt_cmd_complete(sk, hdev ? 
hdev->id : MGMT_INDEX_NONE, @@ -3844,11 +3942,401 @@ static int set_exp_feature(struct sock *sk, struct hci_dev *hdev, } #endif + if (!memcmp(cp->uuid, rpa_resolution_uuid, 16)) { + bool val, changed; + int err; + u32 flags; + + /* Command requires to use the controller index */ + if (!hdev) + return mgmt_cmd_status(sk, MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_INDEX); + + /* Changes can only be made when controller is powered down */ + if (hdev_is_powered(hdev)) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_NOT_POWERED); + + /* Parameters are limited to a single octet */ + if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); + + /* Only boolean on/off is supported */ + if (cp->param[0] != 0x00 && cp->param[0] != 0x01) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); + + val = !!cp->param[0]; + + if (val) { + changed = !hci_dev_test_flag(hdev, + HCI_ENABLE_LL_PRIVACY); + hci_dev_set_flag(hdev, HCI_ENABLE_LL_PRIVACY); + hci_dev_clear_flag(hdev, HCI_ADVERTISING); + + /* Enable LL privacy + supported settings changed */ + flags = BIT(0) | BIT(1); + } else { + changed = hci_dev_test_flag(hdev, + HCI_ENABLE_LL_PRIVACY); + hci_dev_clear_flag(hdev, HCI_ENABLE_LL_PRIVACY); + + /* Disable LL privacy + supported settings changed */ + flags = BIT(1); + } + + memcpy(rp.uuid, rpa_resolution_uuid, 16); + rp.flags = cpu_to_le32(flags); + + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); + + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, 0, + &rp, sizeof(rp)); + + if (changed) + exp_ll_privacy_feature_changed(val, hdev, sk); + + return err; + } + return mgmt_cmd_status(sk, hdev ? 
hdev->id : MGMT_INDEX_NONE, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_NOT_SUPPORTED); } +#define SUPPORTED_DEVICE_FLAGS() ((1U << HCI_CONN_FLAG_MAX) - 1) + +static int get_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, + u16 data_len) +{ + struct mgmt_cp_get_device_flags *cp = data; + struct mgmt_rp_get_device_flags rp; + struct bdaddr_list_with_flags *br_params; + struct hci_conn_params *params; + u32 supported_flags = SUPPORTED_DEVICE_FLAGS(); + u32 current_flags = 0; + u8 status = MGMT_STATUS_INVALID_PARAMS; + + bt_dev_dbg(hdev, "Get device flags %pMR (type 0x%x)\n", + &cp->addr.bdaddr, cp->addr.type); + + hci_dev_lock(hdev); + + if (cp->addr.type == BDADDR_BREDR) { + br_params = hci_bdaddr_list_lookup_with_flags(&hdev->whitelist, + &cp->addr.bdaddr, + cp->addr.type); + if (!br_params) + goto done; + + current_flags = br_params->current_flags; + } else { + params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, + le_addr_type(cp->addr.type)); + + if (!params) + goto done; + + current_flags = params->current_flags; + } + + bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); + rp.addr.type = cp->addr.type; + rp.supported_flags = cpu_to_le32(supported_flags); + rp.current_flags = cpu_to_le32(current_flags); + + status = MGMT_STATUS_SUCCESS; + +done: + hci_dev_unlock(hdev); + + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_DEVICE_FLAGS, status, + &rp, sizeof(rp)); +} + +static void device_flags_changed(struct sock *sk, struct hci_dev *hdev, + bdaddr_t *bdaddr, u8 bdaddr_type, + u32 supported_flags, u32 current_flags) +{ + struct mgmt_ev_device_flags_changed ev; + + bacpy(&ev.addr.bdaddr, bdaddr); + ev.addr.type = bdaddr_type; + ev.supported_flags = cpu_to_le32(supported_flags); + ev.current_flags = cpu_to_le32(current_flags); + + mgmt_event(MGMT_EV_DEVICE_FLAGS_CHANGED, hdev, &ev, sizeof(ev), sk); +} + +static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, + u16 len) +{ + struct mgmt_cp_set_device_flags *cp = data; + struct bdaddr_list_with_flags *br_params; + struct hci_conn_params *params; + u8 status = MGMT_STATUS_INVALID_PARAMS; + u32 supported_flags = SUPPORTED_DEVICE_FLAGS(); + u32 current_flags = __le32_to_cpu(cp->current_flags); + + bt_dev_dbg(hdev, "Set device flags %pMR (type 0x%x) = 0x%x", + &cp->addr.bdaddr, cp->addr.type, + __le32_to_cpu(current_flags)); + + if ((supported_flags | current_flags) != supported_flags) { + bt_dev_warn(hdev, "Bad flag given (0x%x) vs supported (0x%0x)", + current_flags, supported_flags); + goto done; + } + + hci_dev_lock(hdev); + + if (cp->addr.type == BDADDR_BREDR) { + br_params = hci_bdaddr_list_lookup_with_flags(&hdev->whitelist, + &cp->addr.bdaddr, + cp->addr.type); + + if (br_params) { + br_params->current_flags = current_flags; + status = MGMT_STATUS_SUCCESS; + } else { + bt_dev_warn(hdev, "No such BR/EDR device %pMR (0x%x)", + &cp->addr.bdaddr, cp->addr.type); + } + } else { + params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, + le_addr_type(cp->addr.type)); + if (params) { + params->current_flags = current_flags; + status = MGMT_STATUS_SUCCESS; + } else { + bt_dev_warn(hdev, "No such LE device %pMR (0x%x)", + &cp->addr.bdaddr, + le_addr_type(cp->addr.type)); + } + } + +done: + hci_dev_unlock(hdev); + + if (status == MGMT_STATUS_SUCCESS) + device_flags_changed(sk, hdev, &cp->addr.bdaddr, cp->addr.type, + supported_flags, current_flags); + + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_DEVICE_FLAGS, status, + &cp->addr, sizeof(cp->addr)); +} + +static void mgmt_adv_monitor_added(struct sock *sk, 
struct hci_dev *hdev, + u16 handle) +{ + struct mgmt_ev_adv_monitor_added ev; + + ev.monitor_handle = cpu_to_le16(handle); + + mgmt_event(MGMT_EV_ADV_MONITOR_ADDED, hdev, &ev, sizeof(ev), sk); +} + +static void mgmt_adv_monitor_removed(struct sock *sk, struct hci_dev *hdev, + u16 handle) +{ + struct mgmt_ev_adv_monitor_added ev; + + ev.monitor_handle = cpu_to_le16(handle); + + mgmt_event(MGMT_EV_ADV_MONITOR_REMOVED, hdev, &ev, sizeof(ev), sk); +} + +static int read_adv_mon_features(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + struct adv_monitor *monitor = NULL; + struct mgmt_rp_read_adv_monitor_features *rp = NULL; + int handle; + size_t rp_size = 0; + __u32 supported = 0; + __u16 num_handles = 0; + __u16 handles[HCI_MAX_ADV_MONITOR_NUM_HANDLES]; + + BT_DBG("request for %s", hdev->name); + + hci_dev_lock(hdev); + + if (msft_get_features(hdev) & MSFT_FEATURE_MASK_LE_ADV_MONITOR) + supported |= MGMT_ADV_MONITOR_FEATURE_MASK_OR_PATTERNS; + + idr_for_each_entry(&hdev->adv_monitors_idr, monitor, handle) { + handles[num_handles++] = monitor->handle; + } + + hci_dev_unlock(hdev); + + rp_size = sizeof(*rp) + (num_handles * sizeof(u16)); + rp = kmalloc(rp_size, GFP_KERNEL); + if (!rp) + return -ENOMEM; + + /* Once controller-based monitoring is in place, the enabled_features + * should reflect the use. + */ + rp->supported_features = cpu_to_le32(supported); + rp->enabled_features = 0; + rp->max_num_handles = cpu_to_le16(HCI_MAX_ADV_MONITOR_NUM_HANDLES); + rp->max_num_patterns = HCI_MAX_ADV_MONITOR_NUM_PATTERNS; + rp->num_handles = cpu_to_le16(num_handles); + if (num_handles) + memcpy(&rp->handles, &handles, (num_handles * sizeof(u16))); + + return mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_READ_ADV_MONITOR_FEATURES, + MGMT_STATUS_SUCCESS, rp, rp_size); +} + +static int add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + struct mgmt_cp_add_adv_patterns_monitor *cp = data; + struct mgmt_rp_add_adv_patterns_monitor rp; + struct adv_monitor *m = NULL; + struct adv_pattern *p = NULL; + unsigned int mp_cnt = 0, prev_adv_monitors_cnt; + __u8 cp_ofst = 0, cp_len = 0; + int err, i; + + BT_DBG("request for %s", hdev->name); + + if (len <= sizeof(*cp) || cp->pattern_count == 0) { + err = mgmt_cmd_status(sk, hdev->id, + MGMT_OP_ADD_ADV_PATTERNS_MONITOR, + MGMT_STATUS_INVALID_PARAMS); + goto failed; + } + + m = kmalloc(sizeof(*m), GFP_KERNEL); + if (!m) { + err = -ENOMEM; + goto failed; + } + + INIT_LIST_HEAD(&m->patterns); + m->active = false; + + for (i = 0; i < cp->pattern_count; i++) { + if (++mp_cnt > HCI_MAX_ADV_MONITOR_NUM_PATTERNS) { + err = mgmt_cmd_status(sk, hdev->id, + MGMT_OP_ADD_ADV_PATTERNS_MONITOR, + MGMT_STATUS_INVALID_PARAMS); + goto failed; + } + + cp_ofst = cp->patterns[i].offset; + cp_len = cp->patterns[i].length; + if (cp_ofst >= HCI_MAX_AD_LENGTH || + cp_len > HCI_MAX_AD_LENGTH || + (cp_ofst + cp_len) > HCI_MAX_AD_LENGTH) { + err = mgmt_cmd_status(sk, hdev->id, + MGMT_OP_ADD_ADV_PATTERNS_MONITOR, + MGMT_STATUS_INVALID_PARAMS); + goto failed; + } + + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (!p) { + err = -ENOMEM; + goto failed; + } + + p->ad_type = cp->patterns[i].ad_type; + p->offset = cp->patterns[i].offset; + p->length = cp->patterns[i].length; + memcpy(p->value, cp->patterns[i].value, p->length); + + INIT_LIST_HEAD(&p->list); + list_add(&p->list, &m->patterns); + } + + if (mp_cnt != cp->pattern_count) { + err = mgmt_cmd_status(sk, hdev->id, + MGMT_OP_ADD_ADV_PATTERNS_MONITOR, + MGMT_STATUS_INVALID_PARAMS); + goto failed; + 
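+ /* Example of the offset/length validation above: with
+ * HCI_MAX_AD_LENGTH == 31, a pattern at offset 29 with length 4 is
+ * rejected (29 + 4 > 31) even though each field is in range on its
+ * own, because the match window must lie entirely inside a single
+ * advertising report.
+ */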
} + + hci_dev_lock(hdev); + + prev_adv_monitors_cnt = hdev->adv_monitors_cnt; + + err = hci_add_adv_monitor(hdev, m); + if (err) { + if (err == -ENOSPC) { + mgmt_cmd_status(sk, hdev->id, + MGMT_OP_ADD_ADV_PATTERNS_MONITOR, + MGMT_STATUS_NO_RESOURCES); + } + goto unlock; + } + + if (hdev->adv_monitors_cnt > prev_adv_monitors_cnt) + mgmt_adv_monitor_added(sk, hdev, m->handle); + + hci_dev_unlock(hdev); + + rp.monitor_handle = cpu_to_le16(m->handle); + + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_ADV_PATTERNS_MONITOR, + MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); + +unlock: + hci_dev_unlock(hdev); + +failed: + hci_free_adv_monitor(m); + return err; +} + +static int remove_adv_monitor(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + struct mgmt_cp_remove_adv_monitor *cp = data; + struct mgmt_rp_remove_adv_monitor rp; + unsigned int prev_adv_monitors_cnt; + u16 handle; + int err; + + BT_DBG("request for %s", hdev->name); + + hci_dev_lock(hdev); + + handle = __le16_to_cpu(cp->monitor_handle); + prev_adv_monitors_cnt = hdev->adv_monitors_cnt; + + err = hci_remove_adv_monitor(hdev, handle); + if (err == -ENOENT) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADV_MONITOR, + MGMT_STATUS_INVALID_INDEX); + goto unlock; + } + + if (hdev->adv_monitors_cnt < prev_adv_monitors_cnt) + mgmt_adv_monitor_removed(sk, hdev, handle); + + hci_dev_unlock(hdev); + + rp.monitor_handle = cp->monitor_handle; + + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_ADV_MONITOR, + MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); + +unlock: + hci_dev_unlock(hdev); + return err; +} + static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { @@ -4147,7 +4635,7 @@ static bool discovery_type_is_valid(struct hci_dev *hdev, uint8_t type, *mgmt_status = mgmt_le_support(hdev); if (*mgmt_status) return false; - /* Intentional fall-through */ + fallthrough; case DISCOV_TYPE_BREDR: *mgmt_status = mgmt_bredr_support(hdev); if (*mgmt_status) @@ -4662,6 +5150,13 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data, return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING, status); + /* Enabling the experimental LL Privacy support disables support for + * advertising. 
static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { @@ -4147,7 +4635,7 @@ static bool discovery_type_is_valid(struct hci_dev *hdev, uint8_t type, *mgmt_status = mgmt_le_support(hdev); if (*mgmt_status) return false; - /* Intentional fall-through */ + fallthrough; case DISCOV_TYPE_BREDR: *mgmt_status = mgmt_bredr_support(hdev); if (*mgmt_status) @@ -4662,6 +5150,13 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data, return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING, status); + /* Enabling the experimental LL Privacy support disables support for + * advertising. + */ + if (hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING, + MGMT_STATUS_NOT_SUPPORTED); + if (cp->val != 0x00 && cp->val != 0x01 && cp->val != 0x02) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING, MGMT_STATUS_INVALID_PARAMS); @@ -4848,7 +5343,7 @@ static int set_scan_params(struct sock *sk, struct hci_dev *hdev, hci_req_init(&req, hdev); - hci_req_add_le_scan_disable(&req); + hci_req_add_le_scan_disable(&req, false); hci_req_add_le_passive_scan(&req); hci_req_run(&req, NULL); @@ -5523,7 +6018,7 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, case MGMT_LTK_P256_DEBUG: authenticated = 0x00; type = SMP_LTK_P256_DEBUG; - /* fall through */ + fallthrough; default: continue; } @@ -5966,7 +6461,9 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, { struct mgmt_cp_add_device *cp = data; u8 auto_conn, addr_type; + struct hci_conn_params *params; int err; + u32 current_flags = 0; bt_dev_dbg(hdev, "sock %p", sk); @@ -5993,8 +6490,9 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, goto unlock; } - err = hci_bdaddr_list_add(&hdev->whitelist, &cp->addr.bdaddr, - cp->addr.type); + err = hci_bdaddr_list_add_with_flags(&hdev->whitelist, + &cp->addr.bdaddr, + cp->addr.type, 0); if (err) goto unlock; @@ -6033,12 +6531,19 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, MGMT_STATUS_FAILED, &cp->addr, sizeof(cp->addr)); goto unlock; + } else { + params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, + addr_type); + if (params) + current_flags = params->current_flags; } hci_update_background_scan(hdev); added: device_added(sk, hdev, &cp->addr.bdaddr, cp->addr.type, cp->action); + device_flags_changed(NULL, hdev, &cp->addr.bdaddr, cp->addr.type, + SUPPORTED_DEVICE_FLAGS(), current_flags); err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, MGMT_STATUS_SUCCESS, &cp->addr, @@ -6724,6 +7229,13 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev, return mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_ADV_FEATURES, MGMT_STATUS_REJECTED); + /* Enabling the experimental LL Privacy support disables support for + * advertising. + */ + if (hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_ADV_FEATURES, + MGMT_STATUS_NOT_SUPPORTED); + hci_dev_lock(hdev); rp_len = sizeof(*rp) + hdev->adv_instance_cnt;
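/* [Editorial note] The HCI_ENABLE_LL_PRIVACY guard above reappears in
 * set_advertising(), read_adv_features(), add_advertising() and
 * remove_advertising(); each handler has to report the failure against its
 * own opcode. A hypothetical helper sketch that keeps the check in one
 * place (mgmt_ll_privacy_blocks_adv() is not part of the patch):
 */
static bool mgmt_ll_privacy_blocks_adv(struct hci_dev *hdev)
{
	/* advertising is unsupported while the experimental LL privacy
	 * flag is set on the controller
	 */
	return hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY);
}

/* usage in a handler:
 *	if (mgmt_ll_privacy_blocks_adv(hdev))
 *		return mgmt_cmd_status(sk, hdev->id, <handler's own opcode>,
 *				       MGMT_STATUS_NOT_SUPPORTED);
 */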
@@ -6927,6 +7439,13 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, status); + /* Enabling the experimental LL Privacy support disables support for + * advertising. + */ + if (hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, + MGMT_STATUS_NOT_SUPPORTED); + if (cp->instance < 1 || cp->instance > HCI_MAX_ADV_INSTANCES) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, MGMT_STATUS_INVALID_PARAMS); @@ -7091,6 +7610,13 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, bt_dev_dbg(hdev, "sock %p", sk); + /* Enabling the experimental LL Privacy support disables support for + * advertising. + */ + if (hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING, + MGMT_STATUS_NOT_SUPPORTED); + hci_dev_lock(hdev); if (cp->instance && !hci_find_adv_instance(hdev, cp->instance)) { @@ -7116,6 +7642,12 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, hci_req_init(&req, hdev); + /* If we use extended advertising, instance is disabled and removed */ + if (ext_adv_capable(hdev)) { + __hci_req_disable_ext_adv_instance(&req, cp->instance); + __hci_req_remove_ext_adv_instance(&req, cp->instance); + } + hci_req_clear_adv_instance(hdev, sk, &req, cp->instance, true); if (list_empty(&hdev->adv_instances)) @@ -7297,6 +7829,20 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { set_exp_feature, MGMT_SET_EXP_FEATURE_SIZE, HCI_MGMT_VAR_LEN | HCI_MGMT_HDEV_OPTIONAL }, + { read_def_system_config, MGMT_READ_DEF_SYSTEM_CONFIG_SIZE, + HCI_MGMT_UNTRUSTED }, + { set_def_system_config, MGMT_SET_DEF_SYSTEM_CONFIG_SIZE, + HCI_MGMT_VAR_LEN }, + { read_def_runtime_config, MGMT_READ_DEF_RUNTIME_CONFIG_SIZE, + HCI_MGMT_UNTRUSTED }, + { set_def_runtime_config, MGMT_SET_DEF_RUNTIME_CONFIG_SIZE, + HCI_MGMT_VAR_LEN }, + { get_device_flags, MGMT_GET_DEVICE_FLAGS_SIZE }, + { set_device_flags, MGMT_SET_DEVICE_FLAGS_SIZE }, + { read_adv_mon_features, MGMT_READ_ADV_MONITOR_FEATURES_SIZE }, + { add_adv_patterns_monitor, MGMT_ADD_ADV_PATTERNS_MONITOR_SIZE, + HCI_MGMT_VAR_LEN }, + { remove_adv_monitor, MGMT_REMOVE_ADV_MONITOR_SIZE }, }; void mgmt_index_added(struct hci_dev *hdev) @@ -8216,8 +8762,11 @@ void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, if (!hci_discovery_active(hdev)) { if (link_type == ACL_LINK) return; - if (link_type == LE_LINK && list_empty(&hdev->pend_le_reports)) + if (link_type == LE_LINK && + list_empty(&hdev->pend_le_reports) && + !hci_is_adv_monitoring(hdev)) { return; + } } if (hdev->discovery.result_filtering) { diff --git a/net/bluetooth/mgmt_config.c b/net/bluetooth/mgmt_config.c new file mode 100644 index 000000000000..b30b571f8caf --- /dev/null +++ b/net/bluetooth/mgmt_config.c @@ -0,0 +1,283 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright (C) 2020 Google Corporation + */ + +#include <net/bluetooth/bluetooth.h> +#include <net/bluetooth/hci_core.h> +#include <net/bluetooth/mgmt.h> + +#include "mgmt_util.h" +#include "mgmt_config.h" + +#define HDEV_PARAM_U16(_param_code_, _param_name_) \ +{ \ + { cpu_to_le16(_param_code_), sizeof(__u16) }, \ + { cpu_to_le16(hdev->_param_name_) } \ +} + +#define HDEV_PARAM_U16_JIFFIES_TO_MSECS(_param_code_, _param_name_) \ +{ \ + { cpu_to_le16(_param_code_), sizeof(__u16) }, \ + { cpu_to_le16(jiffies_to_msecs(hdev->_param_name_)) } \ +} + +int read_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, + u16 data_len) +{ + struct { + struct mgmt_tlv entry; + union { + /* This is a simplification for now since all values + * are 16 bits. In the future, this code may need + * refactoring to account for variable length values + * and properly calculate the required buffer size.
+ */ + __le16 value; + }; + } __packed params[] = { + /* Please see mgmt-api.txt for documentation of these values */ + HDEV_PARAM_U16(0x0000, def_page_scan_type), + HDEV_PARAM_U16(0x0001, def_page_scan_int), + HDEV_PARAM_U16(0x0002, def_page_scan_window), + HDEV_PARAM_U16(0x0003, def_inq_scan_type), + HDEV_PARAM_U16(0x0004, def_inq_scan_int), + HDEV_PARAM_U16(0x0005, def_inq_scan_window), + HDEV_PARAM_U16(0x0006, def_br_lsto), + HDEV_PARAM_U16(0x0007, def_page_timeout), + HDEV_PARAM_U16(0x0008, sniff_min_interval), + HDEV_PARAM_U16(0x0009, sniff_max_interval), + HDEV_PARAM_U16(0x000a, le_adv_min_interval), + HDEV_PARAM_U16(0x000b, le_adv_max_interval), + HDEV_PARAM_U16(0x000c, def_multi_adv_rotation_duration), + HDEV_PARAM_U16(0x000d, le_scan_interval), + HDEV_PARAM_U16(0x000e, le_scan_window), + HDEV_PARAM_U16(0x000f, le_scan_int_suspend), + HDEV_PARAM_U16(0x0010, le_scan_window_suspend), + HDEV_PARAM_U16(0x0011, le_scan_int_discovery), + HDEV_PARAM_U16(0x0012, le_scan_window_discovery), + HDEV_PARAM_U16(0x0013, le_scan_int_adv_monitor), + HDEV_PARAM_U16(0x0014, le_scan_window_adv_monitor), + HDEV_PARAM_U16(0x0015, le_scan_int_connect), + HDEV_PARAM_U16(0x0016, le_scan_window_connect), + HDEV_PARAM_U16(0x0017, le_conn_min_interval), + HDEV_PARAM_U16(0x0018, le_conn_max_interval), + HDEV_PARAM_U16(0x0019, le_conn_latency), + HDEV_PARAM_U16(0x001a, le_supv_timeout), + HDEV_PARAM_U16_JIFFIES_TO_MSECS(0x001b, + def_le_autoconnect_timeout), + }; + struct mgmt_rp_read_def_system_config *rp = (void *)params; + + bt_dev_dbg(hdev, "sock %p", sk); + + return mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_READ_DEF_SYSTEM_CONFIG, + 0, rp, sizeof(params)); +} + +#define TO_TLV(x) ((struct mgmt_tlv *)(x)) +#define TLV_GET_LE16(tlv) le16_to_cpu(*((__le16 *)(TO_TLV(tlv)->value))) + +int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, + u16 data_len) +{ + u16 buffer_left = data_len; + u8 *buffer = data; + + if (buffer_left < sizeof(struct mgmt_tlv)) { + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_DEF_SYSTEM_CONFIG, + MGMT_STATUS_INVALID_PARAMS); + } + + /* First pass to validate the tlv */ + while (buffer_left >= sizeof(struct mgmt_tlv)) { + const u8 len = TO_TLV(buffer)->length; + const u16 exp_len = sizeof(struct mgmt_tlv) + + len; + const u16 type = le16_to_cpu(TO_TLV(buffer)->type); + + if (buffer_left < exp_len) { + bt_dev_warn(hdev, "invalid len left %d, exp >= %d", + buffer_left, exp_len); + + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_DEF_SYSTEM_CONFIG, + MGMT_STATUS_INVALID_PARAMS); + } + + /* Please see mgmt-api.txt for documentation of these values */ + switch (type) { + case 0x0000: + case 0x0001: + case 0x0002: + case 0x0003: + case 0x0004: + case 0x0005: + case 0x0006: + case 0x0007: + case 0x0008: + case 0x0009: + case 0x000a: + case 0x000b: + case 0x000c: + case 0x000d: + case 0x000e: + case 0x000f: + case 0x0010: + case 0x0011: + case 0x0012: + case 0x0013: + case 0x0014: + case 0x0015: + case 0x0016: + case 0x0017: + case 0x0018: + case 0x0019: + case 0x001a: + case 0x001b: + if (len != sizeof(u16)) { + bt_dev_warn(hdev, "invalid length %d, exp %zu for type %d", + len, sizeof(u16), type); + + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_DEF_SYSTEM_CONFIG, + MGMT_STATUS_INVALID_PARAMS); + } + break; + default: + bt_dev_warn(hdev, "unsupported parameter %u", type); + break; + } + + buffer_left -= exp_len; + buffer += exp_len; + } + + buffer_left = data_len; + buffer = data; + while (buffer_left >= sizeof(struct mgmt_tlv)) { + const u8 len = 
TO_TLV(buffer)->length; + const u16 exp_len = sizeof(struct mgmt_tlv) + + len; + const u16 type = le16_to_cpu(TO_TLV(buffer)->type); + + switch (type) { + case 0x0000: + hdev->def_page_scan_type = TLV_GET_LE16(buffer); + break; + case 0x0001: + hdev->def_page_scan_int = TLV_GET_LE16(buffer); + break; + case 0x0002: + hdev->def_page_scan_window = TLV_GET_LE16(buffer); + break; + case 0x0003: + hdev->def_inq_scan_type = TLV_GET_LE16(buffer); + break; + case 0x0004: + hdev->def_inq_scan_int = TLV_GET_LE16(buffer); + break; + case 0x0005: + hdev->def_inq_scan_window = TLV_GET_LE16(buffer); + break; + case 0x0006: + hdev->def_br_lsto = TLV_GET_LE16(buffer); + break; + case 0x0007: + hdev->def_page_timeout = TLV_GET_LE16(buffer); + break; + case 0x0008: + hdev->sniff_min_interval = TLV_GET_LE16(buffer); + break; + case 0x0009: + hdev->sniff_max_interval = TLV_GET_LE16(buffer); + break; + case 0x000a: + hdev->le_adv_min_interval = TLV_GET_LE16(buffer); + break; + case 0x000b: + hdev->le_adv_max_interval = TLV_GET_LE16(buffer); + break; + case 0x000c: + hdev->def_multi_adv_rotation_duration = + TLV_GET_LE16(buffer); + break; + case 0x000d: + hdev->le_scan_interval = TLV_GET_LE16(buffer); + break; + case 0x000e: + hdev->le_scan_window = TLV_GET_LE16(buffer); + break; + case 0x000f: + hdev->le_scan_int_suspend = TLV_GET_LE16(buffer); + break; + case 0x0010: + hdev->le_scan_window_suspend = TLV_GET_LE16(buffer); + break; + case 0x0011: + hdev->le_scan_int_discovery = TLV_GET_LE16(buffer); + break; + case 0x0012: + hdev->le_scan_window_discovery = TLV_GET_LE16(buffer); + break; + case 0x0013: + hdev->le_scan_int_adv_monitor = TLV_GET_LE16(buffer); + break; + case 0x0014: + hdev->le_scan_window_adv_monitor = TLV_GET_LE16(buffer); + break; + case 0x0015: + hdev->le_scan_int_connect = TLV_GET_LE16(buffer); + break; + case 0x0016: + hdev->le_scan_window_connect = TLV_GET_LE16(buffer); + break; + case 0x0017: + hdev->le_conn_min_interval = TLV_GET_LE16(buffer); + break; + case 0x0018: + hdev->le_conn_max_interval = TLV_GET_LE16(buffer); + break; + case 0x0019: + hdev->le_conn_latency = TLV_GET_LE16(buffer); + break; + case 0x001a: + hdev->le_supv_timeout = TLV_GET_LE16(buffer); + break; + case 0x001b: + hdev->def_le_autoconnect_timeout = + msecs_to_jiffies(TLV_GET_LE16(buffer)); + break; + default: + bt_dev_warn(hdev, "unsupported parameter %u", type); + break; + } + + buffer_left -= exp_len; + buffer += exp_len; + } + + return mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_SET_DEF_SYSTEM_CONFIG, 0, NULL, 0); +} + +int read_def_runtime_config(struct sock *sk, struct hci_dev *hdev, void *data, + u16 data_len) +{ + bt_dev_dbg(hdev, "sock %p", sk); + + return mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_READ_DEF_RUNTIME_CONFIG, 0, NULL, 0); +} + +int set_def_runtime_config(struct sock *sk, struct hci_dev *hdev, void *data, + u16 data_len) +{ + bt_dev_dbg(hdev, "sock %p", sk); + + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEF_RUNTIME_CONFIG, + MGMT_STATUS_INVALID_PARAMS); +} diff --git a/net/bluetooth/mgmt_config.h b/net/bluetooth/mgmt_config.h new file mode 100644 index 000000000000..a4965f107891 --- /dev/null +++ b/net/bluetooth/mgmt_config.h @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright (C) 2020 Google Corporation + */ + +int read_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, + u16 data_len); + +int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, + u16 data_len); + +int read_def_runtime_config(struct sock *sk, struct
hci_dev *hdev, void *data, + u16 data_len); + +int set_def_runtime_config(struct sock *sk, struct hci_dev *hdev, void *data, + u16 data_len); diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c index d6c4e6b5ae77..8579bfeb2836 100644 --- a/net/bluetooth/msft.c +++ b/net/bluetooth/msft.c @@ -139,3 +139,10 @@ void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb) bt_dev_dbg(hdev, "MSFT vendor event %u", event); } + +__u64 msft_get_features(struct hci_dev *hdev) +{ + struct msft_data *msft = hdev->msft_data; + + return msft ? msft->features : 0; +} diff --git a/net/bluetooth/msft.h b/net/bluetooth/msft.h index 5aa9130e1f8a..e9c478e890b8 100644 --- a/net/bluetooth/msft.h +++ b/net/bluetooth/msft.h @@ -3,16 +3,25 @@ * Copyright (C) 2020 Google Corporation */ +#define MSFT_FEATURE_MASK_BREDR_RSSI_MONITOR BIT(0) +#define MSFT_FEATURE_MASK_LE_CONN_RSSI_MONITOR BIT(1) +#define MSFT_FEATURE_MASK_LE_ADV_RSSI_MONITOR BIT(2) +#define MSFT_FEATURE_MASK_LE_ADV_MONITOR BIT(3) +#define MSFT_FEATURE_MASK_CURVE_VALIDITY BIT(4) +#define MSFT_FEATURE_MASK_CONCURRENT_ADV_MONITOR BIT(5) + #if IS_ENABLED(CONFIG_BT_MSFTEXT) void msft_do_open(struct hci_dev *hdev); void msft_do_close(struct hci_dev *hdev); void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb); +__u64 msft_get_features(struct hci_dev *hdev); #else static inline void msft_do_open(struct hci_dev *hdev) {} static inline void msft_do_close(struct hci_dev *hdev) {} static inline void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb) {} +static inline __u64 msft_get_features(struct hci_dev *hdev) { return 0; } #endif diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 2e20af317cea..f2bacb464ccf 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -479,7 +479,7 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err) /* if closing a dlc in a session that hasn't been started, * just close and unlink the dlc */ - /* fall through */ + fallthrough; default: rfcomm_dlc_clear_timer(d); diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index df14eebe80da..ae6f80730561 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -218,7 +218,7 @@ static void __rfcomm_sock_close(struct sock *sk) case BT_CONFIG: case BT_CONNECTED: rfcomm_dlc_close(d, 0); - /* fall through */ + fallthrough; default: sock_set_flag(sk, SOCK_ZAPPED); @@ -644,7 +644,8 @@ static int rfcomm_sock_recvmsg(struct socket *sock, struct msghdr *msg, return len; } -static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __user *optval, unsigned int optlen) +static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int err = 0; @@ -656,7 +657,7 @@ static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __u switch (optname) { case RFCOMM_LM: - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } @@ -685,7 +686,8 @@ static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __u return err; } -static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) +static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct bt_security sec; @@ -713,7 +715,7 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int 
level, int optname, c sec.level = BT_SECURITY_LOW; len = min_t(unsigned int, sizeof(sec), optlen); - if (copy_from_user((char *) &sec, optval, len)) { + if (copy_from_sockptr(&sec, optval, len)) { err = -EFAULT; break; } @@ -732,7 +734,7 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, c break; } - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index c8c3d38cdc7b..dcf7f96ff417 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -66,6 +66,7 @@ struct sco_pinfo { bdaddr_t dst; __u32 flags; __u16 setting; + __u8 cmsg_mask; struct sco_conn *conn; }; @@ -449,6 +450,15 @@ static void sco_sock_close(struct sock *sk) sco_sock_kill(sk); } +static void sco_skb_put_cmsg(struct sk_buff *skb, struct msghdr *msg, + struct sock *sk) +{ + if (sco_pi(sk)->cmsg_mask & SCO_CMSG_PKT_STATUS) + put_cmsg(msg, SOL_BLUETOOTH, BT_SCM_PKT_STATUS, + sizeof(bt_cb(skb)->sco.pkt_status), + &bt_cb(skb)->sco.pkt_status); +} + static void sco_sock_init(struct sock *sk, struct sock *parent) { BT_DBG("sk %p", sk); @@ -457,6 +467,8 @@ static void sco_sock_init(struct sock *sk, struct sock *parent) sk->sk_type = parent->sk_type; bt_sk(sk)->flags = bt_sk(parent)->flags; security_sk_clone(parent, sk); + } else { + bt_sk(sk)->skb_put_cmsg = sco_skb_put_cmsg; } } @@ -791,7 +803,7 @@ static int sco_sock_recvmsg(struct socket *sock, struct msghdr *msg, } static int sco_sock_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int len, err = 0; @@ -810,7 +822,7 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } @@ -831,7 +843,7 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, voice.setting = sco_pi(sk)->setting; len = min_t(unsigned int, sizeof(voice), optlen); - if (copy_from_user((char *)&voice, optval, len)) { + if (copy_from_sockptr(&voice, optval, len)) { err = -EFAULT; break; } @@ -846,6 +858,18 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, sco_pi(sk)->setting = voice.setting; break; + case BT_PKT_STATUS: + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { + err = -EFAULT; + break; + } + + if (opt) + sco_pi(sk)->cmsg_mask |= SCO_CMSG_PKT_STATUS; + else + sco_pi(sk)->cmsg_mask &= ~SCO_CMSG_PKT_STATUS; + break; + default: err = -ENOPROTOOPT; break; @@ -923,6 +947,7 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, int len, err = 0; struct bt_voice voice; u32 phys; + int pkt_status; BT_DBG("sk %p", sk); @@ -969,6 +994,13 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, err = -EFAULT; break; + case BT_PKT_STATUS: + pkt_status = (sco_pi(sk)->cmsg_mask & SCO_CMSG_PKT_STATUS); + + if (put_user(pkt_status, (int __user *)optval)) + err = -EFAULT; + break; + default: err = -ENOPROTOOPT; break; diff --git a/net/bluetooth/selftest.c b/net/bluetooth/selftest.c index 03e3c89c3046..f71c6fa65fb3 100644 --- a/net/bluetooth/selftest.c +++ b/net/bluetooth/selftest.c @@ -205,7 +205,7 @@ static int __init test_ecdh(void) calltime = ktime_get(); - tfm = crypto_alloc_kpp("ecdh", CRYPTO_ALG_INTERNAL, 0); + tfm = crypto_alloc_kpp("ecdh", 0, 0); if (IS_ERR(tfm)) { BT_ERR("Unable to create
ECDH crypto context"); err = PTR_ERR(tfm); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index c2c5ab05fa7e..433227f96c73 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1387,7 +1387,7 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) goto zfree_smp; } - smp->tfm_ecdh = crypto_alloc_kpp("ecdh", CRYPTO_ALG_INTERNAL, 0); + smp->tfm_ecdh = crypto_alloc_kpp("ecdh", 0, 0); if (IS_ERR(smp->tfm_ecdh)) { BT_ERR("Unable to create ECDH crypto context"); goto free_shash; @@ -1654,7 +1654,7 @@ int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) memset(smp->tk, 0, sizeof(smp->tk)); BT_DBG("PassKey: %d", value); put_unaligned_le32(value, smp->tk); - /* Fall Through */ + fallthrough; case MGMT_OP_USER_CONFIRM_REPLY: set_bit(SMP_FLAG_TK_VALID, &smp->flags); break; @@ -3282,7 +3282,7 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) return ERR_CAST(tfm_cmac); } - tfm_ecdh = crypto_alloc_kpp("ecdh", CRYPTO_ALG_INTERNAL, 0); + tfm_ecdh = crypto_alloc_kpp("ecdh", 0, 0); if (IS_ERR(tfm_ecdh)) { BT_ERR("Unable to create ECDH crypto context"); crypto_free_shash(tfm_cmac); @@ -3847,7 +3847,7 @@ int __init bt_selftest_smp(void) return PTR_ERR(tfm_cmac); } - tfm_ecdh = crypto_alloc_kpp("ecdh", CRYPTO_ALG_INTERNAL, 0); + tfm_ecdh = crypto_alloc_kpp("ecdh", 0, 0); if (IS_ERR(tfm_ecdh)) { BT_ERR("Unable to create ECDH crypto context"); crypto_free_shash(tfm_cmac); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index b03c469cd01f..99eb8c6c0fbc 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -327,6 +327,12 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) /* priority is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, priority), + offsetof(struct __sk_buff, ifindex))) + return -EINVAL; + + /* ifindex is allowed */ + + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, ifindex), offsetof(struct __sk_buff, cb))) return -EINVAL; @@ -381,6 +387,7 @@ static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) __skb->mark = skb->mark; __skb->priority = skb->priority; + __skb->ifindex = skb->dev->ifindex; __skb->tstamp = skb->tstamp; memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN); __skb->wire_len = cb->pkt_len; @@ -391,6 +398,8 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { bool is_l2 = false, is_direct_pkt_access = false; + struct net *net = current->nsproxy->net_ns; + struct net_device *dev = net->loopback_dev; u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; struct __sk_buff *ctx = NULL; @@ -432,7 +441,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, kfree(ctx); return -ENOMEM; } - sock_net_set(sk, current->nsproxy->net_ns); + sock_net_set(sk, net); sock_init_data(NULL, sk); skb = build_skb(data, 0); @@ -446,9 +455,37 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); __skb_put(skb, size); - skb->protocol = eth_type_trans(skb, current->nsproxy->net_ns->loopback_dev); + if (ctx && ctx->ifindex > 1) { + dev = dev_get_by_index(net, ctx->ifindex); + if (!dev) { + ret = -ENODEV; + goto out; + } + } + skb->protocol = eth_type_trans(skb, dev); skb_reset_network_header(skb); + switch (skb->protocol) { + case htons(ETH_P_IP): + sk->sk_family = AF_INET; + if (sizeof(struct iphdr) <= skb_headlen(skb)) { + sk->sk_rcv_saddr = ip_hdr(skb)->saddr; + sk->sk_daddr = 
ip_hdr(skb)->daddr; + } + break; +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + sk->sk_family = AF_INET6; + if (sizeof(struct ipv6hdr) <= skb_headlen(skb)) { + sk->sk_v6_rcv_saddr = ipv6_hdr(skb)->saddr; + sk->sk_v6_daddr = ipv6_hdr(skb)->daddr; + } + break; +#endif + default: + break; + } + if (is_l2) __skb_push(skb, hh_len); if (is_direct_pkt_access) @@ -481,6 +518,8 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, ret = bpf_ctx_finish(kattr, uattr, ctx, sizeof(struct __sk_buff)); out: + if (dev && dev != net->loopback_dev) + dev_put(dev); kfree_skb(skb); bpf_sk_storage_free(sk); kfree(sk); diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig index 84015ef3ee27..73d0b12789f1 100644 --- a/net/bpfilter/Kconfig +++ b/net/bpfilter/Kconfig @@ -9,12 +9,14 @@ menuconfig BPFILTER if BPFILTER config BPFILTER_UMH tristate "bpfilter kernel module with user mode helper" - depends on CC_CAN_LINK_STATIC + depends on CC_CAN_LINK + depends on m || CC_CAN_LINK_STATIC default m help This builds bpfilter kernel module with embedded user mode helper - Note: your toolchain must support building static binaries, since - rootfs isn't mounted at the time when __init functions are called - and do_execv won't be able to find the elf interpreter. + Note: To compile this as built-in, your toolchain must support + building static binaries, since rootfs isn't mounted at the time + when __init functions are called and do_execv won't be able to find + the elf interpreter. endif diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile index f23b53294fba..cdac82b8c53a 100644 --- a/net/bpfilter/Makefile +++ b/net/bpfilter/Makefile @@ -7,10 +7,12 @@ userprogs := bpfilter_umh bpfilter_umh-objs := main.o userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi +ifeq ($(CONFIG_BPFILTER_UMH), y) # builtin bpfilter_umh should be linked with -static # since rootfs isn't mounted at the time of __init # function is called and do_execv won't find elf interpreter userldflags += -static +endif $(obj)/bpfilter_umh_blob.o: $(obj)/bpfilter_umh diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index c3146b2700a0..51a941b56ec3 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -31,47 +31,55 @@ static void __stop_umh(void) shutdown_umh(); } -static int __bpfilter_process_sockopt(struct sock *sk, int optname, - char __user *optval, - unsigned int optlen, bool is_set) +static int bpfilter_send_req(struct mbox_request *req) { - struct mbox_request req; struct mbox_reply reply; loff_t pos = 0; ssize_t n; - int ret = -EFAULT; - req.is_set = is_set; - req.pid = current->pid; - req.cmd = optname; - req.addr = (long __force __user)optval; - req.len = optlen; if (!bpfilter_ops.info.tgid) - goto out; - n = kernel_write(bpfilter_ops.info.pipe_to_umh, &req, sizeof(req), + return -EFAULT; + pos = 0; + n = kernel_write(bpfilter_ops.info.pipe_to_umh, req, sizeof(*req), &pos); - if (n != sizeof(req)) { + if (n != sizeof(*req)) { pr_err("write fail %zd\n", n); - __stop_umh(); - ret = -EFAULT; - goto out; + goto stop; } pos = 0; n = kernel_read(bpfilter_ops.info.pipe_from_umh, &reply, sizeof(reply), &pos); if (n != sizeof(reply)) { pr_err("read fail %zd\n", n); - __stop_umh(); - ret = -EFAULT; - goto out; + goto stop; + } + return reply.status; +stop: + __stop_umh(); + return -EFAULT; +} + +static int bpfilter_process_sockopt(struct sock *sk, int optname, + sockptr_t optval, unsigned int optlen, + bool is_set) +{ + struct mbox_request req 
= { + .is_set = is_set, + .pid = current->pid, + .cmd = optname, + .addr = (uintptr_t)optval.user, + .len = optlen, + }; + if (uaccess_kernel() || sockptr_is_kernel(optval)) { + pr_err("kernel access not supported\n"); + return -EFAULT; } - ret = reply.status; -out: - return ret; + return bpfilter_send_req(&req); } static int start_umh(void) { + struct mbox_request req = { .pid = current->pid }; int err; /* fork usermode process */ @@ -81,7 +89,7 @@ static int start_umh(void) pr_info("Loaded bpfilter_umh pid %d\n", pid_nr(bpfilter_ops.info.tgid)); /* health check that usermode process started correctly */ - if (__bpfilter_process_sockopt(NULL, 0, NULL, 0, 0) != 0) { + if (bpfilter_send_req(&req) != 0) { shutdown_umh(); return -EFAULT; } @@ -102,7 +110,7 @@ static int __init load_umh(void) mutex_lock(&bpfilter_ops.lock); err = start_umh(); if (!err && IS_ENABLED(CONFIG_INET)) { - bpfilter_ops.sockopt = &__bpfilter_process_sockopt; + bpfilter_ops.sockopt = &bpfilter_process_sockopt; bpfilter_ops.start = &start_umh; } mutex_unlock(&bpfilter_ops.lock); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 8c7b78f8bc23..9a2fb4aa1a10 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -36,6 +36,8 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) const unsigned char *dest; u16 vid = 0; + memset(skb->cb, 0, sizeof(struct br_input_skb_cb)); + rcu_read_lock(); nf_ops = rcu_dereference(nf_br_ops); if (nf_ops && nf_ops->br_dev_xmit_hook(skb)) { diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 4877a0db16c6..9db504baa094 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -349,12 +349,21 @@ void br_fdb_cleanup(struct work_struct *work) */ rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { - unsigned long this_timer; + unsigned long this_timer = f->updated + delay; if (test_bit(BR_FDB_STATIC, &f->flags) || - test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags)) + test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags)) { + if (test_bit(BR_FDB_NOTIFY, &f->flags)) { + if (time_after(this_timer, now)) + work_delay = min(work_delay, + this_timer - now); + else if (!test_and_set_bit(BR_FDB_NOTIFY_INACTIVE, + &f->flags)) + fdb_notify(br, f, RTM_NEWNEIGH, false); + } continue; - this_timer = f->updated + delay; + } + if (time_after(this_timer, now)) { work_delay = min(work_delay, this_timer - now); } else { @@ -556,11 +565,17 @@ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, return ret; } +/* returns true if the fdb was modified */ +static bool __fdb_mark_active(struct net_bridge_fdb_entry *fdb) +{ + return !!(test_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags) && + test_and_clear_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)); +} + void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid, unsigned long flags) { struct net_bridge_fdb_entry *fdb; - bool fdb_modified = false; /* some users want to always flood. 
*/ if (hold_time(br) == 0) @@ -575,6 +590,12 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, source->dev->name, addr, vid); } else { unsigned long now = jiffies; + bool fdb_modified = false; + + if (now != fdb->updated) { + fdb->updated = now; + fdb_modified = __fdb_mark_active(fdb); + } /* fastpath: update of existing entry */ if (unlikely(source != fdb->dst && @@ -587,8 +608,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, clear_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags); } - if (now != fdb->updated) - fdb->updated = now; + if (unlikely(test_bit(BR_FDB_ADDED_BY_USER, &flags))) set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); if (unlikely(fdb_modified)) { @@ -667,6 +687,23 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, &fdb->key.vlan_id)) goto nla_put_failure; + if (test_bit(BR_FDB_NOTIFY, &fdb->flags)) { + struct nlattr *nest = nla_nest_start(skb, NDA_FDB_EXT_ATTRS); + u8 notify_bits = FDB_NOTIFY_BIT; + + if (!nest) + goto nla_put_failure; + if (test_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)) + notify_bits |= FDB_NOTIFY_INACTIVE_BIT; + + if (nla_put_u8(skb, NFEA_ACTIVITY_NOTIFY, notify_bits)) { + nla_nest_cancel(skb, nest); + goto nla_put_failure; + } + + nla_nest_end(skb, nest); + } + nlmsg_end(skb, nlh); return 0; @@ -681,7 +718,9 @@ static inline size_t fdb_nlmsg_size(void) + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ + nla_total_size(sizeof(u32)) /* NDA_MASTER */ + nla_total_size(sizeof(u16)) /* NDA_VLAN */ - + nla_total_size(sizeof(struct nda_cacheinfo)); + + nla_total_size(sizeof(struct nda_cacheinfo)) + + nla_total_size(0) /* NDA_FDB_EXT_ATTRS */ + + nla_total_size(sizeof(u8)); /* NFEA_ACTIVITY_NOTIFY */ } static void fdb_notify(struct net_bridge *br, @@ -791,14 +830,41 @@ errout: return err; } +/* returns true if the fdb is modified */ +static bool fdb_handle_notify(struct net_bridge_fdb_entry *fdb, u8 notify) +{ + bool modified = false; + + /* allow to mark an entry as inactive, usually done on creation */ + if ((notify & FDB_NOTIFY_INACTIVE_BIT) && + !test_and_set_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)) + modified = true; + + if ((notify & FDB_NOTIFY_BIT) && + !test_and_set_bit(BR_FDB_NOTIFY, &fdb->flags)) { + /* enabled activity tracking */ + modified = true; + } else if (!(notify & FDB_NOTIFY_BIT) && + test_and_clear_bit(BR_FDB_NOTIFY, &fdb->flags)) { + /* disabled activity tracking, clear notify state */ + clear_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags); + modified = true; + } + + return modified; +} + /* Update (create or replace) forwarding database entry */ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, - const u8 *addr, u16 state, u16 flags, u16 vid, - u8 ndm_flags) + const u8 *addr, struct ndmsg *ndm, u16 flags, u16 vid, + struct nlattr *nfea_tb[]) { - bool is_sticky = !!(ndm_flags & NTF_STICKY); + bool is_sticky = !!(ndm->ndm_flags & NTF_STICKY); + bool refresh = !nfea_tb[NFEA_DONT_REFRESH]; struct net_bridge_fdb_entry *fdb; + u16 state = ndm->ndm_state; bool modified = false; + u8 notify = 0; /* If the port cannot learn allow only local and static entries */ if (source && !(state & NUD_PERMANENT) && !(state & NUD_NOARP) && @@ -815,6 +881,13 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, if (is_sticky && (state & NUD_PERMANENT)) return -EINVAL; + if (nfea_tb[NFEA_ACTIVITY_NOTIFY]) { + notify = nla_get_u8(nfea_tb[NFEA_ACTIVITY_NOTIFY]); + if ((notify & ~BR_FDB_NOTIFY_SETTABLE_BITS) || + (notify & 
BR_FDB_NOTIFY_SETTABLE_BITS) == FDB_NOTIFY_INACTIVE_BIT) + return -EINVAL; + } + fdb = br_fdb_find(br, addr, vid); if (fdb == NULL) { if (!(flags & NLM_F_CREATE)) @@ -858,11 +931,15 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, modified = true; } + if (fdb_handle_notify(fdb, notify)) + modified = true; + set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); fdb->used = jiffies; if (modified) { - fdb->updated = jiffies; + if (refresh) + fdb->updated = jiffies; fdb_notify(br, fdb, RTM_NEWNEIGH, true); } @@ -871,7 +948,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, - u16 nlh_flags, u16 vid) + u16 nlh_flags, u16 vid, struct nlattr *nfea_tb[]) { int err = 0; @@ -893,20 +970,25 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, err = br_fdb_external_learn_add(br, p, addr, vid, true); } else { spin_lock_bh(&br->hash_lock); - err = fdb_add_entry(br, p, addr, ndm->ndm_state, - nlh_flags, vid, ndm->ndm_flags); + err = fdb_add_entry(br, p, addr, ndm, nlh_flags, vid, nfea_tb); spin_unlock_bh(&br->hash_lock); } return err; } +static const struct nla_policy br_nda_fdb_pol[NFEA_MAX + 1] = { + [NFEA_ACTIVITY_NOTIFY] = { .type = NLA_U8 }, + [NFEA_DONT_REFRESH] = { .type = NLA_FLAG }, +}; + /* Add new permanent fdb entry with RTM_NEWNEIGH */ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 nlh_flags, struct netlink_ext_ack *extack) { + struct nlattr *nfea_tb[NFEA_MAX + 1], *attr; struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; struct net_bridge_vlan *v; @@ -939,6 +1021,16 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], vg = nbp_vlan_group(p); } + if (tb[NDA_FDB_EXT_ATTRS]) { + attr = tb[NDA_FDB_EXT_ATTRS]; + err = nla_parse_nested(nfea_tb, NFEA_MAX, attr, + br_nda_fdb_pol, extack); + if (err) + return err; + } else { + memset(nfea_tb, 0, sizeof(struct nlattr *) * (NFEA_MAX + 1)); + } + if (vid) { v = br_vlan_find(vg, vid); if (!v || !br_vlan_should_use(v)) { @@ -947,9 +1039,9 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], } /* VID was specified, so use it. 
*/ - err = __br_fdb_add(ndm, br, p, addr, nlh_flags, vid); + err = __br_fdb_add(ndm, br, p, addr, nlh_flags, vid, nfea_tb); } else { - err = __br_fdb_add(ndm, br, p, addr, nlh_flags, 0); + err = __br_fdb_add(ndm, br, p, addr, nlh_flags, 0, nfea_tb); if (err || !vg || !vg->num_vlans) goto out; @@ -960,7 +1052,8 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], list_for_each_entry(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; - err = __br_fdb_add(ndm, br, p, addr, nlh_flags, v->vid); + err = __br_fdb_add(ndm, br, p, addr, nlh_flags, v->vid, + nfea_tb); if (err) goto out; } diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index 90592af9db61..b36689e6e7cb 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -4,6 +4,27 @@ #include "br_private_mrp.h" static const u8 mrp_test_dmac[ETH_ALEN] = { 0x1, 0x15, 0x4e, 0x0, 0x0, 0x1 }; +static const u8 mrp_in_test_dmac[ETH_ALEN] = { 0x1, 0x15, 0x4e, 0x0, 0x0, 0x3 }; + +static bool br_mrp_is_ring_port(struct net_bridge_port *p_port, + struct net_bridge_port *s_port, + struct net_bridge_port *port) +{ + if (port == p_port || + port == s_port) + return true; + + return false; +} + +static bool br_mrp_is_in_port(struct net_bridge_port *i_port, + struct net_bridge_port *port) +{ + if (port == i_port) + return true; + + return false; +} static struct net_bridge_port *br_mrp_get_port(struct net_bridge *br, u32 ifindex) @@ -37,6 +58,22 @@ static struct br_mrp *br_mrp_find_id(struct net_bridge *br, u32 ring_id) return res; } +static struct br_mrp *br_mrp_find_in_id(struct net_bridge *br, u32 in_id) +{ + struct br_mrp *res = NULL; + struct br_mrp *mrp; + + list_for_each_entry_rcu(mrp, &br->mrp_list, list, + lockdep_rtnl_is_held()) { + if (mrp->in_id == in_id) { + res = mrp; + break; + } + } + + return res; +} + static bool br_mrp_unique_ifindex(struct net_bridge *br, u32 ifindex) { struct br_mrp *mrp; @@ -52,6 +89,10 @@ static bool br_mrp_unique_ifindex(struct net_bridge *br, u32 ifindex) p = rtnl_dereference(mrp->s_port); if (p && p->dev->ifindex == ifindex) return false; + + p = rtnl_dereference(mrp->i_port); + if (p && p->dev->ifindex == ifindex) + return false; } return true; @@ -66,7 +107,8 @@ static struct br_mrp *br_mrp_find_port(struct net_bridge *br, list_for_each_entry_rcu(mrp, &br->mrp_list, list, lockdep_rtnl_is_held()) { if (rcu_access_pointer(mrp->p_port) == p || - rcu_access_pointer(mrp->s_port) == p) { + rcu_access_pointer(mrp->s_port) == p || + rcu_access_pointer(mrp->i_port) == p) { res = mrp; break; } @@ -160,6 +202,36 @@ static struct sk_buff *br_mrp_alloc_test_skb(struct br_mrp *mrp, return skb; } +static struct sk_buff *br_mrp_alloc_in_test_skb(struct br_mrp *mrp, + struct net_bridge_port *p, + enum br_mrp_port_role_type port_role) +{ + struct br_mrp_in_test_hdr *hdr = NULL; + struct sk_buff *skb = NULL; + + if (!p) + return NULL; + + skb = br_mrp_skb_alloc(p, p->dev->dev_addr, mrp_in_test_dmac); + if (!skb) + return NULL; + + br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_IN_TEST, sizeof(*hdr)); + hdr = skb_put(skb, sizeof(*hdr)); + + hdr->id = cpu_to_be16(mrp->in_id); + ether_addr_copy(hdr->sa, p->br->dev->dev_addr); + hdr->port_role = cpu_to_be16(port_role); + hdr->state = cpu_to_be16(mrp->in_state); + hdr->transitions = cpu_to_be16(mrp->in_transitions); + hdr->timestamp = cpu_to_be32(jiffies_to_msecs(jiffies)); + + br_mrp_skb_common(skb, mrp); + br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_END, 0x0); + + return skb; +} + /* This function is continuously called in the following cases: * - when node role is MRM, 
in this case test_monitor is always set to false * because it needs to notify the userspace that the ring is open and needs to @@ -213,7 +285,7 @@ static void br_mrp_test_work_expired(struct work_struct *work) } if (notify_open && !mrp->ring_role_offloaded) - br_mrp_port_open(p->dev, true); + br_mrp_ring_port_open(p->dev, true); } p = rcu_dereference(mrp->s_port); @@ -229,7 +301,7 @@ static void br_mrp_test_work_expired(struct work_struct *work) } if (notify_open && !mrp->ring_role_offloaded) - br_mrp_port_open(p->dev, true); + br_mrp_ring_port_open(p->dev, true); } out: @@ -239,6 +311,83 @@ out: usecs_to_jiffies(mrp->test_interval)); } +/* This function is continuously called when the node has the interconnect role + * MIM. It generates interconnect test frames and sends them on all 3 ports, + * but it also checks if it stopped receiving interconnect test frames. + */ +static void br_mrp_in_test_work_expired(struct work_struct *work) +{ + struct delayed_work *del_work = to_delayed_work(work); + struct br_mrp *mrp = container_of(del_work, struct br_mrp, in_test_work); + struct net_bridge_port *p; + bool notify_open = false; + struct sk_buff *skb; + + if (time_before_eq(mrp->in_test_end, jiffies)) + return; + + if (mrp->in_test_count_miss < mrp->in_test_max_miss) { + mrp->in_test_count_miss++; + } else { + /* Notify that the interconnect ring is open only if the + * interconnect ring state is closed, otherwise it would + * continue to notify at every interval. + */ + if (mrp->in_state == BR_MRP_IN_STATE_CLOSED) + notify_open = true; + } + + rcu_read_lock(); + + p = rcu_dereference(mrp->p_port); + if (p) { + skb = br_mrp_alloc_in_test_skb(mrp, p, + BR_MRP_PORT_ROLE_PRIMARY); + if (!skb) + goto out; + + skb_reset_network_header(skb); + dev_queue_xmit(skb); + + if (notify_open && !mrp->in_role_offloaded) + br_mrp_in_port_open(p->dev, true); + } + + p = rcu_dereference(mrp->s_port); + if (p) { + skb = br_mrp_alloc_in_test_skb(mrp, p, + BR_MRP_PORT_ROLE_SECONDARY); + if (!skb) + goto out; + + skb_reset_network_header(skb); + dev_queue_xmit(skb); + + if (notify_open && !mrp->in_role_offloaded) + br_mrp_in_port_open(p->dev, true); + } + + p = rcu_dereference(mrp->i_port); + if (p) { + skb = br_mrp_alloc_in_test_skb(mrp, p, + BR_MRP_PORT_ROLE_INTER); + if (!skb) + goto out; + + skb_reset_network_header(skb); + dev_queue_xmit(skb); + + if (notify_open && !mrp->in_role_offloaded) + br_mrp_in_port_open(p->dev, true); + } + +out: + rcu_read_unlock(); + + queue_delayed_work(system_wq, &mrp->in_test_work, + usecs_to_jiffies(mrp->in_test_interval)); +} +
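/* [Editorial note] A standalone model (illustrative only, kernel types
 * assumed) of the miss accounting in br_mrp_in_test_work_expired() above:
 * a received MRP_InTest frame resets the miss counter (see
 * br_mrp_mim_process() later in this patch); each timer tick either
 * consumes one allowed miss or, once max_miss is exhausted, reports the
 * interconnect ring open while its state is still considered closed.
 */
struct in_test_model {
	unsigned int count_miss;	/* ticks since the last MRP_InTest */
	unsigned int max_miss;		/* allowed consecutive misses */
	bool ring_closed;		/* current interconnect ring state */
};

/* Returns true when this tick should raise an "interconnect open" event. */
static bool in_test_tick(struct in_test_model *m)
{
	if (m->count_miss < m->max_miss) {
		m->count_miss++;
		return false;
	}
	/* notify only while closed, so the event is not repeated forever */
	return m->ring_closed;
}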
/* Deletes the MRP instance. * note: called under rtnl_lock */ @@ -251,6 +400,10 @@ static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp) cancel_delayed_work_sync(&mrp->test_work); br_mrp_switchdev_send_ring_test(br, mrp, 0, 0, 0, 0); + /* Stop sending MRP_InTest frames if the node has an interconnect role */ + cancel_delayed_work_sync(&mrp->in_test_work); + br_mrp_switchdev_send_in_test(br, mrp, 0, 0, 0); + br_mrp_switchdev_del(br, mrp); /* Reset the ports */ @@ -278,6 +431,18 @@ static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp) rcu_assign_pointer(mrp->s_port, NULL); } + p = rtnl_dereference(mrp->i_port); + if (p) { + spin_lock_bh(&br->lock); + state = netif_running(br->dev) ? + BR_STATE_FORWARDING : BR_STATE_DISABLED; + p->state = state; + p->flags &= ~BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + br_mrp_port_switchdev_set_state(p, state); + rcu_assign_pointer(mrp->i_port, NULL); + } + list_del_rcu(&mrp->list); kfree_rcu(mrp, rcu); } @@ -329,6 +494,7 @@ int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance) rcu_assign_pointer(mrp->s_port, p); INIT_DELAYED_WORK(&mrp->test_work, br_mrp_test_work_expired); + INIT_DELAYED_WORK(&mrp->in_test_work, br_mrp_in_test_work_expired); list_add_tail_rcu(&mrp->list, &br->mrp_list); err = br_mrp_switchdev_add(br, mrp); @@ -511,6 +677,180 @@ int br_mrp_start_test(struct net_bridge *br, return 0; } +/* Set in state, in state can be only Open or Closed + * note: already called with rtnl_lock + */ +int br_mrp_set_in_state(struct net_bridge *br, struct br_mrp_in_state *state) +{ + struct br_mrp *mrp = br_mrp_find_in_id(br, state->in_id); + + if (!mrp) + return -EINVAL; + + if (mrp->in_state == BR_MRP_IN_STATE_CLOSED && + state->in_state != BR_MRP_IN_STATE_CLOSED) + mrp->in_transitions++; + + mrp->in_state = state->in_state; + + br_mrp_switchdev_set_in_state(br, mrp, state->in_state); + + return 0; +} + +/* Set in role, in role can be only MIM(Media Interconnection Manager) or + * MIC(Media Interconnection Client). + * note: already called with rtnl_lock + */ +int br_mrp_set_in_role(struct net_bridge *br, struct br_mrp_in_role *role) +{ + struct br_mrp *mrp = br_mrp_find_id(br, role->ring_id); + struct net_bridge_port *p; + int err; + + if (!mrp) + return -EINVAL; + + if (!br_mrp_get_port(br, role->i_ifindex)) + return -EINVAL; + + if (role->in_role == BR_MRP_IN_ROLE_DISABLED) { + u8 state; + + /* It is not allowed to disable a port that doesn't exist */ + p = rtnl_dereference(mrp->i_port); + if (!p) + return -EINVAL; + + /* Stop generating MRP_InTest frames */ + cancel_delayed_work_sync(&mrp->in_test_work); + br_mrp_switchdev_send_in_test(br, mrp, 0, 0, 0); + + /* Remove the port */ + spin_lock_bh(&br->lock); + state = netif_running(br->dev) ? + BR_STATE_FORWARDING : BR_STATE_DISABLED; + p->state = state; + p->flags &= ~BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + br_mrp_port_switchdev_set_state(p, state); + rcu_assign_pointer(mrp->i_port, NULL); + + mrp->in_role = role->in_role; + mrp->in_id = 0; + + return 0; + } + + /* It is not possible to have the same port part of multiple rings */ + if (!br_mrp_unique_ifindex(br, role->i_ifindex)) + return -EINVAL; + + /* It is not allowed to set a different interconnect port if the mrp + * instance already has one. First it needs to be disabled and after + * that the new port can be set + */ + if (rcu_access_pointer(mrp->i_port)) + return -EINVAL; + + p = br_mrp_get_port(br, role->i_ifindex); + spin_lock_bh(&br->lock); + p->state = BR_STATE_FORWARDING; + p->flags |= BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + rcu_assign_pointer(mrp->i_port, p); + + mrp->in_role = role->in_role; + mrp->in_id = role->in_id; + + /* If there is an error just bail out */ + err = br_mrp_switchdev_set_in_role(br, mrp, role->in_id, + role->ring_id, role->in_role); + if (err && err != -EOPNOTSUPP) + return err; + + /* Now detect if the HW actually applied the role or not. If the HW + * applied the role it means that the SW will not need to do those + * operations anymore. For example if the role is MIM then the HW will + * notify the SW when the interconnect ring is open, but if the role is + * not pushed to the HW the SW will need to detect when the + * interconnect ring is open. + */ + mrp->in_role_offloaded = err == -EOPNOTSUPP ? 0 : 1; + + return 0; +} +
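/* [Editorial note] A compact sketch (not part of the patch) of the
 * switchdev offload convention that br_mrp_set_in_role() above shares with
 * the ring-role paths: -EOPNOTSUPP from the switchdev op means "no HW
 * support, keep the SW implementation", any other error is fatal, and
 * success means the HW owns the role, so the *_offloaded flag lets the SW
 * skip its own open/closed detection.
 */
static int apply_role_with_fallback(int hw_err, bool *offloaded)
{
	if (hw_err && hw_err != -EOPNOTSUPP)
		return hw_err;		/* real failure: bail out */
	*offloaded = (hw_err == 0);	/* 0 => the HW applied the role */
	return 0;			/* proceed, in SW if !*offloaded */
}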
+/* Start generating MRP_InTest frames; the frames are generated by the + * HW and if that fails, they are generated by the SW. + * note: already called with rtnl_lock + */ +int br_mrp_start_in_test(struct net_bridge *br, + struct br_mrp_start_in_test *in_test) +{ + struct br_mrp *mrp = br_mrp_find_in_id(br, in_test->in_id); + + if (!mrp) + return -EINVAL; + + if (mrp->in_role != BR_MRP_IN_ROLE_MIM) + return -EINVAL; + + /* Try to push it to the HW and if it fails then continue with SW + * implementation and if that also fails then return error. + */ + if (!br_mrp_switchdev_send_in_test(br, mrp, in_test->interval, + in_test->max_miss, in_test->period)) + return 0; + + mrp->in_test_interval = in_test->interval; + mrp->in_test_end = jiffies + usecs_to_jiffies(in_test->period); + mrp->in_test_max_miss = in_test->max_miss; + mrp->in_test_count_miss = 0; + queue_delayed_work(system_wq, &mrp->in_test_work, + usecs_to_jiffies(in_test->interval)); + + return 0; +} + +/* Determine if the frame type is a ring frame */ +static bool br_mrp_ring_frame(struct sk_buff *skb) +{ + const struct br_mrp_tlv_hdr *hdr; + struct br_mrp_tlv_hdr _hdr; + + hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr); + if (!hdr) + return false; + + if (hdr->type == BR_MRP_TLV_HEADER_RING_TEST || + hdr->type == BR_MRP_TLV_HEADER_RING_TOPO || + hdr->type == BR_MRP_TLV_HEADER_RING_LINK_DOWN || + hdr->type == BR_MRP_TLV_HEADER_RING_LINK_UP || + hdr->type == BR_MRP_TLV_HEADER_OPTION) + return true; + + return false; +} + +/* Determine if the frame type is an interconnect frame */ +static bool br_mrp_in_frame(struct sk_buff *skb) +{ + const struct br_mrp_tlv_hdr *hdr; + struct br_mrp_tlv_hdr _hdr; + + hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr); + if (!hdr) + return false; + + if (hdr->type == BR_MRP_TLV_HEADER_IN_TEST || + hdr->type == BR_MRP_TLV_HEADER_IN_TOPO || + hdr->type == BR_MRP_TLV_HEADER_IN_LINK_DOWN || + hdr->type == BR_MRP_TLV_HEADER_IN_LINK_UP) + return true; + + return false; +} +
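/* [Editorial note] The PDU layout the two classifiers above rely on, shown
 * as an illustrative struct (the field meaning mirrors struct
 * br_mrp_tlv_hdr): every MRP frame starts with a 16-bit MRP_Version
 * immediately followed by the first TLV header, so peeking at the TLV type
 * is enough to tell ring frames from interconnect frames.
 * skb_header_pointer() is used because the header may not sit in the skb's
 * linear area.
 */
struct mrp_pdu_start {
	__be16 version;		/* MRP_Version, skipped by the classifiers */
	__u8 tlv_type;		/* BR_MRP_TLV_HEADER_* value checked above */
	__u8 tlv_length;	/* length of the TLV value that follows */
} __packed;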
/* Process only MRP Test frame. All the other MRP frames are processed by * userspace application * note: already called with rcu_read_lock @@ -537,7 +877,7 @@ static void br_mrp_mrm_process(struct br_mrp *mrp, struct net_bridge_port *port, * not closed */ if (mrp->ring_state != BR_MRP_RING_STATE_CLOSED) - br_mrp_port_open(port->dev, false); + br_mrp_ring_port_open(port->dev, false); } /* Determin if the test hdr has a better priority than the node */ @@ -591,17 +931,92 @@ static void br_mrp_mra_process(struct br_mrp *mrp, struct net_bridge *br, mrp->test_count_miss = 0; } -/* This will just forward the frame to the other mrp ring port(MRC role) or will - * not do anything. +/* Process only MRP InTest frame. All the other MRP frames are processed by + * userspace application + * note: already called with rcu_read_lock + */ +static bool br_mrp_mim_process(struct br_mrp *mrp, struct net_bridge_port *port, + struct sk_buff *skb) +{ + const struct br_mrp_in_test_hdr *in_hdr; + struct br_mrp_in_test_hdr _in_hdr; + const struct br_mrp_tlv_hdr *hdr; + struct br_mrp_tlv_hdr _hdr; + + /* Each MRP header starts with a version field which is 16 bits. + * Therefore skip the version and get directly the TLV header. + */ + hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr); + if (!hdr) + return false; + + /* The check for InTest frame type was already done */ + in_hdr = skb_header_pointer(skb, sizeof(uint16_t) + sizeof(_hdr), + sizeof(_in_hdr), &_in_hdr); + if (!in_hdr) + return false; + + /* It needs to process only its own InTest frames. */ + if (mrp->in_id != ntohs(in_hdr->id)) + return false; + + mrp->in_test_count_miss = 0; + + /* Notify the userspace that the ring is closed only when the ring is + * not closed + */ + if (mrp->in_state != BR_MRP_IN_STATE_CLOSED) + br_mrp_in_port_open(port->dev, false); + + return true; +} + +/* Get the MRP frame type + * note: already called with rcu_read_lock + */ +static u8 br_mrp_get_frame_type(struct sk_buff *skb) +{ + const struct br_mrp_tlv_hdr *hdr; + struct br_mrp_tlv_hdr _hdr; + + /* Each MRP header starts with a version field which is 16 bits. + * Therefore skip the version and get directly the TLV header. + */ + hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr); + if (!hdr) + return 0xff; + + return hdr->type; +} + +static bool br_mrp_mrm_behaviour(struct br_mrp *mrp) +{ + if (mrp->ring_role == BR_MRP_RING_ROLE_MRM || + (mrp->ring_role == BR_MRP_RING_ROLE_MRA && !mrp->test_monitor)) + return true; + + return false; +} + +static bool br_mrp_mrc_behaviour(struct br_mrp *mrp) +{ + if (mrp->ring_role == BR_MRP_RING_ROLE_MRC || + (mrp->ring_role == BR_MRP_RING_ROLE_MRA && mrp->test_monitor)) + return true; + + return false; +} + +/* This will just forward the frame to the other mrp ring ports, depending on + * the frame type, ring role and interconnect role * note: already called with rcu_read_lock */ static int br_mrp_rcv(struct net_bridge_port *p, struct sk_buff *skb, struct net_device *dev) { - struct net_device *s_dev, *p_dev, *d_dev; - struct net_bridge_port *p_port, *s_port; + struct net_bridge_port *p_port, *s_port, *i_port = NULL; + struct net_bridge_port *p_dst, *s_dst, *i_dst = NULL; struct net_bridge *br; - struct sk_buff *nskb; struct br_mrp *mrp; /* If port is disabled don't accept any frames */ @@ -616,46 +1031,139 @@ static int br_mrp_rcv(struct net_bridge_port *p, p_port = rcu_dereference(mrp->p_port); if (!p_port) return 0; + p_dst = p_port; s_port = rcu_dereference(mrp->s_port); if (!s_port) return 0; + s_dst = s_port; - /* If the role is MRM then don't forward the frames */ - if (mrp->ring_role == BR_MRP_RING_ROLE_MRM) { - br_mrp_mrm_process(mrp, p, skb); - return 1; - } - - /* If the role is MRA then don't forward the frames if it behaves as - * MRM node + /* If the frame is a ring frame then it is not required to check the + * interconnect role and ports to process or forward the frame */ - if (mrp->ring_role == BR_MRP_RING_ROLE_MRA) { - if (!mrp->test_monitor) { + if (br_mrp_ring_frame(skb)) { + /* If the role is MRM then don't forward the frames */ + if (mrp->ring_role == BR_MRP_RING_ROLE_MRM) { br_mrp_mrm_process(mrp, p, skb); - return 1; + goto no_forward; } - br_mrp_mra_process(mrp, br, p, skb); + /* If the role is MRA then don't forward the frames if it + * behaves as MRM node + */ + if (mrp->ring_role == BR_MRP_RING_ROLE_MRA) { + if (!mrp->test_monitor) { + br_mrp_mrm_process(mrp, p, skb); + goto no_forward; + } + + br_mrp_mra_process(mrp, br, p, skb); + } + + goto forward; } - /* Clone the frame and forward it on the other MRP port */ - nskb = skb_clone(skb, GFP_ATOMIC); - if (!nskb) - return 0; + if (br_mrp_in_frame(skb)) { + u8 in_type = br_mrp_get_frame_type(skb); - p_dev = p_port->dev; -
s_dev = s_port->dev; + i_port = rcu_dereference(mrp->i_port); + i_dst = i_port; - if (p_dev == dev) - d_dev = s_dev; - else - d_dev = p_dev; + /* If the ring port is in block state it should not forward + * In_Test frames + */ + if (br_mrp_is_ring_port(p_port, s_port, p) && + p->state == BR_STATE_BLOCKING && + in_type == BR_MRP_TLV_HEADER_IN_TEST) + goto no_forward; + + /* Nodes that behave as MRM need to stop forwarding the + * frames in case the ring is closed, otherwise there will be + * a loop. In this case the frame is not forwarded between the + * ring ports. + */ + if (br_mrp_mrm_behaviour(mrp) && + br_mrp_is_ring_port(p_port, s_port, p) && + (s_port->state != BR_STATE_FORWARDING || + p_port->state != BR_STATE_FORWARDING)) { + p_dst = NULL; + s_dst = NULL; + } + + /* A node that behaves as MRC and doesn't have an interconnect + * role should forward all frames between the ring ports + * because it doesn't have an interconnect port + */ + if (br_mrp_mrc_behaviour(mrp) && + mrp->in_role == BR_MRP_IN_ROLE_DISABLED) + goto forward; + + if (mrp->in_role == BR_MRP_IN_ROLE_MIM) { + if (in_type == BR_MRP_TLV_HEADER_IN_TEST) { + /* MIM should not forward its own InTest + * frames + */ + if (br_mrp_mim_process(mrp, p, skb)) { + goto no_forward; + } else { + if (br_mrp_is_ring_port(p_port, s_port, + p)) + i_dst = NULL; + + if (br_mrp_is_in_port(i_port, p)) + goto no_forward; + } + } else { + /* MIM should forward IntLinkChange and + * IntTopoChange between ring ports but MIM + * should not forward IntLinkChange and + * IntTopoChange if the frame was received at + * the interconnect port + */ + if (br_mrp_is_ring_port(p_port, s_port, p)) + i_dst = NULL; + + if (br_mrp_is_in_port(i_port, p)) + goto no_forward; + } + } + + if (mrp->in_role == BR_MRP_IN_ROLE_MIC) { + /* MIC should forward InTest frames on all ports + * regardless of the received port + */ + if (in_type == BR_MRP_TLV_HEADER_IN_TEST) + goto forward; + + /* MIC should forward IntLinkChange frames only if they + * are received on ring ports to all the ports + */ + if (br_mrp_is_ring_port(p_port, s_port, p) && + (in_type == BR_MRP_TLV_HEADER_IN_LINK_UP || + in_type == BR_MRP_TLV_HEADER_IN_LINK_DOWN)) + goto forward; + + /* Should forward the InTopo frames only between the + * ring ports + */ + if (in_type == BR_MRP_TLV_HEADER_IN_TOPO) { + i_dst = NULL; + goto forward; + } + + /* In all the other cases don't forward the frames */ + goto no_forward; + } + } - nskb->dev = d_dev; - skb_push(nskb, ETH_HLEN); - dev_queue_xmit(nskb); +forward: + if (p_dst) + br_forward(p_dst, skb, true, false); + if (s_dst) + br_forward(s_dst, skb, true, false); + if (i_dst) + br_forward(i_dst, skb, true, false); +no_forward: return 1; } diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c index 34b3a8776991..2a2fdf3500c5 100644 --- a/net/bridge/br_mrp_netlink.c +++ b/net/bridge/br_mrp_netlink.c @@ -14,6 +14,9 @@ static const struct nla_policy br_mrp_policy[IFLA_BRIDGE_MRP_MAX + 1] = { [IFLA_BRIDGE_MRP_RING_STATE] = { .type = NLA_NESTED }, [IFLA_BRIDGE_MRP_RING_ROLE] = { .type = NLA_NESTED }, [IFLA_BRIDGE_MRP_START_TEST] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_IN_ROLE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_IN_STATE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_START_IN_TEST] = { .type = NLA_NESTED }, }; static const struct nla_policy @@ -235,6 +238,121 @@ static int br_mrp_start_test_parse(struct net_bridge *br, struct nlattr *attr, return br_mrp_start_test(br, &test); } +static const struct nla_policy
+static const struct nla_policy
+br_mrp_in_state_policy[IFLA_BRIDGE_MRP_IN_STATE_MAX + 1] = {
+	[IFLA_BRIDGE_MRP_IN_STATE_UNSPEC]	= { .type = NLA_REJECT },
+	[IFLA_BRIDGE_MRP_IN_STATE_IN_ID]	= { .type = NLA_U32 },
+	[IFLA_BRIDGE_MRP_IN_STATE_STATE]	= { .type = NLA_U32 },
+};
+
+static int br_mrp_in_state_parse(struct net_bridge *br, struct nlattr *attr,
+				 struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[IFLA_BRIDGE_MRP_IN_STATE_MAX + 1];
+	struct br_mrp_in_state state;
+	int err;
+
+	err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_IN_STATE_MAX, attr,
+			       br_mrp_in_state_policy, extack);
+	if (err)
+		return err;
+
+	if (!tb[IFLA_BRIDGE_MRP_IN_STATE_IN_ID] ||
+	    !tb[IFLA_BRIDGE_MRP_IN_STATE_STATE]) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Missing attribute: IN_ID or STATE");
+		return -EINVAL;
+	}
+
+	memset(&state, 0x0, sizeof(state));
+
+	state.in_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_IN_STATE_IN_ID]);
+	state.in_state = nla_get_u32(tb[IFLA_BRIDGE_MRP_IN_STATE_STATE]);
+
+	return br_mrp_set_in_state(br, &state);
+}
+
+static const struct nla_policy
+br_mrp_in_role_policy[IFLA_BRIDGE_MRP_IN_ROLE_MAX + 1] = {
+	[IFLA_BRIDGE_MRP_IN_ROLE_UNSPEC]	= { .type = NLA_REJECT },
+	[IFLA_BRIDGE_MRP_IN_ROLE_RING_ID]	= { .type = NLA_U32 },
+	[IFLA_BRIDGE_MRP_IN_ROLE_IN_ID]		= { .type = NLA_U16 },
+	[IFLA_BRIDGE_MRP_IN_ROLE_ROLE]		= { .type = NLA_U32 },
+	[IFLA_BRIDGE_MRP_IN_ROLE_I_IFINDEX]	= { .type = NLA_U32 },
+};
+
+static int br_mrp_in_role_parse(struct net_bridge *br, struct nlattr *attr,
+				struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[IFLA_BRIDGE_MRP_IN_ROLE_MAX + 1];
+	struct br_mrp_in_role role;
+	int err;
+
+	err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_IN_ROLE_MAX, attr,
+			       br_mrp_in_role_policy, extack);
+	if (err)
+		return err;
+
+	if (!tb[IFLA_BRIDGE_MRP_IN_ROLE_RING_ID] ||
+	    !tb[IFLA_BRIDGE_MRP_IN_ROLE_IN_ID] ||
+	    !tb[IFLA_BRIDGE_MRP_IN_ROLE_I_IFINDEX] ||
+	    !tb[IFLA_BRIDGE_MRP_IN_ROLE_ROLE]) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Missing attribute: RING_ID or ROLE or IN_ID or I_IFINDEX");
+		return -EINVAL;
+	}
+
+	memset(&role, 0x0, sizeof(role));
+
+	role.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_IN_ROLE_RING_ID]);
+	role.in_id = nla_get_u16(tb[IFLA_BRIDGE_MRP_IN_ROLE_IN_ID]);
+	role.i_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_IN_ROLE_I_IFINDEX]);
+	role.in_role = nla_get_u32(tb[IFLA_BRIDGE_MRP_IN_ROLE_ROLE]);
+
+	return br_mrp_set_in_role(br, &role);
+}
+
+static const struct nla_policy
+br_mrp_start_in_test_policy[IFLA_BRIDGE_MRP_START_IN_TEST_MAX + 1] = {
+	[IFLA_BRIDGE_MRP_START_IN_TEST_UNSPEC]		= { .type = NLA_REJECT },
+	[IFLA_BRIDGE_MRP_START_IN_TEST_IN_ID]		= { .type = NLA_U32 },
+	[IFLA_BRIDGE_MRP_START_IN_TEST_INTERVAL]	= { .type = NLA_U32 },
+	[IFLA_BRIDGE_MRP_START_IN_TEST_MAX_MISS]	= { .type = NLA_U32 },
+	[IFLA_BRIDGE_MRP_START_IN_TEST_PERIOD]		= { .type = NLA_U32 },
+};
+
+static int br_mrp_start_in_test_parse(struct net_bridge *br,
+				      struct nlattr *attr,
+				      struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[IFLA_BRIDGE_MRP_START_IN_TEST_MAX + 1];
+	struct br_mrp_start_in_test test;
+	int err;
+
+	err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_START_IN_TEST_MAX, attr,
+			       br_mrp_start_in_test_policy, extack);
+	if (err)
+		return err;
+
+	if (!tb[IFLA_BRIDGE_MRP_START_IN_TEST_IN_ID] ||
+	    !tb[IFLA_BRIDGE_MRP_START_IN_TEST_INTERVAL] ||
+	    !tb[IFLA_BRIDGE_MRP_START_IN_TEST_MAX_MISS] ||
+	    !tb[IFLA_BRIDGE_MRP_START_IN_TEST_PERIOD]) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Missing attribute: IN_ID or INTERVAL or MAX_MISS or PERIOD");
+		return -EINVAL;
+	}
+
+	memset(&test, 0x0, sizeof(test));
+
+	test.in_id =
nla_get_u32(tb[IFLA_BRIDGE_MRP_START_IN_TEST_IN_ID]); + test.interval = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_IN_TEST_INTERVAL]); + test.max_miss = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_IN_TEST_MAX_MISS]); + test.period = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_IN_TEST_PERIOD]); + + return br_mrp_start_in_test(br, &test); +} + int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) { @@ -301,10 +419,114 @@ int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, return err; } + if (tb[IFLA_BRIDGE_MRP_IN_STATE]) { + err = br_mrp_in_state_parse(br, tb[IFLA_BRIDGE_MRP_IN_STATE], + extack); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_IN_ROLE]) { + err = br_mrp_in_role_parse(br, tb[IFLA_BRIDGE_MRP_IN_ROLE], + extack); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_START_IN_TEST]) { + err = br_mrp_start_in_test_parse(br, + tb[IFLA_BRIDGE_MRP_START_IN_TEST], + extack); + if (err) + return err; + } + return 0; } -int br_mrp_port_open(struct net_device *dev, u8 loc) +int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br) +{ + struct nlattr *tb, *mrp_tb; + struct br_mrp *mrp; + + mrp_tb = nla_nest_start_noflag(skb, IFLA_BRIDGE_MRP); + if (!mrp_tb) + return -EMSGSIZE; + + list_for_each_entry_rcu(mrp, &br->mrp_list, list) { + struct net_bridge_port *p; + + tb = nla_nest_start_noflag(skb, IFLA_BRIDGE_MRP_INFO); + if (!tb) + goto nla_info_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_RING_ID, + mrp->ring_id)) + goto nla_put_failure; + + p = rcu_dereference(mrp->p_port); + if (p && nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_P_IFINDEX, + p->dev->ifindex)) + goto nla_put_failure; + + p = rcu_dereference(mrp->s_port); + if (p && nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_S_IFINDEX, + p->dev->ifindex)) + goto nla_put_failure; + + p = rcu_dereference(mrp->i_port); + if (p && nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_I_IFINDEX, + p->dev->ifindex)) + goto nla_put_failure; + + if (nla_put_u16(skb, IFLA_BRIDGE_MRP_INFO_PRIO, + mrp->prio)) + goto nla_put_failure; + if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_RING_STATE, + mrp->ring_state)) + goto nla_put_failure; + if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_RING_ROLE, + mrp->ring_role)) + goto nla_put_failure; + if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_TEST_INTERVAL, + mrp->test_interval)) + goto nla_put_failure; + if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_TEST_MAX_MISS, + mrp->test_max_miss)) + goto nla_put_failure; + if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_TEST_MONITOR, + mrp->test_monitor)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_IN_STATE, + mrp->in_state)) + goto nla_put_failure; + if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_IN_ROLE, + mrp->in_role)) + goto nla_put_failure; + if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_IN_TEST_INTERVAL, + mrp->in_test_interval)) + goto nla_put_failure; + if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_IN_TEST_MAX_MISS, + mrp->in_test_max_miss)) + goto nla_put_failure; + + nla_nest_end(skb, tb); + } + nla_nest_end(skb, mrp_tb); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, tb); + +nla_info_failure: + nla_nest_cancel(skb, mrp_tb); + + return -EMSGSIZE; +} + +int br_mrp_ring_port_open(struct net_device *dev, u8 loc) { struct net_bridge_port *p; int err = 0; @@ -325,3 +547,25 @@ int br_mrp_port_open(struct net_device *dev, u8 loc) out: return err; } + +int br_mrp_in_port_open(struct net_device *dev, u8 loc) +{ + struct net_bridge_port *p; + int err = 0; + + p = br_port_get_rcu(dev); + if (!p) { + 
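/* Illustrative aside (not part of the patch): a sketch of how userspace
 * could retrieve the IFLA_BRIDGE_MRP_INFO dump that br_mrp_fill_info()
 * above produces. The request is an ordinary RTM_GETLINK carrying an
 * IFLA_EXT_MASK attribute with the new RTEXT_FILTER_MRP bit; attribute
 * packing and reply walking are elided:
 *
 *	struct {
 *		struct nlmsghdr  nlh;
 *		struct ifinfomsg ifm;
 *		char             buf[64];
 *	} req = { .nlh = { .nlmsg_type  = RTM_GETLINK,
 *			   .nlmsg_flags = NLM_F_REQUEST } };
 *	u32 filt = RTEXT_FILTER_MRP;
 *
 *	// append IFLA_EXT_MASK = filt, send on a NETLINK_ROUTE socket,
 *	// then parse IFLA_AF_SPEC -> IFLA_BRIDGE_MRP -> MRP_INFO nests
 *	// from the reply (one nest per MRP instance on the bridge).
 */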
err = -EINVAL; + goto out; + } + + if (loc) + p->flags |= BR_MRP_LOST_IN_CONT; + else + p->flags &= ~BR_MRP_LOST_IN_CONT; + + br_ifinfo_notify(RTM_NEWLINK, NULL, p); + +out: + return err; +} diff --git a/net/bridge/br_mrp_switchdev.c b/net/bridge/br_mrp_switchdev.c index 0da68a0da4b5..ed547e03ace1 100644 --- a/net/bridge/br_mrp_switchdev.c +++ b/net/bridge/br_mrp_switchdev.c @@ -107,6 +107,68 @@ int br_mrp_switchdev_set_ring_state(struct net_bridge *br, return 0; } +int br_mrp_switchdev_set_in_role(struct net_bridge *br, struct br_mrp *mrp, + u16 in_id, u32 ring_id, + enum br_mrp_in_role_type role) +{ + struct switchdev_obj_in_role_mrp mrp_role = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_IN_ROLE_MRP, + .in_role = role, + .in_id = mrp->in_id, + .ring_id = mrp->ring_id, + .i_port = rtnl_dereference(mrp->i_port)->dev, + }; + int err; + + if (role == BR_MRP_IN_ROLE_DISABLED) + err = switchdev_port_obj_del(br->dev, &mrp_role.obj); + else + err = switchdev_port_obj_add(br->dev, &mrp_role.obj, NULL); + + return err; +} + +int br_mrp_switchdev_set_in_state(struct net_bridge *br, struct br_mrp *mrp, + enum br_mrp_in_state_type state) +{ + struct switchdev_obj_in_state_mrp mrp_state = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_IN_STATE_MRP, + .in_state = state, + .in_id = mrp->in_id, + }; + int err; + + err = switchdev_port_obj_add(br->dev, &mrp_state.obj, NULL); + + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} + +int br_mrp_switchdev_send_in_test(struct net_bridge *br, struct br_mrp *mrp, + u32 interval, u8 max_miss, u32 period) +{ + struct switchdev_obj_in_test_mrp test = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_IN_TEST_MRP, + .interval = interval, + .max_miss = max_miss, + .in_id = mrp->in_id, + .period = period, + }; + int err; + + if (interval == 0) + err = switchdev_port_obj_del(br->dev, &test.obj); + else + err = switchdev_port_obj_add(br->dev, &test.obj, NULL); + + return err; +} + int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, enum br_mrp_port_state_type state) { diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 240e260e3461..147d52596e17 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -152,6 +152,7 @@ static inline size_t br_port_info_size(void) #endif + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_RING_OPEN */ + + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_IN_OPEN */ + 0; } @@ -216,6 +217,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, !!(p->flags & BR_NEIGH_SUPPRESS)) || nla_put_u8(skb, IFLA_BRPORT_MRP_RING_OPEN, !!(p->flags & BR_MRP_LOST_CONT)) || + nla_put_u8(skb, IFLA_BRPORT_MRP_IN_OPEN, + !!(p->flags & BR_MRP_LOST_IN_CONT)) || nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED))) return -EMSGSIZE; @@ -453,6 +456,28 @@ static int br_fill_ifinfo(struct sk_buff *skb, rcu_read_unlock(); if (err) goto nla_put_failure; + + nla_nest_end(skb, af); + } + + if (filter_mask & RTEXT_FILTER_MRP) { + struct nlattr *af; + int err; + + if (!br_mrp_enabled(br) || port) + goto done; + + af = nla_nest_start_noflag(skb, IFLA_AF_SPEC); + if (!af) + goto nla_put_failure; + + rcu_read_lock(); + err = br_mrp_fill_info(skb, br); + rcu_read_unlock(); + + if (err) + goto nla_put_failure; + nla_nest_end(skb, af); } @@ -516,7 +541,8 @@ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_bridge_port *port = br_port_get_rtnl(dev); if (!port && !(filter_mask & RTEXT_FILTER_BRVLAN) && - 
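/* Illustrative aside (not part of the patch): note the convention used
 * by the br_mrp_switchdev.c helpers above — "disable" is expressed as
 * switchdev object deletion:
 *
 *	if (role == BR_MRP_IN_ROLE_DISABLED)   // or interval == 0
 *		switchdev_port_obj_del(br->dev, &obj);
 *	else
 *		switchdev_port_obj_add(br->dev, &obj, NULL);
 *
 * Also note that only br_mrp_switchdev_set_in_state() flattens
 * -EOPNOTSUPP into success; the in_role/in_test helpers return it to
 * the caller, which presumably uses it to fall back to the software
 * datapath on hardware without MRP offload.
 */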
!(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) + !(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) && + !(filter_mask & RTEXT_FILTER_MRP)) return 0; return br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, nlflags, diff --git a/net/bridge/br_netlink_tunnel.c b/net/bridge/br_netlink_tunnel.c index 162998e2f039..8914290c75d4 100644 --- a/net/bridge/br_netlink_tunnel.c +++ b/net/bridge/br_netlink_tunnel.c @@ -250,6 +250,36 @@ int br_parse_vlan_tunnel_info(struct nlattr *attr, return 0; } +/* send a notification if v_curr can't enter the range and start a new one */ +static void __vlan_tunnel_handle_range(const struct net_bridge_port *p, + struct net_bridge_vlan **v_start, + struct net_bridge_vlan **v_end, + int v_curr, bool curr_change) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + + vg = nbp_vlan_group(p); + if (!vg) + return; + + v = br_vlan_find(vg, v_curr); + + if (!*v_start) + goto out_init; + + if (v && curr_change && br_vlan_can_enter_range(v, *v_end)) { + *v_end = v; + return; + } + + br_vlan_notify(p->br, p, (*v_start)->vid, (*v_end)->vid, RTM_NEWVLAN); +out_init: + /* we start a range only if there are any changes to notify about */ + *v_start = curr_change ? v : NULL; + *v_end = *v_start; +} + int br_process_vlan_tunnel_info(const struct net_bridge *br, const struct net_bridge_port *p, int cmd, struct vtunnel_info *tinfo_curr, @@ -263,6 +293,7 @@ int br_process_vlan_tunnel_info(const struct net_bridge *br, return -EINVAL; memcpy(tinfo_last, tinfo_curr, sizeof(struct vtunnel_info)); } else if (tinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_END) { + struct net_bridge_vlan *v_start = NULL, *v_end = NULL; int t, v; if (!(tinfo_last->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN)) @@ -272,11 +303,24 @@ int br_process_vlan_tunnel_info(const struct net_bridge *br, return -EINVAL; t = tinfo_last->tunid; for (v = tinfo_last->vid; v <= tinfo_curr->vid; v++) { - err = br_vlan_tunnel_info(p, cmd, v, t, changed); + bool curr_change = false; + + err = br_vlan_tunnel_info(p, cmd, v, t, &curr_change); if (err) - return err; + break; t++; + + if (curr_change) + *changed = curr_change; + __vlan_tunnel_handle_range(p, &v_start, &v_end, v, + curr_change); } + if (v_start && v_end) + br_vlan_notify(br, p, v_start->vid, v_end->vid, + RTM_NEWVLAN); + if (err) + return err; + memset(tinfo_last, 0, sizeof(struct vtunnel_info)); memset(tinfo_curr, 0, sizeof(struct vtunnel_info)); } else { @@ -286,6 +330,7 @@ int br_process_vlan_tunnel_info(const struct net_bridge *br, tinfo_curr->tunid, changed); if (err) return err; + br_vlan_notify(br, p, tinfo_curr->vid, 0, RTM_NEWVLAN); memset(tinfo_last, 0, sizeof(struct vtunnel_info)); memset(tinfo_curr, 0, sizeof(struct vtunnel_info)); } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index e0ea6dbbc97e..baa1500f384f 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -48,6 +48,8 @@ enum { /* Path to usermode spanning tree program */ #define BR_STP_PROG "/sbin/bridge-stp" +#define BR_FDB_NOTIFY_SETTABLE_BITS (FDB_NOTIFY_BIT | FDB_NOTIFY_INACTIVE_BIT) + typedef struct bridge_id bridge_id; typedef struct mac_addr mac_addr; typedef __u16 port_id; @@ -184,6 +186,8 @@ enum { BR_FDB_ADDED_BY_USER, BR_FDB_ADDED_BY_EXT_LEARN, BR_FDB_OFFLOADED, + BR_FDB_NOTIFY, + BR_FDB_NOTIFY_INACTIVE }; struct net_bridge_fdb_key { @@ -1196,6 +1200,12 @@ static inline void br_vlan_notify(const struct net_bridge *br, int cmd) { } + +static inline bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan 
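/* Illustrative aside (not part of the patch): worked example of the
 * range batching done by __vlan_tunnel_handle_range() in the
 * br_netlink_tunnel.c hunk above. Processing vids 10-20 where only
 * 10-14 and 17 actually change produces two RTM_NEWVLAN notifications,
 * for [10,14] and [17,17], instead of six per-vid messages:
 * v_start/v_end track the current run of changed vids, and a vid with
 * curr_change == false (or one that cannot extend the range) flushes
 * the pending run via br_vlan_notify().
 */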
*range_end) +{ + return true; +} #endif /* br_vlan_options.c */ @@ -1313,6 +1323,7 @@ int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb); bool br_mrp_enabled(struct net_bridge *br); void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p); +int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br); #else static inline int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *attr, int cmd, @@ -1335,6 +1346,12 @@ static inline void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p) { } + +static inline int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br) +{ + return 0; +} + #endif /* br_netlink.c */ diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h index 315eb37d89f0..af0e9eff6549 100644 --- a/net/bridge/br_private_mrp.h +++ b/net/bridge/br_private_mrp.h @@ -12,8 +12,10 @@ struct br_mrp { struct net_bridge_port __rcu *p_port; struct net_bridge_port __rcu *s_port; + struct net_bridge_port __rcu *i_port; u32 ring_id; + u16 in_id; u16 prio; enum br_mrp_ring_role_type ring_role; @@ -21,6 +23,11 @@ struct br_mrp { enum br_mrp_ring_state_type ring_state; u32 ring_transitions; + enum br_mrp_in_role_type in_role; + u8 in_role_offloaded; + enum br_mrp_in_state_type in_state; + u32 in_transitions; + struct delayed_work test_work; u32 test_interval; unsigned long test_end; @@ -28,6 +35,12 @@ struct br_mrp { u32 test_max_miss; bool test_monitor; + struct delayed_work in_test_work; + u32 in_test_interval; + unsigned long in_test_end; + u32 in_test_count_miss; + u32 in_test_max_miss; + u32 seq_id; struct rcu_head rcu; @@ -44,6 +57,10 @@ int br_mrp_set_ring_state(struct net_bridge *br, struct br_mrp_ring_state *state); int br_mrp_set_ring_role(struct net_bridge *br, struct br_mrp_ring_role *role); int br_mrp_start_test(struct net_bridge *br, struct br_mrp_start_test *test); +int br_mrp_set_in_state(struct net_bridge *br, struct br_mrp_in_state *state); +int br_mrp_set_in_role(struct net_bridge *br, struct br_mrp_in_role *role); +int br_mrp_start_in_test(struct net_bridge *br, + struct br_mrp_start_in_test *test); /* br_mrp_switchdev.c */ int br_mrp_switchdev_add(struct net_bridge *br, struct br_mrp *mrp); @@ -59,8 +76,16 @@ int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, enum br_mrp_port_state_type state); int br_mrp_port_switchdev_set_role(struct net_bridge_port *p, enum br_mrp_port_role_type role); +int br_mrp_switchdev_set_in_role(struct net_bridge *br, struct br_mrp *mrp, + u16 in_id, u32 ring_id, + enum br_mrp_in_role_type role); +int br_mrp_switchdev_set_in_state(struct net_bridge *br, struct br_mrp *mrp, + enum br_mrp_in_state_type state); +int br_mrp_switchdev_send_in_test(struct net_bridge *br, struct br_mrp *mrp, + u32 interval, u8 max_miss, u32 period); /* br_mrp_netlink.c */ -int br_mrp_port_open(struct net_device *dev, u8 loc); +int br_mrp_ring_port_open(struct net_device *dev, u8 loc); +int br_mrp_in_port_open(struct net_device *dev, u8 loc); #endif /* _BR_PRIVATE_MRP_H */ diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index b13b49b9f75c..1641f414d1ba 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1063,14 +1063,13 @@ free_counterstmp: } /* replace the table */ -static int do_replace(struct net *net, const void __user *user, - unsigned int len) +static int do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret, 
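/* Illustrative aside — a loose sketch, not the patch's code: the new
 * in_test_* fields added to struct br_mrp in the br_private_mrp.h hunk
 * above mirror the existing ring test_work pattern, so the MIM's
 * periodic InTest generator (implemented elsewhere in br_mrp.c, outside
 * this hunk) plausibly has this shape; the function and local names
 * below are assumed for illustration:
 *
 *	static void in_test_expired(struct work_struct *work)
 *	{
 *		struct delayed_work *dw = to_delayed_work(work);
 *		struct br_mrp *mrp = container_of(dw, struct br_mrp,
 *						  in_test_work);
 *
 *		if (time_before_eq(mrp->in_test_end, jiffies))
 *			return;		// test period over, stop rearming
 *		if (++mrp->in_test_count_miss > mrp->in_test_max_miss)
 *			br_mrp_in_port_open(i_dev, true); // IN ring open
 *		queue_delayed_work(system_wq, &mrp->in_test_work,
 *				   usecs_to_jiffies(mrp->in_test_interval));
 *	}
 */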
countersize; struct ebt_table_info *newinfo; struct ebt_replace tmp; - if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; if (len != sizeof(tmp) + tmp.entries_size) @@ -1242,9 +1241,8 @@ void ebt_unregister_table(struct net *net, struct ebt_table *table, /* userspace just supplied us with counters */ static int do_update_counters(struct net *net, const char *name, - struct ebt_counter __user *counters, - unsigned int num_counters, - const void __user *user, unsigned int len) + struct ebt_counter __user *counters, + unsigned int num_counters, unsigned int len) { int i, ret; struct ebt_counter *tmp; @@ -1287,19 +1285,18 @@ free_tmp: return ret; } -static int update_counters(struct net *net, const void __user *user, - unsigned int len) +static int update_counters(struct net *net, sockptr_t arg, unsigned int len) { struct ebt_replace hlp; - if (copy_from_user(&hlp, user, sizeof(hlp))) + if (copy_from_sockptr(&hlp, arg, sizeof(hlp))) return -EFAULT; if (len != sizeof(hlp) + hlp.num_counters * sizeof(struct ebt_counter)) return -EINVAL; return do_update_counters(net, hlp.name, hlp.counters, - hlp.num_counters, user, len); + hlp.num_counters, len); } static inline int ebt_obj_to_user(char __user *um, const char *_name, @@ -1451,86 +1448,6 @@ static int copy_everything_to_user(struct ebt_table *t, void __user *user, ebt_entry_to_user, entries, tmp.entries); } -static int do_ebt_set_ctl(struct sock *sk, - int cmd, void __user *user, unsigned int len) -{ - int ret; - struct net *net = sock_net(sk); - - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - switch (cmd) { - case EBT_SO_SET_ENTRIES: - ret = do_replace(net, user, len); - break; - case EBT_SO_SET_COUNTERS: - ret = update_counters(net, user, len); - break; - default: - ret = -EINVAL; - } - return ret; -} - -static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) -{ - int ret; - struct ebt_replace tmp; - struct ebt_table *t; - struct net *net = sock_net(sk); - - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - if (copy_from_user(&tmp, user, sizeof(tmp))) - return -EFAULT; - - tmp.name[sizeof(tmp.name) - 1] = '\0'; - - t = find_table_lock(net, tmp.name, &ret, &ebt_mutex); - if (!t) - return ret; - - switch (cmd) { - case EBT_SO_GET_INFO: - case EBT_SO_GET_INIT_INFO: - if (*len != sizeof(struct ebt_replace)) { - ret = -EINVAL; - mutex_unlock(&ebt_mutex); - break; - } - if (cmd == EBT_SO_GET_INFO) { - tmp.nentries = t->private->nentries; - tmp.entries_size = t->private->entries_size; - tmp.valid_hooks = t->valid_hooks; - } else { - tmp.nentries = t->table->nentries; - tmp.entries_size = t->table->entries_size; - tmp.valid_hooks = t->table->valid_hooks; - } - mutex_unlock(&ebt_mutex); - if (copy_to_user(user, &tmp, *len) != 0) { - ret = -EFAULT; - break; - } - ret = 0; - break; - - case EBT_SO_GET_ENTRIES: - case EBT_SO_GET_INIT_ENTRIES: - ret = copy_everything_to_user(t, user, len, cmd); - mutex_unlock(&ebt_mutex); - break; - - default: - mutex_unlock(&ebt_mutex); - ret = -EINVAL; - } - - return ret; -} - #ifdef CONFIG_COMPAT /* 32 bit-userspace compatibility definitions. 
*/ struct compat_ebt_replace { @@ -1935,7 +1852,7 @@ static int compat_mtw_from_user(const struct compat_ebt_entry_mwt *mwt, size_kern = match_size; module_put(match->me); break; - case EBT_COMPAT_WATCHER: /* fallthrough */ + case EBT_COMPAT_WATCHER: case EBT_COMPAT_TARGET: wt = xt_request_find_target(NFPROTO_BRIDGE, name, mwt->u.revision); @@ -2160,7 +2077,7 @@ static int compat_copy_entries(unsigned char *data, unsigned int size_user, static int compat_copy_ebt_replace_from_user(struct ebt_replace *repl, - void __user *user, unsigned int len) + sockptr_t arg, unsigned int len) { struct compat_ebt_replace tmp; int i; @@ -2168,7 +2085,7 @@ static int compat_copy_ebt_replace_from_user(struct ebt_replace *repl, if (len < sizeof(tmp)) return -EINVAL; - if (copy_from_user(&tmp, user, sizeof(tmp))) + if (copy_from_sockptr(&tmp, arg, sizeof(tmp))) return -EFAULT; if (len != sizeof(tmp) + tmp.entries_size) @@ -2195,8 +2112,7 @@ static int compat_copy_ebt_replace_from_user(struct ebt_replace *repl, return 0; } -static int compat_do_replace(struct net *net, void __user *user, - unsigned int len) +static int compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret, i, countersize, size64; struct ebt_table_info *newinfo; @@ -2204,10 +2120,10 @@ static int compat_do_replace(struct net *net, void __user *user, struct ebt_entries_buf_state state; void *entries_tmp; - ret = compat_copy_ebt_replace_from_user(&tmp, user, len); + ret = compat_copy_ebt_replace_from_user(&tmp, arg, len); if (ret) { /* try real handler in case userland supplied needed padding */ - if (ret == -EINVAL && do_replace(net, user, len) == 0) + if (ret == -EINVAL && do_replace(net, arg, len) == 0) ret = 0; return ret; } @@ -2298,42 +2214,20 @@ out_unlock: goto free_entries; } -static int compat_update_counters(struct net *net, void __user *user, +static int compat_update_counters(struct net *net, sockptr_t arg, unsigned int len) { struct compat_ebt_replace hlp; - if (copy_from_user(&hlp, user, sizeof(hlp))) + if (copy_from_sockptr(&hlp, arg, sizeof(hlp))) return -EFAULT; /* try real handler in case userland supplied needed padding */ if (len != sizeof(hlp) + hlp.num_counters * sizeof(struct ebt_counter)) - return update_counters(net, user, len); + return update_counters(net, arg, len); return do_update_counters(net, hlp.name, compat_ptr(hlp.counters), - hlp.num_counters, user, len); -} - -static int compat_do_ebt_set_ctl(struct sock *sk, - int cmd, void __user *user, unsigned int len) -{ - int ret; - struct net *net = sock_net(sk); - - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - switch (cmd) { - case EBT_SO_SET_ENTRIES: - ret = compat_do_replace(net, user, len); - break; - case EBT_SO_SET_COUNTERS: - ret = compat_update_counters(net, user, len); - break; - default: - ret = -EINVAL; - } - return ret; + hlp.num_counters, len); } static int compat_do_ebt_get_ctl(struct sock *sk, int cmd, @@ -2344,14 +2238,6 @@ static int compat_do_ebt_get_ctl(struct sock *sk, int cmd, struct ebt_table *t; struct net *net = sock_net(sk); - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - /* try real handler in case userland supplied needed padding */ - if ((cmd == EBT_SO_GET_INFO || - cmd == EBT_SO_GET_INIT_INFO) && *len != sizeof(tmp)) - return do_ebt_get_ctl(sk, cmd, user, len); - if (copy_from_user(&tmp, user, sizeof(tmp))) return -EFAULT; @@ -2413,20 +2299,112 @@ static int compat_do_ebt_get_ctl(struct sock *sk, int cmd, } #endif +static int do_ebt_get_ctl(struct sock *sk, int cmd, void 
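/* Illustrative aside (not part of the patch): the unified handlers
 * below replace the dedicated .compat_set/.compat_get nf_sockopt hooks
 * with runtime dispatch; the general shape is:
 *
 *	if (in_compat_syscall())
 *		ret = compat_do_replace(net, arg, len);  // 32-bit layout
 *	else
 *		ret = do_replace(net, arg, len);
 *
 * sockptr_t is what makes this workable: the same entry point can now
 * be handed either a user or a kernel pointer, so the compat path no
 * longer needs its own prototype taking void __user pointers.
 */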
__user *user, int *len) +{ + struct net *net = sock_net(sk); + struct ebt_replace tmp; + struct ebt_table *t; + int ret; + + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + +#ifdef CONFIG_COMPAT + /* try real handler in case userland supplied needed padding */ + if (in_compat_syscall() && + ((cmd != EBT_SO_GET_INFO && cmd != EBT_SO_GET_INIT_INFO) || + *len != sizeof(tmp))) + return compat_do_ebt_get_ctl(sk, cmd, user, len); +#endif + + if (copy_from_user(&tmp, user, sizeof(tmp))) + return -EFAULT; + + tmp.name[sizeof(tmp.name) - 1] = '\0'; + + t = find_table_lock(net, tmp.name, &ret, &ebt_mutex); + if (!t) + return ret; + + switch (cmd) { + case EBT_SO_GET_INFO: + case EBT_SO_GET_INIT_INFO: + if (*len != sizeof(struct ebt_replace)) { + ret = -EINVAL; + mutex_unlock(&ebt_mutex); + break; + } + if (cmd == EBT_SO_GET_INFO) { + tmp.nentries = t->private->nentries; + tmp.entries_size = t->private->entries_size; + tmp.valid_hooks = t->valid_hooks; + } else { + tmp.nentries = t->table->nentries; + tmp.entries_size = t->table->entries_size; + tmp.valid_hooks = t->table->valid_hooks; + } + mutex_unlock(&ebt_mutex); + if (copy_to_user(user, &tmp, *len) != 0) { + ret = -EFAULT; + break; + } + ret = 0; + break; + + case EBT_SO_GET_ENTRIES: + case EBT_SO_GET_INIT_ENTRIES: + ret = copy_everything_to_user(t, user, len, cmd); + mutex_unlock(&ebt_mutex); + break; + + default: + mutex_unlock(&ebt_mutex); + ret = -EINVAL; + } + + return ret; +} + +static int do_ebt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, + unsigned int len) +{ + struct net *net = sock_net(sk); + int ret; + + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case EBT_SO_SET_ENTRIES: +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = compat_do_replace(net, arg, len); + else +#endif + ret = do_replace(net, arg, len); + break; + case EBT_SO_SET_COUNTERS: +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = compat_update_counters(net, arg, len); + else +#endif + ret = update_counters(net, arg, len); + break; + default: + ret = -EINVAL; + } + return ret; +} + static struct nf_sockopt_ops ebt_sockopts = { .pf = PF_INET, .set_optmin = EBT_BASE_CTL, .set_optmax = EBT_SO_SET_MAX + 1, .set = do_ebt_set_ctl, -#ifdef CONFIG_COMPAT - .compat_set = compat_do_ebt_set_ctl, -#endif .get_optmin = EBT_BASE_CTL, .get_optmax = EBT_SO_GET_MAX + 1, .get = do_ebt_get_ctl, -#ifdef CONFIG_COMPAT - .compat_get = compat_do_ebt_get_ctl, -#endif .owner = THIS_MODULE, }; diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index ef14da50a981..3ad0a1df6712 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -669,8 +669,8 @@ out_err: return sent ? 
: err;
 }
 
-static int setsockopt(struct socket *sock,
-		      int lvl, int opt, char __user *ov, unsigned int ol)
+static int setsockopt(struct socket *sock, int lvl, int opt, sockptr_t ov,
+		      unsigned int ol)
 {
 	struct sock *sk = sock->sk;
 	struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
@@ -685,7 +685,7 @@ static int setsockopt(struct socket *sock,
 		return -EINVAL;
 	if (lvl != SOL_CAIF)
 		goto bad_sol;
-	if (copy_from_user(&linksel, ov, sizeof(int)))
+	if (copy_from_sockptr(&linksel, ov, sizeof(int)))
 		return -EINVAL;
 	lock_sock(&(cf_sk->sk));
 	cf_sk->conn_req.link_selector = linksel;
@@ -699,7 +699,7 @@ static int setsockopt(struct socket *sock,
 		return -ENOPROTOOPT;
 	lock_sock(&(cf_sk->sk));
 	if (ol > sizeof(cf_sk->conn_req.param.data) ||
-	    copy_from_user(&cf_sk->conn_req.param.data, ov, ol)) {
+	    copy_from_sockptr(&cf_sk->conn_req.param.data, ov, ol)) {
 		release_sock(&cf_sk->sk);
 		return -EINVAL;
 	}
@@ -981,7 +981,6 @@ static const struct proto_ops caif_seqpacket_ops = {
 	.listen = sock_no_listen,
 	.shutdown = sock_no_shutdown,
 	.setsockopt = setsockopt,
-	.getsockopt = sock_no_getsockopt,
 	.sendmsg = caif_seqpkt_sendmsg,
 	.recvmsg = caif_seqpkt_recvmsg,
 	.mmap = sock_no_mmap,
@@ -1002,7 +1001,6 @@ static const struct proto_ops caif_stream_ops = {
 	.listen = sock_no_listen,
 	.shutdown = sock_no_shutdown,
 	.setsockopt = setsockopt,
-	.getsockopt = sock_no_getsockopt,
 	.sendmsg = caif_stream_sendmsg,
 	.recvmsg = caif_stream_recvmsg,
 	.mmap = sock_no_mmap,
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 128d37a4c2e0..5c06404bdf3e 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -410,6 +410,7 @@ static struct hlist_head *can_rcv_list_find(canid_t *can_id, canid_t *mask,
 /**
  * can_rx_register - subscribe CAN frames from a specific interface
+ * @net: the applicable net namespace
  * @dev: pointer to netdevice (NULL => subscribe from 'all' CAN devices list)
  * @can_id: CAN identifier (see description)
  * @mask: CAN mask (see description)
@@ -498,6 +499,7 @@ static void can_rx_delete_receiver(struct rcu_head *rp)
 /**
  * can_rx_unregister - unsubscribe CAN frames from a specific interface
+ * @net: the applicable net namespace
  * @dev: pointer to netdevice (NULL => unsubscribe from 'all' CAN devices list)
  * @can_id: CAN identifier
  * @mask: CAN mask
diff --git a/net/can/bcm.c b/net/can/bcm.c
index c96fa0f33db3..d14ea12affb1 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -1648,8 +1648,6 @@ static const struct proto_ops bcm_ops = {
 	.gettstamp = sock_gettstamp,
 	.listen = sock_no_listen,
 	.shutdown = sock_no_shutdown,
-	.setsockopt = sock_no_setsockopt,
-	.getsockopt = sock_no_getsockopt,
 	.sendmsg = bcm_sendmsg,
 	.recvmsg = bcm_recvmsg,
 	.mmap = sock_no_mmap,
diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c
index f7587428febd..78ff9b3f1d40 100644
--- a/net/can/j1939/socket.c
+++ b/net/can/j1939/socket.c
@@ -627,14 +627,14 @@ static int j1939_sk_release(struct socket *sock)
 	return 0;
 }
 
-static int j1939_sk_setsockopt_flag(struct j1939_sock *jsk, char __user *optval,
+static int j1939_sk_setsockopt_flag(struct j1939_sock *jsk, sockptr_t optval,
 				    unsigned int optlen, int flag)
 {
 	int tmp;
 
 	if (optlen != sizeof(tmp))
 		return -EINVAL;
-	if (copy_from_user(&tmp, optval, optlen))
+	if (copy_from_sockptr(&tmp, optval, optlen))
 		return -EFAULT;
 	lock_sock(&jsk->sk);
 	if (tmp)
@@ -646,7 +646,7 @@ static int j1939_sk_setsockopt_flag(struct j1939_sock *jsk, char __user *optval,
 }
 
 static int j1939_sk_setsockopt(struct socket *sock, int level, int optname,
-			       char __user *optval, unsigned int optlen)
+			       sockptr_t optval,
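/* Illustrative aside (not part of the patch): a mental model of
 * copy_from_sockptr(), which replaces copy_from_user() throughout this
 * series — not the real implementation, which goes through an
 * offset-taking helper:
 *
 *	static inline int copy_from_sockptr(void *dst, sockptr_t src,
 *					    size_t size)
 *	{
 *		if (sockptr_is_kernel(src))
 *			memcpy(dst, src.kernel, size);
 *		else if (copy_from_user(dst, src.user, size))
 *			return -EFAULT;
 *		return 0;
 *	}
 *
 * This lets in-kernel callers invoke setsockopt handlers with kernel
 * buffers while userspace keeps the old copy semantics.
 */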
unsigned int optlen) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); @@ -658,7 +658,7 @@ static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, switch (optname) { case SO_J1939_FILTER: - if (optval) { + if (!sockptr_is_null(optval)) { struct j1939_filter *f; int c; @@ -670,7 +670,7 @@ static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, return -EINVAL; count = optlen / sizeof(*filters); - filters = memdup_user(optval, optlen); + filters = memdup_sockptr(optval, optlen); if (IS_ERR(filters)) return PTR_ERR(filters); @@ -703,7 +703,7 @@ static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, case SO_J1939_SEND_PRIO: if (optlen != sizeof(tmp)) return -EINVAL; - if (copy_from_user(&tmp, optval, optlen)) + if (copy_from_sockptr(&tmp, optval, optlen)) return -EFAULT; if (tmp < 0 || tmp > 7) return -EDOM; diff --git a/net/can/raw.c b/net/can/raw.c index 59c039d73c6d..94a9405658dc 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -485,7 +485,7 @@ static int raw_getname(struct socket *sock, struct sockaddr *uaddr, } static int raw_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct raw_sock *ro = raw_sk(sk); @@ -511,11 +511,11 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, if (count > 1) { /* filter does not fit into dfilter => alloc space */ - filter = memdup_user(optval, optlen); + filter = memdup_sockptr(optval, optlen); if (IS_ERR(filter)) return PTR_ERR(filter); } else if (count == 1) { - if (copy_from_user(&sfilter, optval, sizeof(sfilter))) + if (copy_from_sockptr(&sfilter, optval, sizeof(sfilter))) return -EFAULT; } @@ -568,7 +568,7 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, if (optlen != sizeof(err_mask)) return -EINVAL; - if (copy_from_user(&err_mask, optval, optlen)) + if (copy_from_sockptr(&err_mask, optval, optlen)) return -EFAULT; err_mask &= CAN_ERR_MASK; @@ -607,7 +607,7 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, if (optlen != sizeof(ro->loopback)) return -EINVAL; - if (copy_from_user(&ro->loopback, optval, optlen)) + if (copy_from_sockptr(&ro->loopback, optval, optlen)) return -EFAULT; break; @@ -616,7 +616,7 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, if (optlen != sizeof(ro->recv_own_msgs)) return -EINVAL; - if (copy_from_user(&ro->recv_own_msgs, optval, optlen)) + if (copy_from_sockptr(&ro->recv_own_msgs, optval, optlen)) return -EFAULT; break; @@ -625,7 +625,7 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, if (optlen != sizeof(ro->fd_frames)) return -EINVAL; - if (copy_from_user(&ro->fd_frames, optval, optlen)) + if (copy_from_sockptr(&ro->fd_frames, optval, optlen)) return -EFAULT; break; @@ -634,7 +634,7 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, if (optlen != sizeof(ro->join_filters)) return -EINVAL; - if (copy_from_user(&ro->join_filters, optval, optlen)) + if (copy_from_sockptr(&ro->join_filters, optval, optlen)) return -EFAULT; break; diff --git a/net/compat.c b/net/compat.c index 77f3a0e98fd0..703acb51c698 100644 --- a/net/compat.c +++ b/net/compat.c @@ -330,120 +330,6 @@ void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm) __scm_destroy(scm); } -/* allocate a 64-bit sock_fprog on the user stack for duration of syscall. 
*/ -struct sock_fprog __user *get_compat_bpf_fprog(char __user *optval) -{ - struct compat_sock_fprog __user *fprog32 = (struct compat_sock_fprog __user *)optval; - struct sock_fprog __user *kfprog = compat_alloc_user_space(sizeof(struct sock_fprog)); - struct compat_sock_fprog f32; - struct sock_fprog f; - - if (copy_from_user(&f32, fprog32, sizeof(*fprog32))) - return NULL; - memset(&f, 0, sizeof(f)); - f.len = f32.len; - f.filter = compat_ptr(f32.filter); - if (copy_to_user(kfprog, &f, sizeof(struct sock_fprog))) - return NULL; - - return kfprog; -} -EXPORT_SYMBOL_GPL(get_compat_bpf_fprog); - -static int do_set_attach_filter(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) -{ - struct sock_fprog __user *kfprog; - - kfprog = get_compat_bpf_fprog(optval); - if (!kfprog) - return -EFAULT; - - return sock_setsockopt(sock, level, optname, (char __user *)kfprog, - sizeof(struct sock_fprog)); -} - -static int compat_sock_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) -{ - if (optname == SO_ATTACH_FILTER || - optname == SO_ATTACH_REUSEPORT_CBPF) - return do_set_attach_filter(sock, level, optname, - optval, optlen); - return sock_setsockopt(sock, level, optname, optval, optlen); -} - -static int __compat_sys_setsockopt(int fd, int level, int optname, - char __user *optval, unsigned int optlen) -{ - int err; - struct socket *sock; - - if (optlen > INT_MAX) - return -EINVAL; - - sock = sockfd_lookup(fd, &err); - if (sock) { - err = security_socket_setsockopt(sock, level, optname); - if (err) { - sockfd_put(sock); - return err; - } - - if (level == SOL_SOCKET) - err = compat_sock_setsockopt(sock, level, - optname, optval, optlen); - else if (sock->ops->compat_setsockopt) - err = sock->ops->compat_setsockopt(sock, level, - optname, optval, optlen); - else - err = sock->ops->setsockopt(sock, level, - optname, optval, optlen); - sockfd_put(sock); - } - return err; -} - -COMPAT_SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, - char __user *, optval, unsigned int, optlen) -{ - return __compat_sys_setsockopt(fd, level, optname, optval, optlen); -} - -static int __compat_sys_getsockopt(int fd, int level, int optname, - char __user *optval, - int __user *optlen) -{ - int err; - struct socket *sock = sockfd_lookup(fd, &err); - - if (sock) { - err = security_socket_getsockopt(sock, level, optname); - if (err) { - sockfd_put(sock); - return err; - } - - if (level == SOL_SOCKET) - err = sock_getsockopt(sock, level, - optname, optval, optlen); - else if (sock->ops->compat_getsockopt) - err = sock->ops->compat_getsockopt(sock, level, - optname, optval, optlen); - else - err = sock->ops->getsockopt(sock, level, - optname, optval, optlen); - sockfd_put(sock); - } - return err; -} - -COMPAT_SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, - char __user *, optval, int __user *, optlen) -{ - return __compat_sys_getsockopt(fd, level, optname, optval, optlen); -} - /* Argument list sizes for compat_sys_socketcall */ #define AL(x) ((x) * sizeof(u32)) static unsigned char nas[21] = { @@ -603,13 +489,11 @@ COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args) ret = __sys_shutdown(a0, a1); break; case SYS_SETSOCKOPT: - ret = __compat_sys_setsockopt(a0, a1, a[2], - compat_ptr(a[3]), a[4]); + ret = __sys_setsockopt(a0, a1, a[2], compat_ptr(a[3]), a[4]); break; case SYS_GETSOCKOPT: - ret = __compat_sys_getsockopt(a0, a1, a[2], - compat_ptr(a[3]), - compat_ptr(a[4])); + ret = 
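/* Illustrative aside (not part of the patch): with the dedicated compat
 * wrappers above deleted, both ABIs converge on the native entry
 * points, e.g. for a 32-bit task:
 *
 *	compat socketcall(SYS_SETSOCKOPT, args)
 *	    -> __sys_setsockopt(a0, a1, a[2], compat_ptr(a[3]), a[4])
 *
 * Layout fixups that used to live in compat_sock_setsockopt() — such as
 * the 32-bit struct sock_fprog for SO_ATTACH_FILTER — are from now on
 * handled inside the individual setsockopt implementations, keyed off
 * in_compat_syscall().
 */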
__sys_getsockopt(a0, a1, a[2], compat_ptr(a[3]), + compat_ptr(a[4])); break; case SYS_SENDMSG: ret = __compat_sys_sendmsg(a0, compat_ptr(a1), a[2]); diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index d2c4d16dadba..d3377c90a291 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -6,13 +6,12 @@ #include <linux/types.h> #include <linux/spinlock.h> #include <linux/bpf.h> +#include <linux/btf_ids.h> #include <net/bpf_sk_storage.h> #include <net/sock.h> #include <uapi/linux/sock_diag.h> #include <uapi/linux/btf.h> -static atomic_t cache_idx; - #define SK_STORAGE_CREATE_FLAG_MASK \ (BPF_F_NO_PREALLOC | BPF_F_CLONE) @@ -81,6 +80,9 @@ struct bpf_sk_storage_elem { #define SDATA(_SELEM) (&(_SELEM)->sdata) #define BPF_SK_STORAGE_CACHE_SIZE 16 +static DEFINE_SPINLOCK(cache_idx_lock); +static u64 cache_idx_usage_counts[BPF_SK_STORAGE_CACHE_SIZE]; + struct bpf_sk_storage { struct bpf_sk_storage_data __rcu *cache[BPF_SK_STORAGE_CACHE_SIZE]; struct hlist_head list; /* List of bpf_sk_storage_elem */ @@ -512,6 +514,37 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map) return 0; } +static u16 cache_idx_get(void) +{ + u64 min_usage = U64_MAX; + u16 i, res = 0; + + spin_lock(&cache_idx_lock); + + for (i = 0; i < BPF_SK_STORAGE_CACHE_SIZE; i++) { + if (cache_idx_usage_counts[i] < min_usage) { + min_usage = cache_idx_usage_counts[i]; + res = i; + + /* Found a free cache_idx */ + if (!min_usage) + break; + } + } + cache_idx_usage_counts[res]++; + + spin_unlock(&cache_idx_lock); + + return res; +} + +static void cache_idx_free(u16 idx) +{ + spin_lock(&cache_idx_lock); + cache_idx_usage_counts[idx]--; + spin_unlock(&cache_idx_lock); +} + /* Called by __sk_destruct() & bpf_sk_storage_clone() */ void bpf_sk_storage_free(struct sock *sk) { @@ -560,6 +593,8 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) smap = (struct bpf_sk_storage_map *)map; + cache_idx_free(smap->cache_idx); + /* Note that this map might be concurrently cloned from * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone * RCU read section to finish before proceeding. 
New RCU @@ -673,8 +708,7 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) } smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size; - smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) % - BPF_SK_STORAGE_CACHE_SIZE; + smap->cache_idx = cache_idx_get(); return &smap->map; } @@ -886,6 +920,7 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) return -ENOENT; } +static int sk_storage_map_btf_id; const struct bpf_map_ops sk_storage_map_ops = { .map_alloc_check = bpf_sk_storage_map_alloc_check, .map_alloc = bpf_sk_storage_map_alloc, @@ -895,6 +930,8 @@ const struct bpf_map_ops sk_storage_map_ops = { .map_update_elem = bpf_fd_sk_storage_update_elem, .map_delete_elem = bpf_fd_sk_storage_delete_elem, .map_check_btf = bpf_sk_storage_map_check_btf, + .map_btf_name = "bpf_sk_storage_map", + .map_btf_id = &sk_storage_map_btf_id, }; const struct bpf_func_proto bpf_sk_storage_get_proto = { @@ -907,6 +944,16 @@ const struct bpf_func_proto bpf_sk_storage_get_proto = { .arg4_type = ARG_ANYTHING, }; +const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto = { + .func = bpf_sk_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_CTX, /* context is 'struct sock' */ + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + const struct bpf_func_proto bpf_sk_storage_delete_proto = { .func = bpf_sk_storage_delete, .gpl_only = false, @@ -1181,3 +1228,208 @@ int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, return err; } EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_put); + +struct bpf_iter_seq_sk_storage_map_info { + struct bpf_map *map; + unsigned int bucket_id; + unsigned skip_elems; +}; + +static struct bpf_sk_storage_elem * +bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, + struct bpf_sk_storage_elem *prev_selem) +{ + struct bpf_sk_storage *sk_storage; + struct bpf_sk_storage_elem *selem; + u32 skip_elems = info->skip_elems; + struct bpf_sk_storage_map *smap; + u32 bucket_id = info->bucket_id; + u32 i, count, n_buckets; + struct bucket *b; + + smap = (struct bpf_sk_storage_map *)info->map; + n_buckets = 1U << smap->bucket_log; + if (bucket_id >= n_buckets) + return NULL; + + /* try to find next selem in the same bucket */ + selem = prev_selem; + count = 0; + while (selem) { + selem = hlist_entry_safe(selem->map_node.next, + struct bpf_sk_storage_elem, map_node); + if (!selem) { + /* not found, unlock and go to the next bucket */ + b = &smap->buckets[bucket_id++]; + raw_spin_unlock_bh(&b->lock); + skip_elems = 0; + break; + } + sk_storage = rcu_dereference_raw(selem->sk_storage); + if (sk_storage) { + info->skip_elems = skip_elems + count; + return selem; + } + count++; + } + + for (i = bucket_id; i < (1U << smap->bucket_log); i++) { + b = &smap->buckets[i]; + raw_spin_lock_bh(&b->lock); + count = 0; + hlist_for_each_entry(selem, &b->list, map_node) { + sk_storage = rcu_dereference_raw(selem->sk_storage); + if (sk_storage && count >= skip_elems) { + info->bucket_id = i; + info->skip_elems = count; + return selem; + } + count++; + } + raw_spin_unlock_bh(&b->lock); + skip_elems = 0; + } + + info->bucket_id = i; + info->skip_elems = 0; + return NULL; +} + +static void *bpf_sk_storage_map_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_sk_storage_elem *selem; + + selem = bpf_sk_storage_map_seq_find_next(seq->private, NULL); + if (!selem) + return NULL; + + if (*pos == 0) + 
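/* Illustrative aside (not part of the patch): worked example of the
 * cache_idx_get()/cache_idx_free() pair earlier in this file. The old
 * allocator, atomic_inc_return() % BPF_SK_STORAGE_CACHE_SIZE, kept
 * marching forward regardless of frees, so two live maps could share a
 * cache slot while freed slots sat idle. The new allocator scans
 * cache_idx_usage_counts[] under a spinlock for the least-used slot, so
 * a freed slot (count back to 0) is always reused before any slot is
 * shared by two live maps.
 */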
++*pos; + return selem; +} + +static void *bpf_sk_storage_map_seq_next(struct seq_file *seq, void *v, + loff_t *pos) +{ + struct bpf_iter_seq_sk_storage_map_info *info = seq->private; + + ++*pos; + ++info->skip_elems; + return bpf_sk_storage_map_seq_find_next(seq->private, v); +} + +struct bpf_iter__bpf_sk_storage_map { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_map *, map); + __bpf_md_ptr(struct sock *, sk); + __bpf_md_ptr(void *, value); +}; + +DEFINE_BPF_ITER_FUNC(bpf_sk_storage_map, struct bpf_iter_meta *meta, + struct bpf_map *map, struct sock *sk, + void *value) + +static int __bpf_sk_storage_map_seq_show(struct seq_file *seq, + struct bpf_sk_storage_elem *selem) +{ + struct bpf_iter_seq_sk_storage_map_info *info = seq->private; + struct bpf_iter__bpf_sk_storage_map ctx = {}; + struct bpf_sk_storage *sk_storage; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, selem == NULL); + if (prog) { + ctx.meta = &meta; + ctx.map = info->map; + if (selem) { + sk_storage = rcu_dereference_raw(selem->sk_storage); + ctx.sk = sk_storage->sk; + ctx.value = SDATA(selem)->data; + } + ret = bpf_iter_run_prog(prog, &ctx); + } + + return ret; +} + +static int bpf_sk_storage_map_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_sk_storage_map_seq_show(seq, v); +} + +static void bpf_sk_storage_map_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_seq_sk_storage_map_info *info = seq->private; + struct bpf_sk_storage_map *smap; + struct bucket *b; + + if (!v) { + (void)__bpf_sk_storage_map_seq_show(seq, v); + } else { + smap = (struct bpf_sk_storage_map *)info->map; + b = &smap->buckets[info->bucket_id]; + raw_spin_unlock_bh(&b->lock); + } +} + +static int bpf_iter_init_sk_storage_map(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct bpf_iter_seq_sk_storage_map_info *seq_info = priv_data; + + seq_info->map = aux->map; + return 0; +} + +static int bpf_iter_check_map(struct bpf_prog *prog, + struct bpf_iter_aux_info *aux) +{ + struct bpf_map *map = aux->map; + + if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) + return -EINVAL; + + if (prog->aux->max_rdonly_access > map->value_size) + return -EACCES; + + return 0; +} + +static const struct seq_operations bpf_sk_storage_map_seq_ops = { + .start = bpf_sk_storage_map_seq_start, + .next = bpf_sk_storage_map_seq_next, + .stop = bpf_sk_storage_map_seq_stop, + .show = bpf_sk_storage_map_seq_show, +}; + +static const struct bpf_iter_seq_info iter_seq_info = { + .seq_ops = &bpf_sk_storage_map_seq_ops, + .init_seq_private = bpf_iter_init_sk_storage_map, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct bpf_iter_seq_sk_storage_map_info), +}; + +static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { + .target = "bpf_sk_storage_map", + .check_target = bpf_iter_check_map, + .req_linfo = BPF_ITER_LINK_MAP_FD, + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), + PTR_TO_BTF_ID_OR_NULL }, + { offsetof(struct bpf_iter__bpf_sk_storage_map, value), + PTR_TO_RDWR_BUF_OR_NULL }, + }, + .seq_info = &iter_seq_info, +}; + +static int __init bpf_sk_storage_map_iter_init(void) +{ + bpf_sk_storage_map_reg_info.ctx_arg_info[0].btf_id = + btf_sock_ids[BTF_SOCK_TYPE_SOCK]; + return bpf_iter_reg_target(&bpf_sk_storage_map_reg_info); +} +late_initcall(bpf_sk_storage_map_iter_init); diff --git a/net/core/dev.c b/net/core/dev.c index ba4de97b676b..7df6c9617321 100644 --- a/net/core/dev.c +++ 
b/net/core/dev.c @@ -143,6 +143,7 @@ #include <linux/net_namespace.h> #include <linux/indirect_call_wrapper.h> #include <net/devlink.h> +#include <linux/pm_runtime.h> #include "net-sysfs.h" @@ -1492,8 +1493,13 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) ASSERT_RTNL(); - if (!netif_device_present(dev)) - return -ENODEV; + if (!netif_device_present(dev)) { + /* may be detached because parent is runtime-suspended */ + if (dev->dev.parent) + pm_runtime_resume(dev->dev.parent); + if (!netif_device_present(dev)) + return -ENODEV; + } /* Block netpoll from trying to do any rx path servicing. * If we don't do this there is a chance ndo_poll_controller @@ -3448,10 +3454,9 @@ static netdev_features_t net_mpls_features(struct sk_buff *skb, static netdev_features_t harmonize_features(struct sk_buff *skb, netdev_features_t features) { - int tmp; __be16 type; - type = skb_network_protocol(skb, &tmp); + type = skb_network_protocol(skb, NULL); features = net_mpls_features(skb, features, type); if (skb->ip_summed != CHECKSUM_NONE && @@ -5442,6 +5447,8 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) for (i = 0; i < new->aux->used_map_cnt; i++) { if (dev_map_can_have_prog(new->aux->used_maps[i])) return -EINVAL; + if (cpu_map_prog_allowed(new->aux->used_maps[i])) + return -EINVAL; } } @@ -5460,10 +5467,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) } break; - case XDP_QUERY_PROG: - xdp->prog_id = old ? old->aux->id : 0; - break; - default: ret = -EINVAL; break; @@ -5585,7 +5588,7 @@ void netif_receive_skb_list(struct list_head *head) } EXPORT_SYMBOL(netif_receive_skb_list); -DEFINE_PER_CPU(struct work_struct, flush_works); +static DEFINE_PER_CPU(struct work_struct, flush_works); /* Network device is going away, flush any packets still pending */ static void flush_backlog(struct work_struct *work) @@ -6685,7 +6688,9 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) trace_napi_poll(n, work, weight); } - WARN_ON_ONCE(work > weight); + if (unlikely(work > weight)) + pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n", + n->poll, work, weight); if (likely(work < weight)) goto out_unlock; @@ -8706,182 +8711,489 @@ int dev_change_proto_down_generic(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down_generic); -u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, - enum bpf_netdev_command cmd) +/** + * dev_change_proto_down_reason - proto down reason + * + * @dev: device + * @mask: proto down mask + * @value: proto down value + */ +void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, + u32 value) { - struct netdev_bpf xdp; + int b; - if (!bpf_op) - return 0; + if (!mask) { + dev->proto_down_reason = value; + } else { + for_each_set_bit(b, &mask, 32) { + if (value & (1 << b)) + dev->proto_down_reason |= BIT(b); + else + dev->proto_down_reason &= ~BIT(b); + } + } +} +EXPORT_SYMBOL(dev_change_proto_down_reason); - memset(&xdp, 0, sizeof(xdp)); - xdp.command = cmd; +struct bpf_xdp_link { + struct bpf_link link; + struct net_device *dev; /* protected by rtnl_lock, no refcnt held */ + int flags; +}; - /* Query must always succeed. 
*/
-	WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG);
+static enum bpf_xdp_mode dev_xdp_mode(u32 flags)
+{
+	if (flags & XDP_FLAGS_HW_MODE)
+		return XDP_MODE_HW;
+	if (flags & XDP_FLAGS_DRV_MODE)
+		return XDP_MODE_DRV;
+	return XDP_MODE_SKB;
+}
 
-	return xdp.prog_id;
+static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
+{
+	switch (mode) {
+	case XDP_MODE_SKB:
+		return generic_xdp_install;
+	case XDP_MODE_DRV:
+	case XDP_MODE_HW:
+		return dev->netdev_ops->ndo_bpf;
+	default:
+		return NULL;
+	};
+}
+
+static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
+					 enum bpf_xdp_mode mode)
+{
+	return dev->xdp_state[mode].link;
 }
 
-static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
-			   struct netlink_ext_ack *extack, u32 flags,
-			   struct bpf_prog *prog)
+static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
+				     enum bpf_xdp_mode mode)
+{
+	struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
+
+	if (link)
+		return link->link.prog;
+	return dev->xdp_state[mode].prog;
+}
+
+u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
+{
+	struct bpf_prog *prog = dev_xdp_prog(dev, mode);
+
+	return prog ? prog->aux->id : 0;
+}
+
+static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
+			     struct bpf_xdp_link *link)
+{
+	dev->xdp_state[mode].link = link;
+	dev->xdp_state[mode].prog = NULL;
+}
+
+static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
+			     struct bpf_prog *prog)
+{
+	dev->xdp_state[mode].link = NULL;
+	dev->xdp_state[mode].prog = prog;
+}
+
+static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
+			   bpf_op_t bpf_op, struct netlink_ext_ack *extack,
+			   u32 flags, struct bpf_prog *prog)
 {
-	bool non_hw = !(flags & XDP_FLAGS_HW_MODE);
-	struct bpf_prog *prev_prog = NULL;
 	struct netdev_bpf xdp;
 	int err;
 
-	if (non_hw) {
-		prev_prog = bpf_prog_by_id(__dev_xdp_query(dev, bpf_op,
-							   XDP_QUERY_PROG));
-		if (IS_ERR(prev_prog))
-			prev_prog = NULL;
-	}
-
 	memset(&xdp, 0, sizeof(xdp));
-	if (flags & XDP_FLAGS_HW_MODE)
-		xdp.command = XDP_SETUP_PROG_HW;
-	else
-		xdp.command = XDP_SETUP_PROG;
+	xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
 	xdp.extack = extack;
 	xdp.flags = flags;
 	xdp.prog = prog;
 
+	/* Drivers assume refcnt is already incremented (i.e., prog pointer is
+	 * "moved" into driver), so they don't increment it on their own, but
+	 * they do decrement refcnt when program is detached or replaced.
+	 * Given net_device also owns link/prog, we need to bump refcnt here
+	 * to prevent drivers from underflowing it.
+ */ + if (prog) + bpf_prog_inc(prog); err = bpf_op(dev, &xdp); - if (!err && non_hw) - bpf_prog_change_xdp(prev_prog, prog); + if (err) { + if (prog) + bpf_prog_put(prog); + return err; + } - if (prev_prog) - bpf_prog_put(prev_prog); + if (mode != XDP_MODE_HW) + bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog); - return err; + return 0; } static void dev_xdp_uninstall(struct net_device *dev) { - struct netdev_bpf xdp; - bpf_op_t ndo_bpf; + struct bpf_xdp_link *link; + struct bpf_prog *prog; + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; - /* Remove generic XDP */ - WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL)); + ASSERT_RTNL(); - /* Remove from the driver */ - ndo_bpf = dev->netdev_ops->ndo_bpf; - if (!ndo_bpf) - return; + for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) { + prog = dev_xdp_prog(dev, mode); + if (!prog) + continue; - memset(&xdp, 0, sizeof(xdp)); - xdp.command = XDP_QUERY_PROG; - WARN_ON(ndo_bpf(dev, &xdp)); - if (xdp.prog_id) - WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, - NULL)); + bpf_op = dev_xdp_bpf_op(dev, mode); + if (!bpf_op) + continue; - /* Remove HW offload */ - memset(&xdp, 0, sizeof(xdp)); - xdp.command = XDP_QUERY_PROG_HW; - if (!ndo_bpf(dev, &xdp) && xdp.prog_id) - WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, - NULL)); + WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); + + /* auto-detach link from net device */ + link = dev_xdp_link(dev, mode); + if (link) + link->dev = NULL; + else + bpf_prog_put(prog); + + dev_xdp_set_link(dev, mode, NULL); + } } -/** - * dev_change_xdp_fd - set or clear a bpf program for a device rx path - * @dev: device - * @extack: netlink extended ack - * @fd: new program fd or negative value to clear - * @expected_fd: old program fd that userspace expects to replace or clear - * @flags: xdp-related flags - * - * Set or clear a bpf program for a device - */ -int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, - int fd, int expected_fd, u32 flags) +static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack, + struct bpf_xdp_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog, u32 flags) { - const struct net_device_ops *ops = dev->netdev_ops; - enum bpf_netdev_command query; - u32 prog_id, expected_id = 0; - bpf_op_t bpf_op, bpf_chk; - struct bpf_prog *prog; - bool offload; + struct bpf_prog *cur_prog; + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; int err; ASSERT_RTNL(); - offload = flags & XDP_FLAGS_HW_MODE; - query = offload ? 
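/* Illustrative aside (not part of the patch): summary of the per-mode
 * XDP state introduced above. Each netdev now tracks up to three
 * independent attachments, selected from the request flags by
 * dev_xdp_mode():
 *
 *	XDP_FLAGS_SKB_MODE (or no mode flag) -> XDP_MODE_SKB (generic)
 *	XDP_FLAGS_DRV_MODE                   -> XDP_MODE_DRV (native)
 *	XDP_FLAGS_HW_MODE                    -> XDP_MODE_HW  (offload)
 *
 * dev->xdp_state[mode] holds either a bpf_link or a raw prog pointer,
 * never both. SKB and DRV modes remain mutually exclusive, while HW
 * offload can coexist with either of them.
 */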
XDP_QUERY_PROG_HW : XDP_QUERY_PROG; + /* either link or prog attachment, never both */ + if (link && (new_prog || old_prog)) + return -EINVAL; + /* link supports only XDP mode flags */ + if (link && (flags & ~XDP_FLAGS_MODES)) { + NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); + return -EINVAL; + } + /* just one XDP mode bit should be set, zero defaults to SKB mode */ + if (hweight32(flags & XDP_FLAGS_MODES) > 1) { + NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); + return -EINVAL; + } + /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */ + if (old_prog && !(flags & XDP_FLAGS_REPLACE)) { + NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified"); + return -EINVAL; + } - bpf_op = bpf_chk = ops->ndo_bpf; - if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) { - NL_SET_ERR_MSG(extack, "underlying driver does not support XDP in native mode"); - return -EOPNOTSUPP; + mode = dev_xdp_mode(flags); + /* can't replace attached link */ + if (dev_xdp_link(dev, mode)) { + NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link"); + return -EBUSY; } - if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE)) - bpf_op = generic_xdp_install; - if (bpf_op == bpf_chk) - bpf_chk = generic_xdp_install; - - prog_id = __dev_xdp_query(dev, bpf_op, query); - if (flags & XDP_FLAGS_REPLACE) { - if (expected_fd >= 0) { - prog = bpf_prog_get_type_dev(expected_fd, - BPF_PROG_TYPE_XDP, - bpf_op == ops->ndo_bpf); - if (IS_ERR(prog)) - return PTR_ERR(prog); - expected_id = prog->aux->id; - bpf_prog_put(prog); - } - if (prog_id != expected_id) { - NL_SET_ERR_MSG(extack, "Active program does not match expected"); - return -EEXIST; - } + cur_prog = dev_xdp_prog(dev, mode); + /* can't replace attached prog with link */ + if (link && cur_prog) { + NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link"); + return -EBUSY; + } + if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) { + NL_SET_ERR_MSG(extack, "Active program does not match expected"); + return -EEXIST; + } + if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) { + NL_SET_ERR_MSG(extack, "XDP program already attached"); + return -EBUSY; } - if (fd >= 0) { - if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) { - NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time"); - return -EEXIST; - } - if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && prog_id) { - NL_SET_ERR_MSG(extack, "XDP program already attached"); - return -EBUSY; - } + /* put effective new program into new_prog */ + if (link) + new_prog = link->link.prog; - prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, - bpf_op == ops->ndo_bpf); - if (IS_ERR(prog)) - return PTR_ERR(prog); + if (new_prog) { + bool offload = mode == XDP_MODE_HW; + enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB + ? 
XDP_MODE_DRV : XDP_MODE_SKB; - if (!offload && bpf_prog_is_dev_bound(prog->aux)) { - NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported"); - bpf_prog_put(prog); + if (!offload && dev_xdp_prog(dev, other_mode)) { + NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); + return -EEXIST; + } + if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) { + NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported"); return -EINVAL; } - - if (prog->expected_attach_type == BPF_XDP_DEVMAP) { + if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); - bpf_prog_put(prog); return -EINVAL; } + if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) { + NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device"); + return -EINVAL; + } + } - /* prog->aux->id may be 0 for orphaned device-bound progs */ - if (prog->aux->id && prog->aux->id == prog_id) { - bpf_prog_put(prog); - return 0; + /* don't call drivers if the effective program didn't change */ + if (new_prog != cur_prog) { + bpf_op = dev_xdp_bpf_op(dev, mode); + if (!bpf_op) { + NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode"); + return -EOPNOTSUPP; + } + + err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog); + if (err) + return err; + } + + if (link) + dev_xdp_set_link(dev, mode, link); + else + dev_xdp_set_prog(dev, mode, new_prog); + if (cur_prog) + bpf_prog_put(cur_prog); + + return 0; +} + +static int dev_xdp_attach_link(struct net_device *dev, + struct netlink_ext_ack *extack, + struct bpf_xdp_link *link) +{ + return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags); +} + +static int dev_xdp_detach_link(struct net_device *dev, + struct netlink_ext_ack *extack, + struct bpf_xdp_link *link) +{ + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; + + ASSERT_RTNL(); + + mode = dev_xdp_mode(link->flags); + if (dev_xdp_link(dev, mode) != link) + return -EINVAL; + + bpf_op = dev_xdp_bpf_op(dev, mode); + WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); + dev_xdp_set_link(dev, mode, NULL); + return 0; +} + +static void bpf_xdp_link_release(struct bpf_link *link) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + + rtnl_lock(); + + /* if racing with net_device's tear down, xdp_link->dev might be + * already NULL, in which case link was already auto-detached + */ + if (xdp_link->dev) { + WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); + xdp_link->dev = NULL; + } + + rtnl_unlock(); +} + +static int bpf_xdp_link_detach(struct bpf_link *link) +{ + bpf_xdp_link_release(link); + return 0; +} + +static void bpf_xdp_link_dealloc(struct bpf_link *link) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + + kfree(xdp_link); +} + +static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + u32 ifindex = 0; + + rtnl_lock(); + if (xdp_link->dev) + ifindex = xdp_link->dev->ifindex; + rtnl_unlock(); + + seq_printf(seq, "ifindex:\t%u\n", ifindex); +} + +static int bpf_xdp_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + u32 ifindex = 0; + + rtnl_lock(); + if (xdp_link->dev) + ifindex = xdp_link->dev->ifindex; + 
rtnl_unlock(); + + info->xdp.ifindex = ifindex; + return 0; +} + +static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; + int err = 0; + + rtnl_lock(); + + /* link might have been auto-released already, so fail */ + if (!xdp_link->dev) { + err = -ENOLINK; + goto out_unlock; + } + + if (old_prog && link->prog != old_prog) { + err = -EPERM; + goto out_unlock; + } + old_prog = link->prog; + if (old_prog == new_prog) { + /* no-op, don't disturb drivers */ + bpf_prog_put(new_prog); + goto out_unlock; + } + + mode = dev_xdp_mode(xdp_link->flags); + bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode); + err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL, + xdp_link->flags, new_prog); + if (err) + goto out_unlock; + + old_prog = xchg(&link->prog, new_prog); + bpf_prog_put(old_prog); + +out_unlock: + rtnl_unlock(); + return err; +} + +static const struct bpf_link_ops bpf_xdp_link_lops = { + .release = bpf_xdp_link_release, + .dealloc = bpf_xdp_link_dealloc, + .detach = bpf_xdp_link_detach, + .show_fdinfo = bpf_xdp_link_show_fdinfo, + .fill_link_info = bpf_xdp_link_fill_link_info, + .update_prog = bpf_xdp_link_update, +}; + +int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_link_primer link_primer; + struct bpf_xdp_link *link; + struct net_device *dev; + int err, fd; + + dev = dev_get_by_index(net, attr->link_create.target_ifindex); + if (!dev) + return -EINVAL; + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto out_put_dev; + } + + bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog); + link->dev = dev; + link->flags = attr->link_create.flags; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + goto out_put_dev; + } + + rtnl_lock(); + err = dev_xdp_attach_link(dev, NULL, link); + rtnl_unlock(); + + if (err) { + bpf_link_cleanup(&link_primer); + goto out_put_dev; + } + + fd = bpf_link_settle(&link_primer); + /* link itself doesn't hold dev's refcnt to not complicate shutdown */ + dev_put(dev); + return fd; + +out_put_dev: + dev_put(dev); + return err; +} + +/** + * dev_change_xdp_fd - set or clear a bpf program for a device rx path + * @dev: device + * @extack: netlink extended ack + * @fd: new program fd or negative value to clear + * @expected_fd: old program fd that userspace expects to replace or clear + * @flags: xdp-related flags + * + * Set or clear a bpf program for a device + */ +int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, + int fd, int expected_fd, u32 flags) +{ + enum bpf_xdp_mode mode = dev_xdp_mode(flags); + struct bpf_prog *new_prog = NULL, *old_prog = NULL; + int err; + + ASSERT_RTNL(); + + if (fd >= 0) { + new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, + mode != XDP_MODE_SKB); + if (IS_ERR(new_prog)) + return PTR_ERR(new_prog); + } + + if (expected_fd >= 0) { + old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP, + mode != XDP_MODE_SKB); + if (IS_ERR(old_prog)) { + err = PTR_ERR(old_prog); + old_prog = NULL; + goto err_out; } - } else { - if (!prog_id) - return 0; - prog = NULL; } - err = dev_xdp_install(dev, bpf_op, extack, flags, prog); - if (err < 0 && prog) - bpf_prog_put(prog); + err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags); +err_out: + if (err && new_prog) 
+ bpf_prog_put(new_prog); + if (old_prog) + bpf_prog_put(old_prog); return err; } diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 547b587c1950..b2cf9b7bb7b8 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -5,6 +5,7 @@ #include <linux/rtnetlink.h> #include <linux/net_tstamp.h> #include <linux/wireless.h> +#include <net/dsa.h> #include <net/wext.h> /* @@ -225,6 +226,26 @@ static int net_hwtstamp_validate(struct ifreq *ifr) return 0; } +static int dev_do_ioctl(struct net_device *dev, + struct ifreq *ifr, unsigned int cmd) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int err = -EOPNOTSUPP; + + err = dsa_ndo_do_ioctl(dev, ifr, cmd); + if (err == 0 || err != -EOPNOTSUPP) + return err; + + if (ops->ndo_do_ioctl) { + if (netif_device_present(dev)) + err = ops->ndo_do_ioctl(dev, ifr, cmd); + else + err = -ENODEV; + } + + return err; +} + /* * Perform the SIOCxIFxxx calls, inside rtnl_lock() */ @@ -323,13 +344,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) cmd == SIOCSHWTSTAMP || cmd == SIOCGHWTSTAMP || cmd == SIOCWANDEV) { - err = -EOPNOTSUPP; - if (ops->ndo_do_ioctl) { - if (netif_device_present(dev)) - err = ops->ndo_do_ioctl(dev, ifr, cmd); - else - err = -ENODEV; - } + err = dev_do_ioctl(dev, ifr, cmd); } else err = -EINVAL; diff --git a/net/core/devlink.c b/net/core/devlink.c index 47f14a2f25fb..e674f0f46dc2 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -85,6 +85,10 @@ EXPORT_SYMBOL(devlink_dpipe_header_ipv6); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr); +static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = { + [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY }, +}; + static LIST_HEAD(devlink_list); /* devlink_mutex @@ -382,19 +386,19 @@ devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) return NULL; } -#define DEVLINK_NL_FLAG_NEED_DEVLINK BIT(0) -#define DEVLINK_NL_FLAG_NEED_PORT BIT(1) -#define DEVLINK_NL_FLAG_NEED_SB BIT(2) +#define DEVLINK_NL_FLAG_NEED_PORT BIT(0) +#define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1) /* The per devlink instance lock is taken by default in the pre-doit * operation, yet several commands do not require this. The global * devlink lock is taken and protects from disruption by user-calls. 
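 * Commands tagged with DEVLINK_NL_FLAG_NO_LOCK therefore skip only the
 * per devlink instance lock; devlink_mutex itself is still taken before
 * this point and dropped again in devlink_nl_post_doit().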
*/ -#define DEVLINK_NL_FLAG_NO_LOCK BIT(3) +#define DEVLINK_NL_FLAG_NO_LOCK BIT(2) static int devlink_nl_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { + struct devlink_port *devlink_port; struct devlink *devlink; int err; @@ -406,27 +410,18 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops, } if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) mutex_lock(&devlink->lock); - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) { - info->user_ptr[0] = devlink; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) { - struct devlink_port *devlink_port; - + info->user_ptr[0] = devlink; + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) { devlink_port = devlink_port_get_from_info(devlink, info); if (IS_ERR(devlink_port)) { err = PTR_ERR(devlink_port); goto unlock; } - info->user_ptr[0] = devlink_port; - } - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_SB) { - struct devlink_sb *devlink_sb; - - devlink_sb = devlink_sb_get_from_info(devlink, info); - if (IS_ERR(devlink_sb)) { - err = PTR_ERR(devlink_sb); - goto unlock; - } - info->user_ptr[1] = devlink_sb; + info->user_ptr[1] = devlink_port; + } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT) { + devlink_port = devlink_port_get_from_info(devlink, info); + if (!IS_ERR(devlink_port)) + info->user_ptr[1] = devlink_port; } return 0; @@ -442,16 +437,8 @@ static void devlink_nl_post_doit(const struct genl_ops *ops, { struct devlink *devlink; - /* When devlink changes netns, it would not be found - * by devlink_get_from_info(). So try if it is stored first. - */ - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) { - devlink = info->user_ptr[0]; - } else { - devlink = devlink_get_from_info(info); - WARN_ON(IS_ERR(devlink)); - } - if (!IS_ERR(devlink) && ~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) + devlink = info->user_ptr[0]; + if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) mutex_unlock(&devlink->lock); mutex_unlock(&devlink_mutex); } @@ -524,8 +511,14 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, { struct devlink_port_attrs *attrs = &devlink_port->attrs; - if (!attrs->set) + if (!devlink_port->attrs_set) return 0; + if (attrs->lanes) { + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_LANES, attrs->lanes)) + return -EMSGSIZE; + } + if (nla_put_u8(msg, DEVLINK_ATTR_PORT_SPLITTABLE, attrs->splittable)) + return -EMSGSIZE; if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour)) return -EMSGSIZE; switch (devlink_port->attrs.flavour) { @@ -563,10 +556,54 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, return 0; } +static int +devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port, + struct netlink_ext_ack *extack) +{ + struct devlink *devlink = port->devlink; + const struct devlink_ops *ops; + struct nlattr *function_attr; + bool empty_nest = true; + int err = 0; + + function_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PORT_FUNCTION); + if (!function_attr) + return -EMSGSIZE; + + ops = devlink->ops; + if (ops->port_function_hw_addr_get) { + int hw_addr_len; + u8 hw_addr[MAX_ADDR_LEN]; + + err = ops->port_function_hw_addr_get(devlink, port, hw_addr, &hw_addr_len, extack); + if (err == -EOPNOTSUPP) { + /* Port function attributes are optional for a port. If the port doesn't + * support function attributes, returning -EOPNOTSUPP is not an error.
+ */ + err = 0; + goto out; + } else if (err) { + goto out; + } + err = nla_put(msg, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, hw_addr_len, hw_addr); + if (err) + goto out; + empty_nest = false; + } + +out: + if (err || empty_nest) + nla_nest_cancel(msg, function_attr); + else + nla_nest_end(msg, function_attr); + return err; +} + static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_port *devlink_port, enum devlink_command cmd, u32 portid, - u32 seq, int flags) + u32 seq, int flags, + struct netlink_ext_ack *extack) { void *hdr; @@ -607,6 +644,8 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, spin_unlock_bh(&devlink_port->type_lock); if (devlink_nl_port_attrs_put(msg, devlink_port)) goto nla_put_failure; + if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack)) + goto nla_put_failure; genlmsg_end(msg, hdr); return 0; @@ -634,7 +673,8 @@ static void devlink_port_notify(struct devlink_port *devlink_port, if (!msg) return; - err = devlink_nl_port_fill(msg, devlink, devlink_port, cmd, 0, 0, 0); + err = devlink_nl_port_fill(msg, devlink, devlink_port, cmd, 0, 0, 0, + NULL); if (err) { nlmsg_free(msg); return; @@ -697,7 +737,7 @@ out: static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = devlink_port->devlink; struct sk_buff *msg; int err; @@ -708,7 +748,8 @@ static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, err = devlink_nl_port_fill(msg, devlink, devlink_port, DEVLINK_CMD_PORT_NEW, - info->snd_portid, info->snd_seq, 0); + info->snd_portid, info->snd_seq, 0, + info->extack); if (err) { nlmsg_free(msg); return err; @@ -740,7 +781,8 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, DEVLINK_CMD_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI); + NLM_F_MULTI, + cb->extack); if (err) { mutex_unlock(&devlink->lock); goto out; @@ -778,10 +820,71 @@ static int devlink_port_type_set(struct devlink *devlink, return -EOPNOTSUPP; } +static int +devlink_port_function_hw_addr_set(struct devlink *devlink, struct devlink_port *port, + const struct nlattr *attr, struct netlink_ext_ack *extack) +{ + const struct devlink_ops *ops; + const u8 *hw_addr; + int hw_addr_len; + int err; + + hw_addr = nla_data(attr); + hw_addr_len = nla_len(attr); + if (hw_addr_len > MAX_ADDR_LEN) { + NL_SET_ERR_MSG_MOD(extack, "Port function hardware address too long"); + return -EINVAL; + } + if (port->type == DEVLINK_PORT_TYPE_ETH) { + if (hw_addr_len != ETH_ALEN) { + NL_SET_ERR_MSG_MOD(extack, "Address must be 6 bytes for Ethernet device"); + return -EINVAL; + } + if (!is_unicast_ether_addr(hw_addr)) { + NL_SET_ERR_MSG_MOD(extack, "Non-unicast hardware address unsupported"); + return -EINVAL; + } + } + + ops = devlink->ops; + if (!ops->port_function_hw_addr_set) { + NL_SET_ERR_MSG_MOD(extack, "Port doesn't support function attributes"); + return -EOPNOTSUPP; + } + + err = ops->port_function_hw_addr_set(devlink, port, hw_addr, hw_addr_len, extack); + if (err) + return err; + + devlink_port_notify(port, DEVLINK_CMD_PORT_NEW); + return 0; +} + +static int +devlink_port_function_set(struct devlink *devlink, struct devlink_port *port, + const struct nlattr *attr, struct netlink_ext_ack *extack) +{ + struct nlattr *tb[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1]; + int err; + + err = nla_parse_nested(tb, DEVLINK_PORT_FUNCTION_ATTR_MAX, attr, + 
devlink_function_nl_policy, extack); + if (err < 0) { + NL_SET_ERR_MSG_MOD(extack, "Failed to parse port function attributes"); + return err; + } + + attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR]; + if (attr) + err = devlink_port_function_hw_addr_set(devlink, port, attr, extack); + + return err; +} + static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = devlink_port->devlink; int err; @@ -793,6 +896,16 @@ static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb, if (err) return err; } + + if (info->attrs[DEVLINK_ATTR_PORT_FUNCTION]) { + struct nlattr *attr = info->attrs[DEVLINK_ATTR_PORT_FUNCTION]; + struct netlink_ext_ack *extack = info->extack; + + err = devlink_port_function_set(devlink, devlink_port, attr, extack); + if (err) + return err; + } + return 0; } @@ -810,6 +923,7 @@ static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; + struct devlink_port *devlink_port; u32 port_index; u32 count; @@ -817,8 +931,27 @@ static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb, !info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]) return -EINVAL; + devlink_port = devlink_port_get_from_info(devlink, info); port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]); + + if (IS_ERR(devlink_port)) + return -EINVAL; + + if (!devlink_port->attrs.splittable) { + /* Split ports cannot be split. */ + if (devlink_port->attrs.split) + NL_SET_ERR_MSG_MOD(info->extack, "Port cannot be split further"); + else + NL_SET_ERR_MSG_MOD(info->extack, "Port cannot be split"); + return -EINVAL; + } + + if (count < 2 || !is_power_of_2(count) || count > devlink_port->attrs.lanes) { + NL_SET_ERR_MSG_MOD(info->extack, "Invalid split count"); + return -EINVAL; + } + return devlink_port_split(devlink, port_index, count, info->extack); } @@ -886,10 +1019,14 @@ static int devlink_nl_cmd_sb_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; - struct devlink_sb *devlink_sb = info->user_ptr[1]; + struct devlink_sb *devlink_sb; struct sk_buff *msg; int err; + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; @@ -991,11 +1128,15 @@ static int devlink_nl_cmd_sb_pool_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; - struct devlink_sb *devlink_sb = info->user_ptr[1]; + struct devlink_sb *devlink_sb; struct sk_buff *msg; u16 pool_index; int err; + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); + err = devlink_sb_pool_index_get_from_info(devlink_sb, info, &pool_index); if (err) @@ -1102,12 +1243,16 @@ static int devlink_nl_cmd_sb_pool_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; - struct devlink_sb *devlink_sb = info->user_ptr[1]; enum devlink_sb_threshold_type threshold_type; + struct devlink_sb *devlink_sb; u16 pool_index; u32 size; int err; + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); + err = devlink_sb_pool_index_get_from_info(devlink_sb, info, &pool_index); if (err) @@ -1186,13 +1331,17 @@
nla_put_failure: static int devlink_nl_cmd_sb_port_pool_get_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = devlink_port->devlink; - struct devlink_sb *devlink_sb = info->user_ptr[1]; + struct devlink_sb *devlink_sb; struct sk_buff *msg; u16 pool_index; int err; + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); + err = devlink_sb_pool_index_get_from_info(devlink_sb, info, &pool_index); if (err) @@ -1304,12 +1453,17 @@ static int devlink_sb_port_pool_set(struct devlink_port *devlink_port, static int devlink_nl_cmd_sb_port_pool_set_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[0]; - struct devlink_sb *devlink_sb = info->user_ptr[1]; + struct devlink_port *devlink_port = info->user_ptr[1]; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_sb *devlink_sb; u16 pool_index; u32 threshold; int err; + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); + err = devlink_sb_pool_index_get_from_info(devlink_sb, info, &pool_index); if (err) @@ -1391,14 +1545,18 @@ nla_put_failure: static int devlink_nl_cmd_sb_tc_pool_bind_get_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = devlink_port->devlink; - struct devlink_sb *devlink_sb = info->user_ptr[1]; + struct devlink_sb *devlink_sb; struct sk_buff *msg; enum devlink_sb_pool_type pool_type; u16 tc_index; int err; + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); + err = devlink_sb_pool_type_get_from_info(info, &pool_type); if (err) return err; @@ -1540,14 +1698,19 @@ static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port, static int devlink_nl_cmd_sb_tc_pool_bind_set_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[0]; - struct devlink_sb *devlink_sb = info->user_ptr[1]; + struct devlink_port *devlink_port = info->user_ptr[1]; + struct devlink *devlink = info->user_ptr[0]; enum devlink_sb_pool_type pool_type; + struct devlink_sb *devlink_sb; u16 tc_index; u16 pool_index; u32 threshold; int err; + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); + err = devlink_sb_pool_type_get_from_info(info, &pool_type); if (err) return err; @@ -1575,8 +1738,12 @@ static int devlink_nl_cmd_sb_occ_snapshot_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; - struct devlink_sb *devlink_sb = info->user_ptr[1]; const struct devlink_ops *ops = devlink->ops; + struct devlink_sb *devlink_sb; + + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); if (ops->sb_occ_snapshot) return ops->sb_occ_snapshot(devlink, devlink_sb->index); @@ -1587,8 +1754,12 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; - struct devlink_sb *devlink_sb = info->user_ptr[1]; const struct devlink_ops *ops = devlink->ops; + struct devlink_sb *devlink_sb; + + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return 
PTR_ERR(devlink_sb); if (ops->sb_occ_max_clear) return ops->sb_occ_max_clear(devlink, devlink_sb->index); @@ -2772,7 +2943,7 @@ static void devlink_reload_netns_change(struct devlink *devlink, DEVLINK_CMD_PARAM_NEW); } -static bool devlink_reload_supported(struct devlink *devlink) +static bool devlink_reload_supported(const struct devlink *devlink) { return devlink->ops->reload_down && devlink->ops->reload_up; } @@ -2818,7 +2989,7 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) struct net *dest_net = NULL; int err; - if (!devlink_reload_supported(devlink) || !devlink->reload_enabled) + if (!devlink_reload_supported(devlink)) return -EOPNOTSUPP; err = devlink_resources_validate(devlink, NULL, info); @@ -4388,6 +4559,14 @@ int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn) } EXPORT_SYMBOL_GPL(devlink_info_serial_number_put); +int devlink_info_board_serial_number_put(struct devlink_info_req *req, + const char *bsn) +{ + return nla_put_string(req->msg, DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER, + bsn); +} +EXPORT_SYMBOL_GPL(devlink_info_board_serial_number_put); + static int devlink_info_version_put(struct devlink_info_req *req, int attr, const char *version_name, const char *version_value) @@ -5141,6 +5320,7 @@ struct devlink_health_reporter { void *priv; const struct devlink_health_reporter_ops *ops; struct devlink *devlink; + struct devlink_port *devlink_port; struct devlink_fmsg *dump_fmsg; struct mutex dump_lock; /* lock parallel read/write from dump buffers */ u64 graceful_period; @@ -5163,18 +5343,98 @@ devlink_health_reporter_priv(struct devlink_health_reporter *reporter) EXPORT_SYMBOL_GPL(devlink_health_reporter_priv); static struct devlink_health_reporter * -devlink_health_reporter_find_by_name(struct devlink *devlink, - const char *reporter_name) +__devlink_health_reporter_find_by_name(struct list_head *reporter_list, + struct mutex *list_lock, + const char *reporter_name) { struct devlink_health_reporter *reporter; - lockdep_assert_held(&devlink->reporters_lock); - list_for_each_entry(reporter, &devlink->reporter_list, list) + lockdep_assert_held(list_lock); + list_for_each_entry(reporter, reporter_list, list) if (!strcmp(reporter->ops->name, reporter_name)) return reporter; return NULL; } +static struct devlink_health_reporter * +devlink_health_reporter_find_by_name(struct devlink *devlink, + const char *reporter_name) +{ + return __devlink_health_reporter_find_by_name(&devlink->reporter_list, + &devlink->reporters_lock, + reporter_name); +} + +static struct devlink_health_reporter * +devlink_port_health_reporter_find_by_name(struct devlink_port *devlink_port, + const char *reporter_name) +{ + return __devlink_health_reporter_find_by_name(&devlink_port->reporter_list, + &devlink_port->reporters_lock, + reporter_name); +} + +static struct devlink_health_reporter * +__devlink_health_reporter_create(struct devlink *devlink, + const struct devlink_health_reporter_ops *ops, + u64 graceful_period, void *priv) +{ + struct devlink_health_reporter *reporter; + + if (WARN_ON(graceful_period && !ops->recover)) + return ERR_PTR(-EINVAL); + + reporter = kzalloc(sizeof(*reporter), GFP_KERNEL); + if (!reporter) + return ERR_PTR(-ENOMEM); + + reporter->priv = priv; + reporter->ops = ops; + reporter->devlink = devlink; + reporter->graceful_period = graceful_period; + reporter->auto_recover = !!ops->recover; + reporter->auto_dump = !!ops->dump; + mutex_init(&reporter->dump_lock); + refcount_set(&reporter->refcount, 1); + return reporter; +} 
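The port-scoped reporter API introduced just below (devlink_port_health_reporter_create() and its matching destroy helper further down) mirrors the existing per-devlink API but anchors the reporter on a devlink_port. A minimal driver-side sketch of the intended usage follows; struct foo_port, foo_rx_dump() and the rx_drops counter are hypothetical placeholders, only the devlink_* calls come from this patch and the existing devlink core:

#include <linux/err.h>
#include <net/devlink.h>

/* Hypothetical driver state hanging off one port. */
struct foo_port {
	struct devlink_port dl_port;
	struct devlink_health_reporter *rx_reporter;
	u32 rx_drops;
};

static int foo_rx_dump(struct devlink_health_reporter *reporter,
		       struct devlink_fmsg *fmsg, void *priv_ctx,
		       struct netlink_ext_ack *extack)
{
	struct foo_port *fport = devlink_health_reporter_priv(reporter);

	/* Emit one counter into the formatted dump message. */
	return devlink_fmsg_u32_pair_put(fmsg, "rx_drops", fport->rx_drops);
}

static const struct devlink_health_reporter_ops foo_rx_reporter_ops = {
	.name	= "rx",
	.dump	= foo_rx_dump,
	/* no .recover op, so graceful_period below must stay 0 or
	 * __devlink_health_reporter_create() WARNs and fails
	 */
};

static int foo_port_reporter_init(struct foo_port *fport)
{
	fport->rx_reporter =
		devlink_port_health_reporter_create(&fport->dl_port,
						    &foo_rx_reporter_ops,
						    0, fport);
	return PTR_ERR_OR_ZERO(fport->rx_reporter);
}

static void foo_port_reporter_fini(struct foo_port *fport)
{
	/* Must run before devlink_port_unregister(), which now WARNs
	 * if the port's reporter_list is not empty.
	 */
	devlink_port_health_reporter_destroy(fport->rx_reporter);
}

Tying the reporter to the port makes teardown ordering the driver's responsibility, hence the WARN_ON added to devlink_port_unregister() later in this patch.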
+ +/** + * devlink_port_health_reporter_create - create devlink health reporter for + * specified port instance + * + * @port: devlink_port which should contain the new reporter + * @ops: devlink health reporter ops + * @graceful_period: minimum time between recovery attempts, in msecs, to avoid recovery loops + * @priv: driver private data + */ +struct devlink_health_reporter * +devlink_port_health_reporter_create(struct devlink_port *port, + const struct devlink_health_reporter_ops *ops, + u64 graceful_period, void *priv) +{ + struct devlink_health_reporter *reporter; + + mutex_lock(&port->reporters_lock); + if (__devlink_health_reporter_find_by_name(&port->reporter_list, + &port->reporters_lock, ops->name)) { + reporter = ERR_PTR(-EEXIST); + goto unlock; + } + + reporter = __devlink_health_reporter_create(port->devlink, ops, + graceful_period, priv); + if (IS_ERR(reporter)) + goto unlock; + + reporter->devlink_port = port; + list_add_tail(&reporter->list, &port->reporter_list); +unlock: + mutex_unlock(&port->reporters_lock); + return reporter; +} +EXPORT_SYMBOL_GPL(devlink_port_health_reporter_create); + /** * devlink_health_reporter_create - create devlink health reporter * @@ -5196,25 +5456,11 @@ devlink_health_reporter_create(struct devlink *devlink, goto unlock; } - if (WARN_ON(graceful_period && !ops->recover)) { - reporter = ERR_PTR(-EINVAL); - goto unlock; - } - - reporter = kzalloc(sizeof(*reporter), GFP_KERNEL); - if (!reporter) { - reporter = ERR_PTR(-ENOMEM); + reporter = __devlink_health_reporter_create(devlink, ops, + graceful_period, priv); + if (IS_ERR(reporter)) goto unlock; - } - reporter->priv = priv; - reporter->ops = ops; - reporter->devlink = devlink; - reporter->graceful_period = graceful_period; - reporter->auto_recover = !!ops->recover; - reporter->auto_dump = !!ops->dump; - mutex_init(&reporter->dump_lock); - refcount_set(&reporter->refcount, 1); list_add_tail(&reporter->list, &devlink->reporter_list); unlock: mutex_unlock(&devlink->reporters_lock); @@ -5222,6 +5468,29 @@ unlock: } EXPORT_SYMBOL_GPL(devlink_health_reporter_create); +static void +devlink_health_reporter_free(struct devlink_health_reporter *reporter) +{ + mutex_destroy(&reporter->dump_lock); + if (reporter->dump_fmsg) + devlink_fmsg_free(reporter->dump_fmsg); + kfree(reporter); +} + +static void +devlink_health_reporter_put(struct devlink_health_reporter *reporter) +{ + if (refcount_dec_and_test(&reporter->refcount)) + devlink_health_reporter_free(reporter); +} + +static void +__devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) +{ + list_del(&reporter->list); + devlink_health_reporter_put(reporter); +} + /** * devlink_health_reporter_destroy - destroy devlink health reporter * @@ -5230,18 +5499,30 @@ EXPORT_SYMBOL_GPL(devlink_health_reporter_create); void devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) { - mutex_lock(&reporter->devlink->reporters_lock); - list_del(&reporter->list); - mutex_unlock(&reporter->devlink->reporters_lock); - while (refcount_read(&reporter->refcount) > 1) - msleep(100); - mutex_destroy(&reporter->dump_lock); - if (reporter->dump_fmsg) - devlink_fmsg_free(reporter->dump_fmsg); - kfree(reporter); + struct mutex *lock = &reporter->devlink->reporters_lock; + + mutex_lock(lock); + __devlink_health_reporter_destroy(reporter); + mutex_unlock(lock); } EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy); +/** + * devlink_port_health_reporter_destroy - destroy devlink port health reporter + * + * @reporter: devlink health reporter to destroy + */ +void +devlink_port_health_reporter_destroy(struct
devlink_health_reporter *reporter) +{ + struct mutex *lock = &reporter->devlink_port->reporters_lock; + + mutex_lock(lock); + __devlink_health_reporter_destroy(reporter); + mutex_unlock(lock); +} +EXPORT_SYMBOL_GPL(devlink_port_health_reporter_destroy); + static int devlink_nl_health_reporter_fill(struct sk_buff *msg, struct devlink *devlink, @@ -5259,6 +5540,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg, if (devlink_nl_put_handle(msg, devlink)) goto genlmsg_cancel; + if (reporter->devlink_port) { + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, reporter->devlink_port->index)) + goto genlmsg_cancel; + } reporter_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_HEALTH_REPORTER); if (!reporter_attr) @@ -5466,17 +5751,28 @@ devlink_health_reporter_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) { struct devlink_health_reporter *reporter; + struct devlink_port *devlink_port; char *reporter_name; if (!attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]) return NULL; reporter_name = nla_data(attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]); - mutex_lock(&devlink->reporters_lock); - reporter = devlink_health_reporter_find_by_name(devlink, reporter_name); - if (reporter) - refcount_inc(&reporter->refcount); - mutex_unlock(&devlink->reporters_lock); + devlink_port = devlink_port_get_from_attrs(devlink, attrs); + if (IS_ERR(devlink_port)) { + mutex_lock(&devlink->reporters_lock); + reporter = devlink_health_reporter_find_by_name(devlink, reporter_name); + if (reporter) + refcount_inc(&reporter->refcount); + mutex_unlock(&devlink->reporters_lock); + } else { + mutex_lock(&devlink_port->reporters_lock); + reporter = devlink_port_health_reporter_find_by_name(devlink_port, reporter_name); + if (reporter) + refcount_inc(&reporter->refcount); + mutex_unlock(&devlink_port->reporters_lock); + } + return reporter; } @@ -5508,12 +5804,6 @@ unlock: return NULL; } -static void -devlink_health_reporter_put(struct devlink_health_reporter *reporter) -{ - refcount_dec(&reporter->refcount); -} - void devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, enum devlink_health_reporter_state state) @@ -5570,6 +5860,7 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { struct devlink_health_reporter *reporter; + struct devlink_port *port; struct devlink *devlink; int start = cb->args[0]; int idx = 0; @@ -5600,6 +5891,31 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, } mutex_unlock(&devlink->reporters_lock); } + + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + list_for_each_entry(port, &devlink->port_list, list) { + mutex_lock(&port->reporters_lock); + list_for_each_entry(reporter, &port->reporter_list, list) { + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_health_reporter_fill(msg, devlink, reporter, + DEVLINK_CMD_HEALTH_REPORTER_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + mutex_unlock(&port->reporters_lock); + goto out; + } + idx++; + } + mutex_unlock(&port->reporters_lock); + } + } out: mutex_unlock(&devlink_mutex); @@ -6107,7 +6423,7 @@ static int __devlink_trap_action_set(struct devlink *devlink, } err = devlink->ops->trap_action_set(devlink, trap_item->trap, - trap_action); + trap_action, extack); if (err) return err; @@ -6397,7 +6713,8 @@ static int devlink_trap_group_set(struct devlink *devlink, } policer = policer_item ? 
policer_item->policer : NULL; - err = devlink->ops->trap_group_set(devlink, group_item->group, policer); + err = devlink->ops->trap_group_set(devlink, group_item->group, policer, + extack); if (err) return err; @@ -6721,6 +7038,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_TRAP_POLICER_ID] = { .type = NLA_U32 }, [DEVLINK_ATTR_TRAP_POLICER_RATE] = { .type = NLA_U64 }, [DEVLINK_ATTR_TRAP_POLICER_BURST] = { .type = NLA_U64 }, + [DEVLINK_ATTR_PORT_FUNCTION] = { .type = NLA_NESTED }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -6729,7 +7047,6 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_get_doit, .dumpit = devlink_nl_cmd_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, { @@ -6752,24 +7069,20 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_port_split_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NO_LOCK, + .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_PORT_UNSPLIT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_port_unsplit_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NO_LOCK, + .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_SB_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_get_doit, .dumpit = devlink_nl_cmd_sb_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NEED_SB, /* can be retrieved by unprivileged users */ }, { @@ -6777,8 +7090,6 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_pool_get_doit, .dumpit = devlink_nl_cmd_sb_pool_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NEED_SB, /* can be retrieved by unprivileged users */ }, { @@ -6786,16 +7097,13 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_pool_set_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NEED_SB, }, { .cmd = DEVLINK_CMD_SB_PORT_POOL_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_port_pool_get_doit, .dumpit = devlink_nl_cmd_sb_port_pool_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT | - DEVLINK_NL_FLAG_NEED_SB, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, /* can be retrieved by unprivileged users */ }, { @@ -6803,16 +7111,14 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_port_pool_set_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT | - DEVLINK_NL_FLAG_NEED_SB, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, }, { .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_tc_pool_bind_get_doit, .dumpit = devlink_nl_cmd_sb_tc_pool_bind_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT | - DEVLINK_NL_FLAG_NEED_SB, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, /* can be retrieved by unprivileged users */ }, { @@ -6820,60 
+7126,50 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_tc_pool_bind_set_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT | - DEVLINK_NL_FLAG_NEED_SB, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, }, { .cmd = DEVLINK_CMD_SB_OCC_SNAPSHOT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_occ_snapshot_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NEED_SB, }, { .cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_occ_max_clear_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NEED_SB, }, { .cmd = DEVLINK_CMD_ESWITCH_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_eswitch_get_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NO_LOCK, + .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_ESWITCH_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_eswitch_set_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NO_LOCK, + .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_DPIPE_TABLE_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_dpipe_table_get, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_DPIPE_ENTRIES_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_dpipe_entries_get, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_DPIPE_HEADERS_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_dpipe_headers_get, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, { @@ -6881,20 +7177,17 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_dpipe_table_counters_set, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { .cmd = DEVLINK_CMD_RESOURCE_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_resource_set, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { .cmd = DEVLINK_CMD_RESOURCE_DUMP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_resource_dump, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, { @@ -6902,15 +7195,13 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_reload, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NO_LOCK, + .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_PARAM_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_param_get_doit, .dumpit = devlink_nl_cmd_param_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, { @@ -6918,7 +7209,6 @@ static const struct genl_ops devlink_nl_ops[] = { 
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_param_set_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { .cmd = DEVLINK_CMD_PORT_PARAM_GET, @@ -6941,21 +7231,18 @@ static const struct genl_ops devlink_nl_ops[] = { .doit = devlink_nl_cmd_region_get_doit, .dumpit = devlink_nl_cmd_region_get_dumpit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { .cmd = DEVLINK_CMD_REGION_NEW, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_region_new, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { .cmd = DEVLINK_CMD_REGION_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_region_del, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { .cmd = DEVLINK_CMD_REGION_READ, @@ -6963,14 +7250,12 @@ static const struct genl_ops devlink_nl_ops[] = { GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = devlink_nl_cmd_region_read_dumpit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { .cmd = DEVLINK_CMD_INFO_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_info_get_doit, .dumpit = devlink_nl_cmd_info_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, { @@ -6978,7 +7263,7 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_health_reporter_get_doit, .dumpit = devlink_nl_cmd_health_reporter_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT | DEVLINK_NL_FLAG_NO_LOCK, /* can be retrieved by unprivileged users */ }, @@ -6987,7 +7272,7 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_health_reporter_set_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT | DEVLINK_NL_FLAG_NO_LOCK, }, { @@ -6995,7 +7280,7 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_health_reporter_recover_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT | DEVLINK_NL_FLAG_NO_LOCK, }, { @@ -7003,7 +7288,7 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_health_reporter_diagnose_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT | DEVLINK_NL_FLAG_NO_LOCK, }, { @@ -7012,7 +7297,7 @@ static const struct genl_ops devlink_nl_ops[] = { GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = devlink_nl_cmd_health_reporter_dump_get_dumpit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT | DEVLINK_NL_FLAG_NO_LOCK, }, { @@ -7020,7 +7305,7 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_health_reporter_dump_clear_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + .internal_flags = 
DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT | DEVLINK_NL_FLAG_NO_LOCK, }, { @@ -7028,46 +7313,39 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_flash_update, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { .cmd = DEVLINK_CMD_TRAP_GET, .doit = devlink_nl_cmd_trap_get_doit, .dumpit = devlink_nl_cmd_trap_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_TRAP_SET, .doit = devlink_nl_cmd_trap_set_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { .cmd = DEVLINK_CMD_TRAP_GROUP_GET, .doit = devlink_nl_cmd_trap_group_get_doit, .dumpit = devlink_nl_cmd_trap_group_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_TRAP_GROUP_SET, .doit = devlink_nl_cmd_trap_group_set_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { .cmd = DEVLINK_CMD_TRAP_POLICER_GET, .doit = devlink_nl_cmd_trap_policer_get_doit, .dumpit = devlink_nl_cmd_trap_policer_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_TRAP_POLICER_SET, .doit = devlink_nl_cmd_trap_policer_set_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, }; @@ -7132,9 +7410,9 @@ EXPORT_SYMBOL_GPL(devlink_alloc); */ int devlink_register(struct devlink *devlink, struct device *dev) { - mutex_lock(&devlink_mutex); devlink->dev = dev; devlink->registered = true; + mutex_lock(&devlink_mutex); list_add_tail(&devlink->list, &devlink_list); devlink_notify(devlink, DEVLINK_CMD_NEW); mutex_unlock(&devlink_mutex); @@ -7280,6 +7558,8 @@ int devlink_port_register(struct devlink *devlink, list_add_tail(&devlink_port->list, &devlink->port_list); INIT_LIST_HEAD(&devlink_port->param_list); mutex_unlock(&devlink->lock); + INIT_LIST_HEAD(&devlink_port->reporter_list); + mutex_init(&devlink_port->reporters_lock); INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); devlink_port_type_warn_schedule(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); @@ -7296,6 +7576,8 @@ void devlink_port_unregister(struct devlink_port *devlink_port) { struct devlink *devlink = devlink_port->devlink; + WARN_ON(!list_empty(&devlink_port->reporter_list)); + mutex_destroy(&devlink_port->reporters_lock); devlink_port_type_warn_cancel(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); mutex_lock(&devlink->lock); @@ -7389,24 +7671,20 @@ void devlink_port_type_clear(struct devlink_port *devlink_port) EXPORT_SYMBOL_GPL(devlink_port_type_clear); static int __devlink_port_attrs_set(struct devlink_port *devlink_port, - enum devlink_port_flavour flavour, - const unsigned char *switch_id, - unsigned char switch_id_len) + enum devlink_port_flavour flavour) { struct devlink_port_attrs *attrs = &devlink_port->attrs; if (WARN_ON(devlink_port->registered)) return -EEXIST; - attrs->set = true; + devlink_port->attrs_set = true; attrs->flavour = flavour; - if (switch_id) { - attrs->switch_port = true; - if (WARN_ON(switch_id_len > MAX_PHYS_ITEM_ID_LEN)) - switch_id_len = MAX_PHYS_ITEM_ID_LEN; - memcpy(attrs->switch_id.id, switch_id, switch_id_len); - attrs->switch_id.id_len = switch_id_len; + if (attrs->switch_id.id_len) { + devlink_port->switch_port = true; + if (WARN_ON(attrs->switch_id.id_len > 
MAX_PHYS_ITEM_ID_LEN)) + attrs->switch_id.id_len = MAX_PHYS_ITEM_ID_LEN; } else { - attrs->switch_port = false; + devlink_port->switch_port = false; } return 0; } @@ -7415,33 +7693,18 @@ static int __devlink_port_attrs_set(struct devlink_port *devlink_port, * devlink_port_attrs_set - Set port attributes * * @devlink_port: devlink port - * @flavour: flavour of the port - * @port_number: number of the port that is facing user, for example - * the front panel port number - * @split: indicates if this is split port - * @split_subport_number: if the port is split, this is the number - * of subport. - * @switch_id: if the port is part of switch, this is buffer with ID, - * otwerwise this is NULL - * @switch_id_len: length of the switch_id buffer + * @attrs: devlink port attrs */ void devlink_port_attrs_set(struct devlink_port *devlink_port, - enum devlink_port_flavour flavour, - u32 port_number, bool split, - u32 split_subport_number, - const unsigned char *switch_id, - unsigned char switch_id_len) + struct devlink_port_attrs *attrs) { - struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; - ret = __devlink_port_attrs_set(devlink_port, flavour, - switch_id, switch_id_len); + devlink_port->attrs = *attrs; + ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); if (ret) return; - attrs->split = split; - attrs->phys.port_number = port_number; - attrs->phys.split_subport_number = split_subport_number; + WARN_ON(attrs->splittable && attrs->split); } EXPORT_SYMBOL_GPL(devlink_port_attrs_set); @@ -7450,20 +7713,14 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_set); * * @devlink_port: devlink port * @pf: associated PF for the devlink port instance - * @switch_id: if the port is part of switch, this is buffer with ID, - * otherwise this is NULL - * @switch_id_len: length of the switch_id buffer */ -void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, - const unsigned char *switch_id, - unsigned char switch_id_len, u16 pf) +void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; ret = __devlink_port_attrs_set(devlink_port, - DEVLINK_PORT_FLAVOUR_PCI_PF, - switch_id, switch_id_len); + DEVLINK_PORT_FLAVOUR_PCI_PF); if (ret) return; @@ -7477,21 +7734,15 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); * @devlink_port: devlink port * @pf: associated PF for the devlink port instance * @vf: associated VF of a PF for the devlink port instance - * @switch_id: if the port is part of switch, this is buffer with ID, - * otherwise this is NULL - * @switch_id_len: length of the switch_id buffer */ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, - const unsigned char *switch_id, - unsigned char switch_id_len, u16 pf, u16 vf) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; ret = __devlink_port_attrs_set(devlink_port, - DEVLINK_PORT_FLAVOUR_PCI_VF, - switch_id, switch_id_len); + DEVLINK_PORT_FLAVOUR_PCI_VF); if (ret) return; attrs->pci_vf.pf = pf; @@ -7505,7 +7756,7 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, struct devlink_port_attrs *attrs = &devlink_port->attrs; int n = 0; - if (!attrs->set) + if (!devlink_port->attrs_set) return -EOPNOTSUPP; switch (attrs->flavour) { @@ -8551,6 +8802,7 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(PTP_GENERAL, CONTROL), DEVLINK_TRAP(FLOW_ACTION_SAMPLE, CONTROL), DEVLINK_TRAP(FLOW_ACTION_TRAP, CONTROL), + DEVLINK_TRAP(EARLY_DROP, DROP), }; #define 
DEVLINK_TRAP_GROUP(_id) \ @@ -8800,7 +9052,8 @@ static void devlink_trap_disable(struct devlink *devlink, if (WARN_ON_ONCE(!trap_item)) return; - devlink->ops->trap_action_set(devlink, trap, DEVLINK_TRAP_ACTION_DROP); + devlink->ops->trap_action_set(devlink, trap, DEVLINK_TRAP_ACTION_DROP, + NULL); trap_item->action = DEVLINK_TRAP_ACTION_DROP; } @@ -9341,7 +9594,7 @@ int devlink_compat_switch_id_get(struct net_device *dev, * any devlink lock as only permanent values are accessed. */ devlink_port = netdev_to_devlink_port(dev); - if (!devlink_port || !devlink_port->attrs.switch_port) + if (!devlink_port || !devlink_port->switch_port) return -EOPNOTSUPP; memcpy(ppid, &devlink_port->attrs.switch_id, sizeof(*ppid)); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index bd7eba9066f8..51678a528f85 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -14,6 +14,20 @@ #include <net/sock.h> #include <net/fib_rules.h> #include <net/ip_tunnels.h> +#include <linux/indirect_call_wrapper.h> + +#ifdef CONFIG_IPV6_MULTIPLE_TABLES +#ifdef CONFIG_IP_MULTIPLE_TABLES +#define INDIRECT_CALL_MT(f, f2, f1, ...) \ + INDIRECT_CALL_INET(f, f2, f1, __VA_ARGS__) +#else +#define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__) +#endif +#elif defined(CONFIG_IP_MULTIPLE_TABLES) +#define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__) +#else +#define INDIRECT_CALL_MT(f, f2, f1, ...) f(__VA_ARGS__) +#endif static const struct fib_kuid_range fib_kuid_range_unset = { KUIDT_INIT(0), @@ -267,7 +281,10 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, uid_gt(fl->flowi_uid, rule->uid_range.end)) goto out; - ret = ops->match(rule, fl, flags); + ret = INDIRECT_CALL_MT(ops->match, + fib6_rule_match, + fib4_rule_match, + rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? 
!ret : ret; } @@ -298,9 +315,15 @@ jumped: } else if (rule->action == FR_ACT_NOP) continue; else - err = ops->action(rule, fl, flags, arg); - - if (!err && ops->suppress && ops->suppress(rule, arg)) + err = INDIRECT_CALL_MT(ops->action, + fib6_rule_action, + fib4_rule_action, + rule, fl, flags, arg); + + if (!err && ops->suppress && INDIRECT_CALL_MT(ops->suppress, + fib6_rule_suppress, + fib4_rule_suppress, + rule, arg)) continue; if (err != -EAGAIN) { diff --git a/net/core/filter.c b/net/core/filter.c index 82e1b5b06167..7124f0fe6974 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -47,6 +47,7 @@ #include <linux/seccomp.h> #include <linux/if_vlan.h> #include <linux/bpf.h> +#include <linux/btf.h> #include <net/sch_generic.h> #include <net/cls_cgroup.h> #include <net/dst_metadata.h> @@ -73,6 +74,31 @@ #include <net/lwtunnel.h> #include <net/ipv6_stubs.h> #include <net/bpf_sk_storage.h> +#include <net/transp_v6.h> +#include <linux/btf_ids.h> + +int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len) +{ + if (in_compat_syscall()) { + struct compat_sock_fprog f32; + + if (len != sizeof(f32)) + return -EINVAL; + if (copy_from_sockptr(&f32, src, sizeof(f32))) + return -EFAULT; + memset(dst, 0, sizeof(*dst)); + dst->len = f32.len; + dst->filter = compat_ptr(f32.filter); + } else { + if (len != sizeof(*dst)) + return -EINVAL; + if (copy_from_sockptr(dst, src, sizeof(*dst))) + return -EFAULT; + } + + return 0; +} +EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user); /** * sk_filter_trim_cap - run a packet through a socket filter @@ -3777,7 +3803,9 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; -static int bpf_skb_output_btf_ids[5]; +BTF_ID_LIST(bpf_skb_output_btf_ids) +BTF_ID(struct, sk_buff) + const struct bpf_func_proto bpf_skb_output_proto = { .func = bpf_skb_event_output, .gpl_only = true, @@ -4171,7 +4199,9 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; -static int bpf_xdp_output_btf_ids[5]; +BTF_ID_LIST(bpf_xdp_output_btf_ids) +BTF_ID(struct, xdp_buff) + const struct bpf_func_proto bpf_xdp_output_proto = { .func = bpf_xdp_event_output, .gpl_only = true, @@ -4289,10 +4319,10 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen, u32 flags) { char devname[IFNAMSIZ]; + int val, valbool; struct net *net; int ifindex; int ret = 0; - int val; if (!sk_fullsock(sk)) return -EINVAL; @@ -4303,6 +4333,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, if (optlen != sizeof(int) && optname != SO_BINDTODEVICE) return -EINVAL; val = *((int *)optval); + valbool = val ? 
1 : 0; /* Only some socketops are supported */ switch (optname) { @@ -4361,6 +4392,11 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, } ret = sock_bindtoindex(sk, ifindex, false); break; + case SO_KEEPALIVE: + if (sk->sk_prot->keepalive) + sk->sk_prot->keepalive(sk, valbool); + sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); + break; default: ret = -EINVAL; } @@ -4421,6 +4457,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, ret = tcp_set_congestion_control(sk, name, false, reinit, true); } else { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); if (optlen != sizeof(int)) @@ -4449,6 +4486,33 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, else tp->save_syn = val; break; + case TCP_KEEPIDLE: + ret = tcp_sock_set_keepidle_locked(sk, val); + break; + case TCP_KEEPINTVL: + if (val < 1 || val > MAX_TCP_KEEPINTVL) + ret = -EINVAL; + else + tp->keepalive_intvl = val * HZ; + break; + case TCP_KEEPCNT: + if (val < 1 || val > MAX_TCP_KEEPCNT) + ret = -EINVAL; + else + tp->keepalive_probes = val; + break; + case TCP_SYNCNT: + if (val < 1 || val > MAX_TCP_SYNCNT) + ret = -EINVAL; + else + icsk->icsk_syn_retries = val; + break; + case TCP_USER_TIMEOUT: + if (val < 0) + ret = -EINVAL; + else + icsk->icsk_user_timeout = val; + break; default: ret = -EINVAL; } @@ -6123,6 +6187,7 @@ bool bpf_helper_changes_pkt_data(void *func) } const struct bpf_func_proto bpf_event_output_data_proto __weak; +const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak; static const struct bpf_func_proto * sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) @@ -6155,6 +6220,8 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_curr_proto; #endif + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_cg_sock_proto; default: return bpf_base_func_proto(func_id); } @@ -6858,6 +6925,7 @@ static bool __sock_filter_check_attach_type(int off, case offsetof(struct bpf_sock, priority): switch (attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: goto full_access; default: return false; @@ -9187,6 +9255,189 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = { const struct bpf_prog_ops sk_reuseport_prog_ops = { }; + +DEFINE_STATIC_KEY_FALSE(bpf_sk_lookup_enabled); +EXPORT_SYMBOL(bpf_sk_lookup_enabled); + +BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx, + struct sock *, sk, u64, flags) +{ + if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE | + BPF_SK_LOOKUP_F_NO_REUSEPORT))) + return -EINVAL; + if (unlikely(sk && sk_is_refcounted(sk))) + return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */ + if (unlikely(sk && sk->sk_state == TCP_ESTABLISHED)) + return -ESOCKTNOSUPPORT; /* reject connected sockets */ + + /* Check if socket is suitable for packet L3/L4 protocol */ + if (sk && sk->sk_protocol != ctx->protocol) + return -EPROTOTYPE; + if (sk && sk->sk_family != ctx->family && + (sk->sk_family == AF_INET || ipv6_only_sock(sk))) + return -EAFNOSUPPORT; + + if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE)) + return -EEXIST; + + /* Select socket as lookup result */ + ctx->selected_sk = sk; + ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT; + return 0; +} + +static const struct bpf_func_proto bpf_sk_lookup_assign_proto = { + .func = bpf_sk_lookup_assign, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = 
ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_SOCKET_OR_NULL, + .arg3_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; + case BPF_FUNC_sk_assign: + return &bpf_sk_lookup_assign_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static bool sk_lookup_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(struct bpf_sk_lookup)) + return false; + if (off % size != 0) + return false; + if (type != BPF_READ) + return false; + + switch (off) { + case offsetof(struct bpf_sk_lookup, sk): + info->reg_type = PTR_TO_SOCKET_OR_NULL; + return size == sizeof(__u64); + + case bpf_ctx_range(struct bpf_sk_lookup, family): + case bpf_ctx_range(struct bpf_sk_lookup, protocol): + case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4): + case bpf_ctx_range(struct bpf_sk_lookup, local_ip4): + case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): + case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): + case bpf_ctx_range(struct bpf_sk_lookup, remote_port): + case bpf_ctx_range(struct bpf_sk_lookup, local_port): + bpf_ctx_record_field_size(info, sizeof(__u32)); + return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32)); + + default: + return false; + } +} + +static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sk_lookup, sk): + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, selected_sk)); + break; + + case offsetof(struct bpf_sk_lookup, family): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + family, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, protocol): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + protocol, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, remote_ip4): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + v4.saddr, 4, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, local_ip4): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + v4.daddr, 4, target_size)); + break; + + case bpf_ctx_range_till(struct bpf_sk_lookup, + remote_ip6[0], remote_ip6[3]): { +#if IS_ENABLED(CONFIG_IPV6) + int off = si->off; + + off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]); + off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, v6.saddr)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + } + case bpf_ctx_range_till(struct bpf_sk_lookup, + local_ip6[0], local_ip6[3]): { +#if IS_ENABLED(CONFIG_IPV6) + int off = si->off; + + off -= offsetof(struct bpf_sk_lookup, local_ip6[0]); + off += bpf_target_off(struct in6_addr, s6_addr32[0], 
4, target_size); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, v6.daddr)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + } + case offsetof(struct bpf_sk_lookup, remote_port): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + sport, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, local_port): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + dport, 2, target_size)); + break; + } + + return insn - insn_buf; +} + +const struct bpf_prog_ops sk_lookup_prog_ops = { +}; + +const struct bpf_verifier_ops sk_lookup_verifier_ops = { + .get_func_proto = sk_lookup_func_proto, + .is_valid_access = sk_lookup_is_valid_access, + .convert_ctx_access = sk_lookup_convert_ctx_access, +}; + #endif /* CONFIG_INET */ DEFINE_BPF_DISPATCHER(xdp) @@ -9195,3 +9446,132 @@ void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) { bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog); } + +#ifdef CONFIG_DEBUG_INFO_BTF +BTF_ID_LIST_GLOBAL(btf_sock_ids) +#define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type) +BTF_SOCK_TYPE_xxx +#undef BTF_SOCK_TYPE +#else +u32 btf_sock_ids[MAX_BTF_SOCK_TYPE]; +#endif + +static bool check_arg_btf_id(u32 btf_id, u32 arg) +{ + int i; + + /* only one argument, no need to check arg */ + for (i = 0; i < MAX_BTF_SOCK_TYPE; i++) + if (btf_sock_ids[i] == btf_id) + return true; + return false; +} + +BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) +{ + /* tcp6_sock type is not generated in dwarf and hence btf, + * trigger an explicit type generation here. 
+ */ + BTF_TYPE_EMIT(struct tcp6_sock); + if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && + sk->sk_family == AF_INET6) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { + .func = bpf_skc_to_tcp6_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6], +}; + +BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk) +{ + if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = { + .func = bpf_skc_to_tcp_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP], +}; + +BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk) +{ +#ifdef CONFIG_INET + if (sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT) + return (unsigned long)sk; +#endif + +#if IS_BUILTIN(CONFIG_IPV6) + if (sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT) + return (unsigned long)sk; +#endif + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = { + .func = bpf_skc_to_tcp_timewait_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW], +}; + +BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk) +{ +#ifdef CONFIG_INET + if (sk->sk_prot == &tcp_prot && sk->sk_state == TCP_NEW_SYN_RECV) + return (unsigned long)sk; +#endif + +#if IS_BUILTIN(CONFIG_IPV6) + if (sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV) + return (unsigned long)sk; +#endif + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = { + .func = bpf_skc_to_tcp_request_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ], +}; + +BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk) +{ + /* udp6_sock type is not generated in dwarf and hence btf, + * trigger an explicit type generation here. 
+ */ + BTF_TYPE_EMIT(struct udp6_sock); + if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_UDP && + sk->sk_type == SOCK_DGRAM && sk->sk_family == AF_INET6) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = { + .func = bpf_skc_to_udp6_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6], +}; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 142a8824f0a8..29806eb765cf 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -383,6 +383,23 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, } EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); +void skb_flow_dissect_hash(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct flow_dissector_key_hash *key; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_HASH)) + return; + + key = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_HASH, + target_container); + + key->hash = skb_get_hash_raw(skb); +} +EXPORT_SYMBOL(skb_flow_dissect_hash); + static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, struct flow_dissector *flow_dissector, diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 2076219b8ba5..d4474c812b64 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -430,7 +430,7 @@ EXPORT_SYMBOL(flow_indr_dev_unregister); static void flow_block_indr_init(struct flow_block_cb *flow_block, struct flow_block_offload *bo, - struct net_device *dev, void *data, + struct net_device *dev, struct Qdisc *sch, void *data, void *cb_priv, void (*cleanup)(struct flow_block_cb *block_cb)) { @@ -438,6 +438,7 @@ static void flow_block_indr_init(struct flow_block_cb *flow_block, flow_block->indr.data = data; flow_block->indr.cb_priv = cb_priv; flow_block->indr.dev = dev; + flow_block->indr.sch = sch; flow_block->indr.cleanup = cleanup; } @@ -445,7 +446,8 @@ struct flow_block_cb *flow_indr_block_cb_alloc(flow_setup_cb_t *cb, void *cb_ident, void *cb_priv, void (*release)(void *cb_priv), struct flow_block_offload *bo, - struct net_device *dev, void *data, + struct net_device *dev, + struct Qdisc *sch, void *data, void *indr_cb_priv, void (*cleanup)(struct flow_block_cb *block_cb)) { @@ -455,7 +457,7 @@ struct flow_block_cb *flow_indr_block_cb_alloc(flow_setup_cb_t *cb, if (IS_ERR(block_cb)) goto out; - flow_block_indr_init(block_cb, bo, dev, data, indr_cb_priv, cleanup); + flow_block_indr_init(block_cb, bo, dev, sch, data, indr_cb_priv, cleanup); list_add(&block_cb->indr.list, &flow_block_indr_list); out: @@ -463,7 +465,7 @@ out: } EXPORT_SYMBOL(flow_indr_block_cb_alloc); -int flow_indr_dev_setup_offload(struct net_device *dev, +int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch, enum tc_setup_type type, void *data, struct flow_block_offload *bo, void (*cleanup)(struct flow_block_cb *block_cb)) @@ -472,7 +474,7 @@ int flow_indr_dev_setup_offload(struct net_device *dev, mutex_lock(&flow_indr_block_lock); list_for_each_entry(this, &flow_block_indr_dev_list, list) - this->cb(dev, this->cb_priv, type, bo, data, cleanup); + this->cb(dev, sch, this->cb_priv, type, bo, data, cleanup); mutex_unlock(&flow_indr_block_lock); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index ef6b5a8f629c..8e39e28b0a8d 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ 
-1783,6 +1783,7 @@ const struct nla_policy nda_policy[NDA_MAX+1] = { [NDA_MASTER] = { .type = NLA_U32 }, [NDA_PROTOCOL] = { .type = NLA_U8 }, [NDA_NH_ID] = { .type = NLA_U32 }, + [NDA_FDB_EXT_ATTRS] = { .type = NLA_NESTED }, }; static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 85a4b0101f76..68e0682450c6 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1000,6 +1000,16 @@ static size_t rtnl_prop_list_size(const struct net_device *dev) return size; } +static size_t rtnl_proto_down_size(const struct net_device *dev) +{ + size_t size = nla_total_size(1); + + if (dev->proto_down_reason) + size += nla_total_size(0) + nla_total_size(4); + + return size; +} + static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { @@ -1041,7 +1051,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(4) /* IFLA_NEW_NETNSID */ + nla_total_size(4) /* IFLA_NEW_IFINDEX */ - + nla_total_size(1) /* IFLA_PROTO_DOWN */ + + rtnl_proto_down_size(dev) /* proto down */ + nla_total_size(4) /* IFLA_TARGET_NETNSID */ + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ @@ -1416,13 +1426,12 @@ static u32 rtnl_xdp_prog_skb(struct net_device *dev) static u32 rtnl_xdp_prog_drv(struct net_device *dev) { - return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG); + return dev_xdp_prog_id(dev, XDP_MODE_DRV); } static u32 rtnl_xdp_prog_hw(struct net_device *dev) { - return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, - XDP_QUERY_PROG_HW); + return dev_xdp_prog_id(dev, XDP_MODE_HW); } static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev, @@ -1658,6 +1667,35 @@ nest_cancel: return ret; } +static int rtnl_fill_proto_down(struct sk_buff *skb, + const struct net_device *dev) +{ + struct nlattr *pr; + u32 preason; + + if (nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) + goto nla_put_failure; + + preason = dev->proto_down_reason; + if (!preason) + return 0; + + pr = nla_nest_start(skb, IFLA_PROTO_DOWN_REASON); + if (!pr) + return -EMSGSIZE; + + if (nla_put_u32(skb, IFLA_PROTO_DOWN_REASON_VALUE, preason)) { + nla_nest_cancel(skb, pr); + goto nla_put_failure; + } + + nla_nest_end(skb, pr); + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, @@ -1708,13 +1746,15 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_up_count) + atomic_read(&dev->carrier_down_count)) || - nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down) || nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, atomic_read(&dev->carrier_up_count)) || nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, atomic_read(&dev->carrier_down_count))) goto nla_put_failure; + if (rtnl_fill_proto_down(skb, dev)) + goto nla_put_failure; + if (event != IFLA_EVENT_NONE) { if (nla_put_u32(skb, IFLA_EVENT, event)) goto nla_put_failure; @@ -1834,6 +1874,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_ALT_IFNAME] = { .type = NLA_STRING, .len = ALTIFNAMSIZ - 1 }, [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT }, + [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2483,6 +2524,67 @@ static int do_set_master(struct net_device *dev, int ifindex, return 0; } 
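The additions that follow wire up IFLA_PROTO_DOWN_REASON, a nested mask/value attribute pair recording why an interface was taken protocol-down. As a minimal sketch of the semantics this hunk implies (an illustrative helper, not code from the patch): bits selected by the mask are copied in from the value, and when no mask is supplied the whole reason word is replaced.

/* Hypothetical helper, for illustration only. */
static u32 proto_down_reason_apply(u32 cur, u32 mask, u32 value)
{
        if (!mask)
                return value;                   /* no mask: replace the whole word */
        return (cur & ~mask) | (value & mask);  /* otherwise update only masked bits */
}

A non-zero reason word then keeps IFLA_PROTO_DOWN itself from being cleared, which is what the -EBUSY check in do_set_proto_down() below enforces.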
+static const struct nla_policy ifla_proto_down_reason_policy[IFLA_PROTO_DOWN_REASON_VALUE + 1] = { + [IFLA_PROTO_DOWN_REASON_MASK] = { .type = NLA_U32 }, + [IFLA_PROTO_DOWN_REASON_VALUE] = { .type = NLA_U32 }, +}; + +static int do_set_proto_down(struct net_device *dev, + struct nlattr *nl_proto_down, + struct nlattr *nl_proto_down_reason, + struct netlink_ext_ack *extack) +{ + struct nlattr *pdreason[IFLA_PROTO_DOWN_REASON_MAX + 1]; + const struct net_device_ops *ops = dev->netdev_ops; + unsigned long mask = 0; + u32 value; + bool proto_down; + int err; + + if (!ops->ndo_change_proto_down) { + NL_SET_ERR_MSG(extack, "Protodown not supported by device"); + return -EOPNOTSUPP; + } + + if (nl_proto_down_reason) { + err = nla_parse_nested_deprecated(pdreason, + IFLA_PROTO_DOWN_REASON_MAX, + nl_proto_down_reason, + ifla_proto_down_reason_policy, + NULL); + if (err < 0) + return err; + + if (!pdreason[IFLA_PROTO_DOWN_REASON_VALUE]) { + NL_SET_ERR_MSG(extack, "Invalid protodown reason value"); + return -EINVAL; + } + + value = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_VALUE]); + + if (pdreason[IFLA_PROTO_DOWN_REASON_MASK]) + mask = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_MASK]); + + dev_change_proto_down_reason(dev, mask, value); + } + + if (nl_proto_down) { + proto_down = nla_get_u8(nl_proto_down); + + /* Dont turn off protodown if there are active reasons */ + if (!proto_down && dev->proto_down_reason) { + NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons"); + return -EBUSY; + } + err = dev_change_proto_down(dev, + proto_down); + if (err) + return err; + } + + return 0; +} + #define DO_SETLINK_MODIFIED 0x01 /* notify flag means notify + modified. */ #define DO_SETLINK_NOTIFY 0x03 @@ -2771,9 +2873,9 @@ static int do_setlink(const struct sk_buff *skb, } err = 0; - if (tb[IFLA_PROTO_DOWN]) { - err = dev_change_proto_down(dev, - nla_get_u8(tb[IFLA_PROTO_DOWN])); + if (tb[IFLA_PROTO_DOWN] || tb[IFLA_PROTO_DOWN_REASON]) { + err = do_set_proto_down(dev, tb[IFLA_PROTO_DOWN], + tb[IFLA_PROTO_DOWN_REASON], extack); if (err) goto errout; status |= DO_SETLINK_NOTIFY; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b8afefe6f6b6..2828f6d5ba89 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3758,7 +3758,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, int err = -ENOMEM; int i = 0; int pos; - int dummy; if (list_skb && !list_skb->head_frag && skb_headlen(list_skb) && (skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY)) { @@ -3780,7 +3779,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, } __skb_push(head_skb, doffset); - proto = skb_network_protocol(head_skb, &dummy); + proto = skb_network_protocol(head_skb, NULL); if (unlikely(!proto)) return ERR_PTR(-EINVAL); @@ -4413,7 +4412,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) * at the moment even if they are anonymous). */ if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && - __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL) + !__pskb_pull_tail(skb, __skb_pagelen(skb))) return -ENOMEM; /* Easy case. Most of packets will go this way. 
*/ @@ -4692,7 +4691,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && sk->sk_protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM) { - skb = tcp_get_timestamping_opt_stats(sk); + skb = tcp_get_timestamping_opt_stats(sk, orig_skb); opt_stats = true; } else #endif diff --git a/net/core/sock.c b/net/core/sock.c index 8ccdcdaaa673..49cd5ffe673e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -113,6 +113,7 @@ #include <linux/static_key.h> #include <linux/memcontrol.h> #include <linux/prefetch.h> +#include <linux/compat.h> #include <linux/uaccess.h> @@ -360,7 +361,8 @@ static int sock_get_timeout(long timeo, void *optval, bool old_timeval) return sizeof(tv); } -static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval) +static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, + bool old_timeval) { struct __kernel_sock_timeval tv; @@ -370,7 +372,7 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool if (optlen < sizeof(tv32)) return -EINVAL; - if (copy_from_user(&tv32, optval, sizeof(tv32))) + if (copy_from_sockptr(&tv32, optval, sizeof(tv32))) return -EFAULT; tv.tv_sec = tv32.tv_sec; tv.tv_usec = tv32.tv_usec; @@ -379,14 +381,14 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool if (optlen < sizeof(old_tv)) return -EINVAL; - if (copy_from_user(&old_tv, optval, sizeof(old_tv))) + if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) return -EFAULT; tv.tv_sec = old_tv.tv_sec; tv.tv_usec = old_tv.tv_usec; } else { if (optlen < sizeof(tv)) return -EINVAL; - if (copy_from_user(&tv, optval, sizeof(tv))) + if (copy_from_sockptr(&tv, optval, sizeof(tv))) return -EFAULT; } if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) @@ -608,8 +610,7 @@ int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) } EXPORT_SYMBOL(sock_bindtoindex); -static int sock_setbindtodevice(struct sock *sk, char __user *optval, - int optlen) +static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES @@ -631,7 +632,7 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, memset(devname, 0, sizeof(devname)); ret = -EFAULT; - if (copy_from_user(devname, optval, optlen)) + if (copy_from_sockptr(devname, optval, optlen)) goto out; index = 0; @@ -695,15 +696,6 @@ out: return ret; } -static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit, - int valbool) -{ - if (valbool) - sock_set_flag(sk, bit); - else - sock_reset_flag(sk, bit); -} - bool sk_mc_loop(struct sock *sk) { if (dev_recursion_level()) @@ -834,7 +826,7 @@ EXPORT_SYMBOL(sock_set_rcvbuf); */ int sock_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock_txtime sk_txtime; struct sock *sk = sock->sk; @@ -853,7 +845,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; valbool = val ? 
1 : 0; @@ -966,7 +958,7 @@ set_sndbuf: ret = -EINVAL; /* 1003.1g */ break; } - if (copy_from_user(&ling, optval, sizeof(ling))) { + if (copy_from_sockptr(&ling, optval, sizeof(ling))) { ret = -EFAULT; break; } @@ -1060,60 +1052,52 @@ set_sndbuf: case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: - ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD); + ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, + optlen, optname == SO_RCVTIMEO_OLD); break; case SO_SNDTIMEO_OLD: case SO_SNDTIMEO_NEW: - ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD); + ret = sock_set_timeout(&sk->sk_sndtimeo, optval, + optlen, optname == SO_SNDTIMEO_OLD); break; - case SO_ATTACH_FILTER: - ret = -EINVAL; - if (optlen == sizeof(struct sock_fprog)) { - struct sock_fprog fprog; - - ret = -EFAULT; - if (copy_from_user(&fprog, optval, sizeof(fprog))) - break; + case SO_ATTACH_FILTER: { + struct sock_fprog fprog; + ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); + if (!ret) ret = sk_attach_filter(&fprog, sk); - } break; - + } case SO_ATTACH_BPF: ret = -EINVAL; if (optlen == sizeof(u32)) { u32 ufd; ret = -EFAULT; - if (copy_from_user(&ufd, optval, sizeof(ufd))) + if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break; ret = sk_attach_bpf(ufd, sk); } break; - case SO_ATTACH_REUSEPORT_CBPF: - ret = -EINVAL; - if (optlen == sizeof(struct sock_fprog)) { - struct sock_fprog fprog; - - ret = -EFAULT; - if (copy_from_user(&fprog, optval, sizeof(fprog))) - break; + case SO_ATTACH_REUSEPORT_CBPF: { + struct sock_fprog fprog; + ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); + if (!ret) ret = sk_reuseport_attach_filter(&fprog, sk); - } break; - + } case SO_ATTACH_REUSEPORT_EBPF: ret = -EINVAL; if (optlen == sizeof(u32)) { u32 ufd; ret = -EFAULT; - if (copy_from_user(&ufd, optval, sizeof(ufd))) + if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break; ret = sk_reuseport_attach_bpf(ufd, sk); @@ -1193,7 +1177,7 @@ set_sndbuf: if (sizeof(ulval) != sizeof(val) && optlen >= sizeof(ulval) && - get_user(ulval, (unsigned long __user *)optval)) { + copy_from_sockptr(&ulval, optval, sizeof(ulval))) { ret = -EFAULT; break; } @@ -1236,7 +1220,7 @@ set_sndbuf: if (optlen != sizeof(struct sock_txtime)) { ret = -EINVAL; break; - } else if (copy_from_user(&sk_txtime, optval, + } else if (copy_from_sockptr(&sk_txtime, optval, sizeof(struct sock_txtime))) { ret = -EFAULT; break; @@ -2802,20 +2786,6 @@ int sock_no_shutdown(struct socket *sock, int how) } EXPORT_SYMBOL(sock_no_shutdown); -int sock_no_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) -{ - return -EOPNOTSUPP; -} -EXPORT_SYMBOL(sock_no_setsockopt); - -int sock_no_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) -{ - return -EOPNOTSUPP; -} -EXPORT_SYMBOL(sock_no_getsockopt); - int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) { return -EOPNOTSUPP; @@ -3243,20 +3213,6 @@ int sock_common_getsockopt(struct socket *sock, int level, int optname, } EXPORT_SYMBOL(sock_common_getsockopt); -#ifdef CONFIG_COMPAT -int compat_sock_common_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) -{ - struct sock *sk = sock->sk; - - if (sk->sk_prot->compat_getsockopt != NULL) - return sk->sk_prot->compat_getsockopt(sk, level, optname, - optval, optlen); - return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); -} 
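The SO_ATTACH_* conversions above show the pattern repeated throughout this series: optval becomes a sockptr_t, copy_from_user() becomes copy_from_sockptr(), and a single implementation then serves both user pointers and in-kernel callers, which is why the compat_sock_common_*() wrappers around this point can simply be deleted. A minimal sketch of the idiom (a made-up integer option handler, assuming linux/sockptr.h; not code from the patch):

/* Illustrative only: the shape of a sockptr_t-based option handler. */
static int demo_set_int_opt(struct sock *sk, sockptr_t optval,
                            unsigned int optlen, int *target)
{
        int val;

        if (optlen < sizeof(val))
                return -EINVAL;
        /* Works whether optval wraps a user or a kernel pointer. */
        if (copy_from_sockptr(&val, optval, sizeof(val)))
                return -EFAULT;
        *target = val;
        return 0;
}

An in-kernel caller would pass KERNEL_SOCKPTR(&val) where userspace hands in USER_SOCKPTR(optval).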
-EXPORT_SYMBOL(compat_sock_common_getsockopt); -#endif - int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { @@ -3276,7 +3232,7 @@ EXPORT_SYMBOL(sock_common_recvmsg); * Set socket options on an inet socket. */ int sock_common_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; @@ -3284,20 +3240,6 @@ int sock_common_setsockopt(struct socket *sock, int level, int optname, } EXPORT_SYMBOL(sock_common_setsockopt); -#ifdef CONFIG_COMPAT -int compat_sock_common_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) -{ - struct sock *sk = sock->sk; - - if (sk->sk_prot->compat_setsockopt != NULL) - return sk->sk_prot->compat_setsockopt(sk, level, optname, - optval, optlen); - return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); -} -EXPORT_SYMBOL(compat_sock_common_setsockopt); -#endif - void sk_common_release(struct sock *sk) { if (sk->sk_prot->destroy) @@ -3596,6 +3538,7 @@ int sock_load_diag_module(int family, int protocol) #ifdef CONFIG_INET if (family == AF_INET && protocol != IPPROTO_RAW && + protocol < MAX_INET_PROTOS && !rcu_access_pointer(inet_protos[protocol])) return -ENOENT; #endif diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 0971f17e8e54..119f52a99dc1 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -681,6 +681,7 @@ const struct bpf_func_proto bpf_msg_redirect_map_proto = { .arg4_type = ARG_ANYTHING, }; +static int sock_map_btf_id; const struct bpf_map_ops sock_map_ops = { .map_alloc = sock_map_alloc, .map_free = sock_map_free, @@ -691,9 +692,11 @@ const struct bpf_map_ops sock_map_ops = { .map_lookup_elem = sock_map_lookup, .map_release_uref = sock_map_release_progs, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_stab", + .map_btf_id = &sock_map_btf_id, }; -struct bpf_htab_elem { +struct bpf_shtab_elem { struct rcu_head rcu; u32 hash; struct sock *sk; @@ -701,14 +704,14 @@ struct bpf_htab_elem { u8 key[]; }; -struct bpf_htab_bucket { +struct bpf_shtab_bucket { struct hlist_head head; raw_spinlock_t lock; }; -struct bpf_htab { +struct bpf_shtab { struct bpf_map map; - struct bpf_htab_bucket *buckets; + struct bpf_shtab_bucket *buckets; u32 buckets_num; u32 elem_size; struct sk_psock_progs progs; @@ -720,17 +723,17 @@ static inline u32 sock_hash_bucket_hash(const void *key, u32 len) return jhash(key, len, 0); } -static struct bpf_htab_bucket *sock_hash_select_bucket(struct bpf_htab *htab, - u32 hash) +static struct bpf_shtab_bucket *sock_hash_select_bucket(struct bpf_shtab *htab, + u32 hash) { return &htab->buckets[hash & (htab->buckets_num - 1)]; } -static struct bpf_htab_elem * +static struct bpf_shtab_elem * sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, u32 key_size) { - struct bpf_htab_elem *elem; + struct bpf_shtab_elem *elem; hlist_for_each_entry_rcu(elem, head, node) { if (elem->hash == hash && @@ -743,10 +746,10 @@ sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 key_size = map->key_size, hash; - struct bpf_htab_bucket *bucket; - struct bpf_htab_elem *elem; + struct bpf_shtab_bucket *bucket; + struct bpf_shtab_elem *elem; WARN_ON_ONCE(!rcu_read_lock_held()); @@ 
-757,8 +760,8 @@ static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) return elem ? elem->sk : NULL; } -static void sock_hash_free_elem(struct bpf_htab *htab, - struct bpf_htab_elem *elem) +static void sock_hash_free_elem(struct bpf_shtab *htab, + struct bpf_shtab_elem *elem) { atomic_dec(&htab->count); kfree_rcu(elem, rcu); @@ -767,9 +770,9 @@ static void sock_hash_free_elem(struct bpf_htab *htab, static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk, void *link_raw) { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct bpf_htab_elem *elem_probe, *elem = link_raw; - struct bpf_htab_bucket *bucket; + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); + struct bpf_shtab_elem *elem_probe, *elem = link_raw; + struct bpf_shtab_bucket *bucket; WARN_ON_ONCE(!rcu_read_lock_held()); bucket = sock_hash_select_bucket(htab, elem->hash); @@ -791,10 +794,10 @@ static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk, static int sock_hash_delete_elem(struct bpf_map *map, void *key) { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 hash, key_size = map->key_size; - struct bpf_htab_bucket *bucket; - struct bpf_htab_elem *elem; + struct bpf_shtab_bucket *bucket; + struct bpf_shtab_elem *elem; int ret = -ENOENT; hash = sock_hash_bucket_hash(key, key_size); @@ -812,12 +815,12 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key) return ret; } -static struct bpf_htab_elem *sock_hash_alloc_elem(struct bpf_htab *htab, - void *key, u32 key_size, - u32 hash, struct sock *sk, - struct bpf_htab_elem *old) +static struct bpf_shtab_elem *sock_hash_alloc_elem(struct bpf_shtab *htab, + void *key, u32 key_size, + u32 hash, struct sock *sk, + struct bpf_shtab_elem *old) { - struct bpf_htab_elem *new; + struct bpf_shtab_elem *new; if (atomic_inc_return(&htab->count) > htab->map.max_entries) { if (!old) { @@ -841,10 +844,10 @@ static struct bpf_htab_elem *sock_hash_alloc_elem(struct bpf_htab *htab, static int sock_hash_update_common(struct bpf_map *map, void *key, struct sock *sk, u64 flags) { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 key_size = map->key_size, hash; - struct bpf_htab_elem *elem, *elem_new; - struct bpf_htab_bucket *bucket; + struct bpf_shtab_elem *elem, *elem_new; + struct bpf_shtab_bucket *bucket; struct sk_psock_link *link; struct sk_psock *psock; int ret; @@ -954,8 +957,8 @@ out: static int sock_hash_get_next_key(struct bpf_map *map, void *key, void *key_next) { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct bpf_htab_elem *elem, *elem_next; + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); + struct bpf_shtab_elem *elem, *elem_next; u32 hash, key_size = map->key_size; struct hlist_head *head; int i = 0; @@ -969,7 +972,7 @@ static int sock_hash_get_next_key(struct bpf_map *map, void *key, goto find_first_elem; elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&elem->node)), - struct bpf_htab_elem, node); + struct bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); return 0; @@ -981,7 +984,7 @@ find_first_elem: for (; i < htab->buckets_num; i++) { head = &sock_hash_select_bucket(htab, i)->head; elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), - struct bpf_htab_elem, node); + struct 
bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); return 0; @@ -993,7 +996,7 @@ find_first_elem: static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) { - struct bpf_htab *htab; + struct bpf_shtab *htab; int i, err; u64 cost; @@ -1015,15 +1018,15 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) bpf_map_init_from_attr(&htab->map, attr); htab->buckets_num = roundup_pow_of_two(htab->map.max_entries); - htab->elem_size = sizeof(struct bpf_htab_elem) + + htab->elem_size = sizeof(struct bpf_shtab_elem) + round_up(htab->map.key_size, 8); if (htab->buckets_num == 0 || - htab->buckets_num > U32_MAX / sizeof(struct bpf_htab_bucket)) { + htab->buckets_num > U32_MAX / sizeof(struct bpf_shtab_bucket)) { err = -EINVAL; goto free_htab; } - cost = (u64) htab->buckets_num * sizeof(struct bpf_htab_bucket) + + cost = (u64) htab->buckets_num * sizeof(struct bpf_shtab_bucket) + (u64) htab->elem_size * htab->map.max_entries; if (cost >= U32_MAX - PAGE_SIZE) { err = -EINVAL; @@ -1034,7 +1037,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) goto free_htab; htab->buckets = bpf_map_area_alloc(htab->buckets_num * - sizeof(struct bpf_htab_bucket), + sizeof(struct bpf_shtab_bucket), htab->map.numa_node); if (!htab->buckets) { bpf_map_charge_finish(&htab->map.memory); @@ -1055,10 +1058,10 @@ free_htab: static void sock_hash_free(struct bpf_map *map) { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct bpf_htab_bucket *bucket; + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); + struct bpf_shtab_bucket *bucket; struct hlist_head unlink_list; - struct bpf_htab_elem *elem; + struct bpf_shtab_elem *elem; struct hlist_node *node; int i; @@ -1134,7 +1137,7 @@ static void *sock_hash_lookup(struct bpf_map *map, void *key) static void sock_hash_release_progs(struct bpf_map *map) { - psock_progs_drop(&container_of(map, struct bpf_htab, map)->progs); + psock_progs_drop(&container_of(map, struct bpf_shtab, map)->progs); } BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops, @@ -1214,6 +1217,7 @@ const struct bpf_func_proto bpf_msg_redirect_hash_proto = { .arg4_type = ARG_ANYTHING, }; +static int sock_hash_map_btf_id; const struct bpf_map_ops sock_hash_ops = { .map_alloc = sock_hash_alloc, .map_free = sock_hash_free, @@ -1224,6 +1228,8 @@ const struct bpf_map_ops sock_hash_ops = { .map_lookup_elem_sys_only = sock_hash_lookup_sys, .map_release_uref = sock_hash_release_progs, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_shtab", + .map_btf_id = &sock_hash_map_btf_id, }; static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) @@ -1232,7 +1238,7 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) case BPF_MAP_TYPE_SOCKMAP: return &container_of(map, struct bpf_stab, map)->progs; case BPF_MAP_TYPE_SOCKHASH: - return &container_of(map, struct bpf_htab, map)->progs; + return &container_of(map, struct bpf_shtab, map)->progs; default: break; } diff --git a/net/core/tso.c b/net/core/tso.c index d4d5c077ad72..4148f6d48953 100644 --- a/net/core/tso.c +++ b/net/core/tso.c @@ -6,18 +6,17 @@ #include <asm/unaligned.h> /* Calculate expected number of TX descriptors */ -int tso_count_descs(struct sk_buff *skb) +int tso_count_descs(const struct sk_buff *skb) { /* The Marvell Way */ return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags; } EXPORT_SYMBOL(tso_count_descs); -void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso, +void tso_build_hdr(const struct 
sk_buff *skb, char *hdr, struct tso_t *tso, int size, bool is_last) { - struct tcphdr *tcph; - int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + int hdr_len = skb_transport_offset(skb) + tso->tlen; int mac_hdr_len = skb_network_offset(skb); memcpy(hdr, skb->data, hdr_len); @@ -30,23 +29,31 @@ void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso, } else { struct ipv6hdr *iph = (void *)(hdr + mac_hdr_len); - iph->payload_len = htons(size + tcp_hdrlen(skb)); + iph->payload_len = htons(size + tso->tlen); } - tcph = (struct tcphdr *)(hdr + skb_transport_offset(skb)); - put_unaligned_be32(tso->tcp_seq, &tcph->seq); + hdr += skb_transport_offset(skb); + if (tso->tlen != sizeof(struct udphdr)) { + struct tcphdr *tcph = (struct tcphdr *)hdr; - if (!is_last) { - /* Clear all special flags for not last packet */ - tcph->psh = 0; - tcph->fin = 0; - tcph->rst = 0; + put_unaligned_be32(tso->tcp_seq, &tcph->seq); + + if (!is_last) { + /* Clear all special flags for not last packet */ + tcph->psh = 0; + tcph->fin = 0; + tcph->rst = 0; + } + } else { + struct udphdr *uh = (struct udphdr *)hdr; + + uh->len = htons(sizeof(*uh) + size); } } EXPORT_SYMBOL(tso_build_hdr); -void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size) +void tso_build_data(const struct sk_buff *skb, struct tso_t *tso, int size) { - tso->tcp_seq += size; + tso->tcp_seq += size; /* not worth avoiding this operation for UDP */ tso->size -= size; tso->data += size; @@ -62,12 +69,14 @@ void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size) } EXPORT_SYMBOL(tso_build_data); -void tso_start(struct sk_buff *skb, struct tso_t *tso) +int tso_start(struct sk_buff *skb, struct tso_t *tso) { - int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + int tlen = skb_is_gso_tcp(skb) ? tcp_hdrlen(skb) : sizeof(struct udphdr); + int hdr_len = skb_transport_offset(skb) + tlen; + tso->tlen = tlen; tso->ip_id = ntohs(ip_hdr(skb)->id); - tso->tcp_seq = ntohl(tcp_hdr(skb)->seq); + tso->tcp_seq = (tlen != sizeof(struct udphdr)) ? ntohl(tcp_hdr(skb)->seq) : 0; tso->next_frag_idx = 0; tso->ipv6 = vlan_get_protocol(skb) == htons(ETH_P_IPV6); @@ -83,5 +92,6 @@ void tso_start(struct sk_buff *skb, struct tso_t *tso) tso->data = skb_frag_address(frag); tso->next_frag_idx++; } + return hdr_len; } EXPORT_SYMBOL(tso_start); diff --git a/net/core/xdp.c b/net/core/xdp.c index 3c45f99e26d5..48aba933a5a8 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -400,15 +400,6 @@ void __xdp_release_frame(void *data, struct xdp_mem_info *mem) } EXPORT_SYMBOL_GPL(__xdp_release_frame); -int xdp_attachment_query(struct xdp_attachment_info *info, - struct netdev_bpf *bpf) -{ - bpf->prog_id = info->prog ? info->prog->aux->id : 0; - bpf->prog_flags = info->prog ? info->flags : 0; - return 0; -} -EXPORT_SYMBOL_GPL(xdp_attachment_query); - bool xdp_attachment_flags_ok(struct xdp_attachment_info *info, struct netdev_bpf *bpf) { diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index d2a4553bcf39..84dde5a2066e 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1736,7 +1736,7 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device *netdev; struct dcbmsg *dcb = nlmsg_data(nlh); struct nlattr *tb[DCB_ATTR_MAX + 1]; - u32 portid = skb ? 
NETLINK_CB(skb).portid : 0; + u32 portid = NETLINK_CB(skb).portid; int ret = -EINVAL; struct sk_buff *reply_skb; struct nlmsghdr *reply_nlh = NULL; diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig index 51ac2631fb48..0c7d2f66ba27 100644 --- a/net/dccp/Kconfig +++ b/net/dccp/Kconfig @@ -5,7 +5,7 @@ menuconfig IP_DCCP help Datagram Congestion Control Protocol (RFC 4340) - From http://www.ietf.org/rfc/rfc4340.txt: + From https://www.ietf.org/rfc/rfc4340.txt: The Datagram Congestion Control Protocol (DCCP) is a transport protocol that implements bidirectional, unicast connections of diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index 4d7771f36eff..a3eeb84d16f9 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig @@ -26,13 +26,13 @@ config IP_DCCP_CCID3 relatively smooth sending rate is of importance. CCID-3 is further described in RFC 4342, - http://www.ietf.org/rfc/rfc4342.txt + https://www.ietf.org/rfc/rfc4342.txt The TFRC congestion control algorithms were initially described in RFC 5348. This text was extracted from RFC 4340 (sec. 10.2), - http://www.ietf.org/rfc/rfc4340.txt + https://www.ietf.org/rfc/rfc4340.txt If in doubt, say N. diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 9ef9bee9610f..aef72f6a2829 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -7,7 +7,7 @@ * An implementation of the DCCP protocol * * This code has been developed by the University of Waikato WAND - * research group. For further information please see http://www.wand.net.nz/ + * research group. For further information please see https://www.wand.net.nz/ * * This code also uses code from Lulea University, rereleased as GPL by its * authors: diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 081c195e7f7d..02e0fc9f6334 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -6,7 +6,7 @@ * An implementation of the DCCP protocol * * This code has been developed by the University of Waikato WAND - * research group. For further information please see http://www.wand.net.nz/ + * research group. For further information please see https://www.wand.net.nz/ * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz * * This code also uses code from Lulea University, rereleased as GPL by its diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 2d41bb036271..af08e2df7108 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -6,7 +6,7 @@ * An implementation of the DCCP protocol * * This code has been developed by the University of Waikato WAND - * research group. For further information please see http://www.wand.net.nz/ + * research group. 
For further information please see https://www.wand.net.nz/ * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz * * This code also uses code from Lulea University, rereleased as GPL by its @@ -365,6 +365,7 @@ void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) /** * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against + * @h: The non-empty RX history object */ static inline struct tfrc_rx_hist_entry * tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h) @@ -374,6 +375,7 @@ static inline struct tfrc_rx_hist_entry * /** * tfrc_rx_hist_rtt_prev_s - previously suitable (wrt rtt_last_s) RTT-sampling entry + * @h: The non-empty RX history object */ static inline struct tfrc_rx_hist_entry * tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h) diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index a157d874840b..159cc9326eab 100644 --- a/net/dccp/ccids/lib/packet_history.h +++ b/net/dccp/ccids/lib/packet_history.h @@ -6,7 +6,7 @@ * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand. * * This code has been developed by the University of Waikato WAND - * research group. For further information please see http://www.wand.net.nz/ + * research group. For further information please see https://www.wand.net.nz/ * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz * * This code also uses code from Lulea University, rereleased as GPL by its diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 7dce4f6c7025..9cc9d1ee6cdb 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -295,13 +295,7 @@ int dccp_disconnect(struct sock *sk, int flags); int dccp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int dccp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); -#ifdef CONFIG_COMPAT -int compat_dccp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen); -int compat_dccp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); -#endif + sockptr_t optval, unsigned int optlen); int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg); int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 9c3b5e056234..afc071ea1271 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -165,6 +165,8 @@ static const struct { /** * dccp_feat_index - Hash function to map feature number into array position + * @feat_num: feature to hash, one of %dccp_feature_numbers + * * Returns consecutive array index or -1 if the feature is not understood. */ static int dccp_feat_index(u8 feat_num) @@ -567,6 +569,8 @@ cloning_failed: /** * dccp_feat_valid_nn_length - Enforce length constraints on NN options + * @feat_num: feature to return length of, one of %dccp_feature_numbers + * * Length is between 0 and %DCCP_OPTVAL_MAXLEN. Used for outgoing packets only, * incoming options are accepted as long as their values are valid. */ @@ -1429,6 +1433,8 @@ int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq, /** * dccp_feat_init - Seed feature negotiation with host-specific defaults + * @sk: Socket to initialize. + * * This initialises global defaults, depending on the value of the sysctls. * These can later be overridden by registering changes via setsockopt calls. 
* The last link in the chain is finalise_settings, to make sure that between diff --git a/net/dccp/input.c b/net/dccp/input.c index 6dce68a55964..bd9cfdb67436 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -715,6 +715,7 @@ EXPORT_SYMBOL_GPL(dccp_rcv_state_process); /** * dccp_sample_rtt - Validate and finalise computation of RTT sample + * @sk: socket structure * @delta: number of microseconds between packet and acknowledgment * * The routine is kept generic to work in different contexts. It should be diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index d19557c6d04b..9c28c8251125 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -694,6 +694,8 @@ EXPORT_SYMBOL_GPL(dccp_v4_do_rcv); /** * dccp_invalid_packet - check for malformed packets + * @skb: Packet to validate + * * Implements RFC 4340, 8.5: Step 1: Check header basics * Packets that fail these checks are ignored and do not receive Resets. */ @@ -911,10 +913,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv4_af_ops = { .getsockopt = ip_getsockopt, .addr2sockaddr = inet_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in), -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_ip_setsockopt, - .compat_getsockopt = compat_ip_getsockopt, -#endif }; static int dccp_v4_init_sock(struct sock *sk) @@ -961,10 +959,6 @@ static struct proto dccp_v4_prot = { .rsk_prot = &dccp_request_sock_ops, .twsk_prot = &dccp_timewait_sock_ops, .h.hashinfo = &dccp_hashinfo, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_dccp_setsockopt, - .compat_getsockopt = compat_dccp_getsockopt, -#endif }; static const struct net_protocol dccp_v4_protocol = { @@ -997,10 +991,6 @@ static const struct proto_ops inet_dccp_ops = { .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, -#endif }; static struct inet_protosw dccp_v4_protosw = { diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 650187d68851..ef4ab28cfde0 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -970,10 +970,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops = { .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_ipv6_setsockopt, - .compat_getsockopt = compat_ipv6_getsockopt, -#endif }; /* @@ -990,10 +986,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = { .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_ipv6_setsockopt, - .compat_getsockopt = compat_ipv6_getsockopt, -#endif }; /* NOTE: A lot of things set to zero explicitly by call to @@ -1049,10 +1041,6 @@ static struct proto dccp_v6_prot = { .rsk_prot = &dccp6_request_sock_ops, .twsk_prot = &dccp6_timewait_sock_ops, .h.hashinfo = &dccp_hashinfo, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_dccp_setsockopt, - .compat_getsockopt = compat_dccp_getsockopt, -#endif }; static const struct inet6_protocol dccp_v6_protocol = { @@ -1083,8 +1071,6 @@ static const struct proto_ops inet6_dccp_ops = { .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, #endif }; diff --git a/net/dccp/options.c b/net/dccp/options.c index 9fed0ae21e63..51aaba7a5d45 
100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -43,6 +43,7 @@ u64 dccp_decode_value_var(const u8 *bf, const u8 len) * dccp_parse_options - Parse DCCP options present in @skb * @sk: client|server|listening dccp socket (when @dreq != NULL) * @dreq: request socket to use during connection setup, or NULL + * @skb: frame to parse */ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, struct sk_buff *skb) @@ -471,6 +472,8 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) /** * dccp_insert_option_mandatory - Mandatory option (5.8.2) + * @skb: frame into which to insert option + * * Note that since we are using skb_push, this function needs to be called * _after_ inserting the option it is supposed to influence (stack order). */ @@ -486,6 +489,7 @@ int dccp_insert_option_mandatory(struct sk_buff *skb) /** * dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb + * @skb: frame to insert feature negotiation option into * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R * @feat: one out of %dccp_feature_numbers * @val: NN value or SP array (preferred element first) to copy diff --git a/net/dccp/proto.c b/net/dccp/proto.c index c13b6609474b..d148ab1530e5 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -375,6 +375,15 @@ int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) goto out; switch (cmd) { + case SIOCOUTQ: { + int amount = sk_wmem_alloc_get(sk); + /* Using sk_wmem_alloc here because sk_wmem_queued is not used by DCCP and + * always 0, comparably to UDP. + */ + + rc = put_user(amount, (int __user *)arg); + } + break; case SIOCINQ: { struct sk_buff *skb; unsigned long amount = 0; @@ -402,7 +411,7 @@ out: EXPORT_SYMBOL_GPL(dccp_ioctl); static int dccp_setsockopt_service(struct sock *sk, const __be32 service, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct dccp_sock *dp = dccp_sk(sk); struct dccp_service_list *sl = NULL; @@ -417,9 +426,8 @@ static int dccp_setsockopt_service(struct sock *sk, const __be32 service, return -ENOMEM; sl->dccpsl_nr = optlen / sizeof(u32) - 1; - if (copy_from_user(sl->dccpsl_list, - optval + sizeof(service), - optlen - sizeof(service)) || + if (copy_from_sockptr_offset(sl->dccpsl_list, optval, + sizeof(service), optlen - sizeof(service)) || dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) { kfree(sl); return -EFAULT; @@ -473,7 +481,7 @@ static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) } static int dccp_setsockopt_ccid(struct sock *sk, int type, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { u8 *val; int rc = 0; @@ -481,7 +489,7 @@ static int dccp_setsockopt_ccid(struct sock *sk, int type, if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) return -EINVAL; - val = memdup_user(optval, optlen); + val = memdup_sockptr(optval, optlen); if (IS_ERR(val)) return PTR_ERR(val); @@ -498,7 +506,7 @@ static int dccp_setsockopt_ccid(struct sock *sk, int type, } static int do_dccp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct dccp_sock *dp = dccp_sk(sk); int val, err = 0; @@ -520,7 +528,7 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, if (optlen < (int)sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; if (optname == DCCP_SOCKOPT_SERVICE) @@ -563,8 
+571,8 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, return err; } -int dccp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) +int dccp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen) { if (level != SOL_DCCP) return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level, @@ -575,19 +583,6 @@ int dccp_setsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL_GPL(dccp_setsockopt); -#ifdef CONFIG_COMPAT -int compat_dccp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) -{ - if (level != SOL_DCCP) - return inet_csk_compat_setsockopt(sk, level, optname, - optval, optlen); - return do_dccp_setsockopt(sk, level, optname, optval, optlen); -} - -EXPORT_SYMBOL_GPL(compat_dccp_setsockopt); -#endif - static int dccp_getsockopt_service(struct sock *sk, int len, __be32 __user *optval, int __user *optlen) @@ -696,19 +691,6 @@ int dccp_getsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL_GPL(dccp_getsockopt); -#ifdef CONFIG_COMPAT -int compat_dccp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - if (level != SOL_DCCP) - return inet_csk_compat_getsockopt(sk, level, optname, - optval, optlen); - return do_dccp_getsockopt(sk, level, optname, optval, optlen); -} - -EXPORT_SYMBOL_GPL(compat_dccp_getsockopt); -#endif - static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) { struct cmsghdr *cmsg; diff --git a/net/dccp/timer.c b/net/dccp/timer.c index c0b3672637c4..0e06dfc32273 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -216,6 +216,8 @@ out: /** * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface + * @data: Socket to act on + * * See the comments above %ccid_dequeueing_decision for supported modes. 
*/ static void dccp_write_xmitlet(unsigned long data) diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 0a46ea3bddd5..3b53d766789d 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -150,7 +150,8 @@ static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE]; static struct hlist_head dn_wild_sk; static atomic_long_t decnet_memory_allocated; -static int __dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen, int flags); +static int __dn_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen, int flags); static int __dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen, int flags); static struct hlist_head *dn_find_list(struct sock *sk) @@ -1320,7 +1321,8 @@ out: return err; } -static int dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) +static int dn_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int err; @@ -1338,7 +1340,8 @@ static int dn_setsockopt(struct socket *sock, int level, int optname, char __use return err; } -static int __dn_setsockopt(struct socket *sock, int level,int optname, char __user *optval, unsigned int optlen, int flags) +static int __dn_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen, int flags) { struct sock *sk = sock->sk; struct dn_scp *scp = DN_SK(sk); @@ -1354,13 +1357,13 @@ static int __dn_setsockopt(struct socket *sock, int level,int optname, char __us } u; int err; - if (optlen && !optval) + if (optlen && sockptr_is_null(optval)) return -EINVAL; if (optlen > sizeof(u)) return -EINVAL; - if (copy_from_user(&u, optval, optlen)) + if (copy_from_sockptr(&u, optval, optlen)) return -EFAULT; switch (optname) { @@ -2134,14 +2137,11 @@ static struct sock *dn_socket_get_next(struct seq_file *seq, struct dn_iter_state *state = seq->private; n = sk_next(n); -try_again: - if (n) - goto out; - if (++state->bucket >= DN_SK_HASH_SIZE) - goto out; - n = sk_head(&dn_sk_hash[state->bucket]); - goto try_again; -out: + while (!n) { + if (++state->bucket >= DN_SK_HASH_SIZE) + break; + n = sk_head(&dn_sk_hash[state->bucket]); + } return n; } diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 65abcf1b3210..15d42353f1a3 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -462,7 +462,9 @@ int dn_dev_ioctl(unsigned int cmd, void __user *arg) switch (cmd) { case SIOCGIFADDR: *((__le16 *)sdn->sdn_nodeaddr) = ifa->ifa_local; - goto rarok; + if (copy_to_user(arg, ifr, DN_IFREQ_SIZE)) + ret = -EFAULT; + break; case SIOCSIFADDR: if (!ifa) { @@ -485,10 +487,6 @@ done: rtnl_unlock(); return ret; -rarok: - if (copy_to_user(arg, ifr, DN_IFREQ_SIZE)) - ret = -EFAULT; - goto done; } struct net_device *dn_dev_get_default(void) diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 06b9983325cc..4cac31d22a50 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -494,6 +494,8 @@ static int dn_return_long(struct sk_buff *skb) /** * dn_route_rx_packet - Try and find a route for an incoming packet + * @net: The applicable net namespace + * @sk: Socket packet transmitted on * @skb: The packet to find a route for * * Returns: result of input function if route is found, error code otherwise @@ -670,7 +672,7 @@ int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type if (decnet_debug_level & 1) 
printk(KERN_DEBUG "dn_route_rcv: got 0x%02x from %s [%d %d %d]\n", - (int)flags, (dev) ? dev->name : "???", len, skb->len, + (int)flags, dev->name, len, skb->len, padlen); if (flags & DN_RT_PKT_CNTL) { diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index dc705769acc9..26a9193df783 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -6,7 +6,7 @@ * * DECnet Routing Message Grabulator * - * (C) 2000 ChyGwyn Limited - http://www.chygwyn.com/ + * (C) 2000 ChyGwyn Limited - https://www.chygwyn.com/ * * Author: Steven Whitehouse <steve@chygwyn.com> */ diff --git a/net/devres.c b/net/devres.c index 57a6a88d11f6..1f9be2133787 100644 --- a/net/devres.c +++ b/net/devres.c @@ -39,7 +39,7 @@ struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, } EXPORT_SYMBOL(devm_alloc_etherdev_mqs); -static void devm_netdev_release(struct device *dev, void *this) +static void devm_unregister_netdev(struct device *dev, void *this) { struct net_device_devres *res = this; @@ -77,7 +77,7 @@ int devm_register_netdev(struct device *dev, struct net_device *ndev) netdev_devres_match, ndev))) return -EINVAL; - dr = devres_alloc(devm_netdev_release, sizeof(*dr), GFP_KERNEL); + dr = devres_alloc(devm_unregister_netdev, sizeof(*dr), GFP_KERNEL); if (!dr) return -ENOMEM; diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index d5bc6ac599ef..1f9b9b11008c 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -86,6 +86,13 @@ config NET_DSA_TAG_KSZ Say Y if you want to enable support for tagging frames for the Microchip 8795/9477/9893 families of switches. +config NET_DSA_TAG_RTL4_A + tristate "Tag driver for Realtek 4 byte protocol A tags" + help + Say Y or M if you want to enable support for tagging frames for the + Realtek switches with 4 byte protocol A tags, such as found in + the Realtek RTL8366RB.
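For orientation before the tagger itself appears below: this option adds a driver for a 4-byte tag that Realtek inserts between the source MAC address and the original EtherType. A hypothetical struct view of that layout, inferred from the constants in tag_rtl4_a.c (the driver parses the fields by hand rather than through a struct):

/* Assumed on-wire shape of the Realtek protocol A tag; illustrative only */
struct rtl4_a_tag {
        __be16 etype;     /* always 0x8899 */
        __be16 prot_port; /* bits 15..12: protocol, 0xa for RTL8366RB;
                           * bits 7..0: source port number
                           */
};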
+ config NET_DSA_TAG_OCELOT tristate "Tag driver for Ocelot family of switches" select PACKING diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 108486cfdeef..4f47b2025ff5 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o obj-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o obj-$(CONFIG_NET_DSA_TAG_GSWIP) += tag_gswip.o obj-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o +obj-$(CONFIG_NET_DSA_TAG_RTL4_A) += tag_rtl4_a.o obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o obj-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o obj-$(CONFIG_NET_DSA_TAG_OCELOT) += tag_ocelot.o diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 076908fdd29b..c0ffc7a2b65f 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -261,10 +261,15 @@ static int dsa_port_setup(struct dsa_port *dp) struct devlink_port *dlp = &dp->devlink_port; bool dsa_port_link_registered = false; bool devlink_port_registered = false; + struct devlink_port_attrs attrs = {}; struct devlink *dl = ds->devlink; bool dsa_port_enabled = false; int err = 0; + attrs.phys.port_number = dp->index; + memcpy(attrs.switch_id.id, id, len); + attrs.switch_id.id_len = len; + if (dp->setup) return 0; @@ -274,8 +279,8 @@ static int dsa_port_setup(struct dsa_port *dp) break; case DSA_PORT_TYPE_CPU: memset(dlp, 0, sizeof(*dlp)); - devlink_port_attrs_set(dlp, DEVLINK_PORT_FLAVOUR_CPU, - dp->index, false, 0, id, len); + attrs.flavour = DEVLINK_PORT_FLAVOUR_CPU; + devlink_port_attrs_set(dlp, &attrs); err = devlink_port_register(dl, dlp, dp->index); if (err) break; @@ -294,8 +299,8 @@ static int dsa_port_setup(struct dsa_port *dp) break; case DSA_PORT_TYPE_DSA: memset(dlp, 0, sizeof(*dlp)); - devlink_port_attrs_set(dlp, DEVLINK_PORT_FLAVOUR_DSA, - dp->index, false, 0, id, len); + attrs.flavour = DEVLINK_PORT_FLAVOUR_DSA; + devlink_port_attrs_set(dlp, &attrs); err = devlink_port_register(dl, dlp, dp->index); if (err) break; @@ -314,8 +319,8 @@ static int dsa_port_setup(struct dsa_port *dp) break; case DSA_PORT_TYPE_USER: memset(dlp, 0, sizeof(*dlp)); - devlink_port_attrs_set(dlp, DEVLINK_PORT_FLAVOUR_PHYSICAL, - dp->index, false, 0, id, len); + attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; + devlink_port_attrs_set(dlp, &attrs); err = devlink_port_register(dl, dlp, dp->index); if (err) break; @@ -722,8 +727,12 @@ static int dsa_switch_parse_ports_of(struct dsa_switch *ds, ports = of_get_child_by_name(dn, "ports"); if (!ports) { - dev_err(ds->dev, "no ports child node found\n"); - return -EINVAL; + /* The second possibility is "ethernet-ports" */ + ports = of_get_child_by_name(dn, "ethernet-ports"); + if (!ports) { + dev_err(ds->dev, "no ports child node found\n"); + return -EINVAL; + } } for_each_available_child_of_node(ports, port) { diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index adecf73bd608..1653e3377cb3 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -77,7 +77,7 @@ struct dsa_slave_priv { struct sk_buff * (*xmit)(struct sk_buff *skb, struct net_device *dev); - struct pcpu_sw_netstats *stats64; + struct pcpu_sw_netstats __percpu *stats64; struct gro_cells gcells; diff --git a/net/dsa/master.c b/net/dsa/master.c index 480a61460c23..61615ebc70e9 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -186,17 +186,6 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, } } -static int dsa_master_get_phys_port_name(struct net_device *dev, - char *name, size_t len) -{ - struct dsa_port *cpu_dp = dev->dsa_ptr; - - if (snprintf(name, len, "p%d", cpu_dp->index) >= len) - return 
-EINVAL; - - return 0; -} - static int dsa_master_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct dsa_port *cpu_dp = dev->dsa_ptr; @@ -220,12 +209,16 @@ static int dsa_master_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) break; } - if (cpu_dp->orig_ndo_ops && cpu_dp->orig_ndo_ops->ndo_do_ioctl) - err = cpu_dp->orig_ndo_ops->ndo_do_ioctl(dev, ifr, cmd); + if (dev->netdev_ops->ndo_do_ioctl) + err = dev->netdev_ops->ndo_do_ioctl(dev, ifr, cmd); return err; } +static const struct dsa_netdevice_ops dsa_netdev_ops = { + .ndo_do_ioctl = dsa_master_ioctl, +}; + static int dsa_master_ethtool_setup(struct net_device *dev) { struct dsa_port *cpu_dp = dev->dsa_ptr; @@ -260,38 +253,10 @@ static void dsa_master_ethtool_teardown(struct net_device *dev) cpu_dp->orig_ethtool_ops = NULL; } -static int dsa_master_ndo_setup(struct net_device *dev) +static void dsa_netdev_ops_set(struct net_device *dev, + const struct dsa_netdevice_ops *ops) { - struct dsa_port *cpu_dp = dev->dsa_ptr; - struct dsa_switch *ds = cpu_dp->ds; - struct net_device_ops *ops; - - if (dev->netdev_ops->ndo_get_phys_port_name) - return 0; - - ops = devm_kzalloc(ds->dev, sizeof(*ops), GFP_KERNEL); - if (!ops) - return -ENOMEM; - - cpu_dp->orig_ndo_ops = dev->netdev_ops; - if (cpu_dp->orig_ndo_ops) - memcpy(ops, cpu_dp->orig_ndo_ops, sizeof(*ops)); - - ops->ndo_get_phys_port_name = dsa_master_get_phys_port_name; - ops->ndo_do_ioctl = dsa_master_ioctl; - - dev->netdev_ops = ops; - - return 0; -} - -static void dsa_master_ndo_teardown(struct net_device *dev) -{ - struct dsa_port *cpu_dp = dev->dsa_ptr; - - if (cpu_dp->orig_ndo_ops) - dev->netdev_ops = cpu_dp->orig_ndo_ops; - cpu_dp->orig_ndo_ops = NULL; + dev->dsa_ptr->netdev_ops = ops; } static ssize_t tagging_show(struct device *d, struct device_attribute *attr, @@ -353,9 +318,7 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) if (ret) return ret; - ret = dsa_master_ndo_setup(dev); - if (ret) - goto out_err_ethtool_teardown; + dsa_netdev_ops_set(dev, &dsa_netdev_ops); ret = sysfs_create_group(&dev->dev.kobj, &dsa_group); if (ret) @@ -364,8 +327,7 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) return ret; out_err_ndo_teardown: - dsa_master_ndo_teardown(dev); -out_err_ethtool_teardown: + dsa_netdev_ops_set(dev, NULL); dsa_master_ethtool_teardown(dev); return ret; } @@ -373,7 +335,7 @@ out_err_ethtool_teardown: void dsa_master_teardown(struct net_device *dev) { sysfs_remove_group(&dev->dev.kobj, &dsa_group); - dsa_master_ndo_teardown(dev); + dsa_netdev_ops_set(dev, NULL); dsa_master_ethtool_teardown(dev); dsa_master_reset_mtu(dev); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 4c7f086a047b..41d60eeefdbd 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1754,11 +1754,8 @@ int dsa_slave_create(struct dsa_port *port) eth_hw_addr_inherit(slave_dev, master); slave_dev->priv_flags |= IFF_NO_QUEUE; slave_dev->netdev_ops = &dsa_slave_netdev_ops; - slave_dev->min_mtu = 0; if (ds->ops->port_max_mtu) slave_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index); - else - slave_dev->max_mtu = ETH_MAX_MTU; SET_NETDEV_DEVTYPE(slave_dev, &dsa_type); netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one, @@ -1795,7 +1792,8 @@ int dsa_slave_create(struct dsa_port *port) ret = dsa_slave_phy_setup(slave_dev); if (ret) { - netdev_err(master, "error %d setting up slave phy\n", ret); + netdev_err(master, "error %d setting up slave PHY for %s\n", + ret, slave_dev->name); goto out_gcells; } diff --git 
a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 90d055c4df9e..bd1a3158d79a 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -156,8 +156,9 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb, { struct dsa_port *dp = dsa_slave_to_port(dev); struct sk_buff *nskb; - u16 *tag; + __be16 *tag; u8 *addr; + u16 val; nskb = ksz_common_xmit(skb, dev, KSZ9477_INGRESS_TAG_LEN); if (!nskb) @@ -167,12 +168,12 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb, tag = skb_put(nskb, KSZ9477_INGRESS_TAG_LEN); addr = skb_mac_header(nskb); - *tag = BIT(dp->index); + val = BIT(dp->index); if (is_link_local_ether_addr(addr)) - *tag |= KSZ9477_TAIL_TAG_OVERRIDE; + val |= KSZ9477_TAIL_TAG_OVERRIDE; - *tag = cpu_to_be16(*tag); + *tag = cpu_to_be16(val); return nskb; } diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index eb0e7a32e53d..ccfb6f641bbf 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -55,7 +55,8 @@ static int lan9303_xmit_use_arl(struct dsa_port *dp, u8 *dest_addr) static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); - u16 *lan9303_tag; + __be16 *lan9303_tag; + u16 tag; /* insert a special VLAN tag between the MAC addresses * and the current ethertype field. @@ -72,12 +73,12 @@ static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) /* make room between MACs and Ether-Type */ memmove(skb->data, skb->data + LAN9303_TAG_LEN, 2 * ETH_ALEN); - lan9303_tag = (u16 *)(skb->data + 2 * ETH_ALEN); + lan9303_tag = (__be16 *)(skb->data + 2 * ETH_ALEN); + tag = lan9303_xmit_use_arl(dp, skb->data) ? + LAN9303_TAG_TX_USE_ALR : + dp->index | LAN9303_TAG_TX_STP_OVERRIDE; lan9303_tag[0] = htons(ETH_P_8021Q); - lan9303_tag[1] = lan9303_xmit_use_arl(dp, skb->data) ? 
- LAN9303_TAG_TX_USE_ALR : - dp->index | LAN9303_TAG_TX_STP_OVERRIDE; - lan9303_tag[1] = htons(lan9303_tag[1]); + lan9303_tag[1] = htons(tag); return skb; } @@ -85,7 +86,7 @@ static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - u16 *lan9303_tag; + __be16 *lan9303_tag; u16 lan9303_tag1; unsigned int source_port; @@ -101,7 +102,7 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, * ^ * ->data */ - lan9303_tag = (u16 *)(skb->data - 2); + lan9303_tag = (__be16 *)(skb->data - 2); if (lan9303_tag[0] != htons(ETH_P_8021Q)) { dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid VLAN marker\n"); diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index d6619edd53e5..f602fc758d68 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -67,8 +67,9 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { + u16 hdr; int port; - __be16 *phdr, hdr; + __be16 *phdr; unsigned char *dest = eth_hdr(skb)->h_dest; bool is_multicast_skb = is_multicast_ether_addr(dest) && !is_broadcast_ether_addr(dest); diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index b0c98ee4e13b..42f327c06dca 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -137,11 +137,10 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb, struct net_device *netdev) { struct dsa_port *dp = dsa_slave_to_port(netdev); - u64 bypass, dest, src, qos_class, rew_op; struct dsa_switch *ds = dp->ds; - int port = dp->index; struct ocelot *ocelot = ds->priv; - struct ocelot_port *ocelot_port = ocelot->ports[port]; + struct ocelot_port *ocelot_port; + u64 qos_class, rew_op; u8 *injection; if (unlikely(skb_cow_head(skb, OCELOT_TAG_LEN) < 0)) { @@ -149,19 +148,15 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb, return NULL; } - injection = skb_push(skb, OCELOT_TAG_LEN); + ocelot_port = ocelot->ports[dp->index]; - memset(injection, 0, OCELOT_TAG_LEN); + injection = skb_push(skb, OCELOT_TAG_LEN); - /* Set the source port as the CPU port module and not the NPI port */ - src = ocelot->num_phys_ports; - dest = BIT(port); - bypass = true; + memcpy(injection, ocelot_port->xmit_template, OCELOT_TAG_LEN); + /* Fix up the fields which are not statically determined + * in the template + */ qos_class = skb->priority; - - packing(injection, &bypass, 127, 127, OCELOT_TAG_LEN, PACK, 0); - packing(injection, &dest, 68, 56, OCELOT_TAG_LEN, PACK, 0); - packing(injection, &src, 46, 43, OCELOT_TAG_LEN, PACK, 0); packing(injection, &qos_class, 19, 17, OCELOT_TAG_LEN, PACK, 0); if (ocelot->ptp && (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) { diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 70db7c909f74..7066f5e697d7 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -31,7 +31,8 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); - u16 *phdr, hdr; + __be16 *phdr; + u16 hdr; if (skb_cow_head(skb, QCA_HDR_LEN) < 0) return NULL; @@ -39,7 +40,7 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) skb_push(skb, QCA_HDR_LEN); memmove(skb->data, skb->data + QCA_HDR_LEN, 2 * ETH_ALEN); - phdr = (u16 *)(skb->data + 2 * ETH_ALEN); + phdr = (__be16 *)(skb->data + 2 * ETH_ALEN); /* Set the version field, and set destination port information */ hdr = 
QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S | @@ -54,8 +55,9 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { u8 ver; + u16 hdr; int port; - __be16 *phdr, hdr; + __be16 *phdr; if (unlikely(!pskb_may_pull(skb, QCA_HDR_LEN))) return NULL; diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c new file mode 100644 index 000000000000..7b63010fa87b --- /dev/null +++ b/net/dsa/tag_rtl4_a.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Handler for Realtek 4 byte DSA switch tags + * Currently only supports protocol "A" found in RTL8366RB + * Copyright (c) 2020 Linus Walleij <linus.walleij@linaro.org> + * + * This "proprietary tag" header looks like so: + * + * ------------------------------------------------- + * | MAC DA | MAC SA | 0x8899 | 2 bytes tag | Type | + * ------------------------------------------------- + * + * The 2 byte tag forms a 16 bit big endian word. The exact + * meaning has been guessed from packet dumps from ingress + * frames; as no working egress traffic has been available, + * we do not know the format of the egress tags or if they + * are even supported. + */ + +#include <linux/etherdevice.h> +#include <linux/bits.h> + +#include "dsa_priv.h" + +#define RTL4_A_HDR_LEN 4 +#define RTL4_A_ETHERTYPE 0x8899 +#define RTL4_A_PROTOCOL_SHIFT 12 +/* + * 0x1 = Realtek Remote Control protocol (RRCP) + * 0x2/0x3 seem to be used for loopback testing + * 0x9 = RTL8306 DSA protocol + * 0xa = RTL8366RB DSA protocol + */ +#define RTL4_A_PROTOCOL_RTL8366RB 0xa + +static struct sk_buff *rtl4a_tag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + /* + * Just let it pass thru; we don't know if it is possible + * to tag a frame with the 0x8899 ethertype and direct it + * to a specific port, as all attempts at reverse-engineering have + * ended up with the frames getting dropped. + * + * The VLAN set-up needs to restrict the frames to the right port. + * + * If you have documentation on the tagging format for RTL8366RB + * (tag type A) then please contribute. + */ + return skb; +} + +static struct sk_buff *rtl4a_tag_rcv(struct sk_buff *skb, + struct net_device *dev, + struct packet_type *pt) +{ + u16 protport; + __be16 *p; + u16 etype; + u8 *tag; + u8 prot; + u8 port; + + if (unlikely(!pskb_may_pull(skb, RTL4_A_HDR_LEN))) + return NULL; + + /* The RTL4 header has its own custom Ethertype 0x8899 and that + * starts right at the beginning of the packet, after the src + * ethernet addr. Apparently skb->data always points 2 bytes in, + * behind the Ethertype.
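The tag_ksz, tag_lan9303, tag_mtk and tag_qca hunks above all make the same sparse-driven endianness cleanup: keep the arithmetic in a host-order u16 and perform a single typed store through a __be16 pointer. A minimal sketch of the resulting pattern (the function name and the tag offset are illustrative, not from the patch):

#include <linux/bits.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>

/* Sketch of the cleanup above: host-order math, one big-endian store */
static void example_tag_store(struct sk_buff *skb, int port_index)
{
        __be16 *tag = (__be16 *)(skb->data + 2 * ETH_ALEN);
        u16 val;

        val = BIT(port_index);   /* compute in CPU byte order */
        *tag = cpu_to_be16(val); /* convert exactly once, well typed */
}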
+ */ + tag = skb->data - 2; + p = (__be16 *)tag; + etype = ntohs(*p); + if (etype != RTL4_A_ETHERTYPE) { + /* Not custom, just pass through */ + netdev_dbg(dev, "non-realtek ethertype 0x%04x\n", etype); + return skb; + } + p = (__be16 *)(tag + 2); + protport = ntohs(*p); + /* The 4 upper bits are the protocol */ + prot = (protport >> RTL4_A_PROTOCOL_SHIFT) & 0x0f; + if (prot != RTL4_A_PROTOCOL_RTL8366RB) { + netdev_err(dev, "unknown realtek protocol 0x%01x\n", prot); + return NULL; + } + port = protport & 0xff; + + skb->dev = dsa_master_find_slave(dev, 0, port); + if (!skb->dev) { + netdev_dbg(dev, "could not find slave for port %d\n", port); + return NULL; + } + + /* Remove RTL4 tag and recalculate checksum */ + skb_pull_rcsum(skb, RTL4_A_HDR_LEN); + + /* Move ethernet DA and SA in front of the data */ + memmove(skb->data - ETH_HLEN, + skb->data - ETH_HLEN - RTL4_A_HDR_LEN, + 2 * ETH_ALEN); + + skb->offload_fwd_mark = 1; + + return skb; +} + +static int rtl4a_tag_flow_dissect(const struct sk_buff *skb, __be16 *proto, + int *offset) +{ + *offset = RTL4_A_HDR_LEN; + /* Skip past the tag and fetch the encapsulated Ethertype */ + *proto = ((__be16 *)skb->data)[1]; + + return 0; +} + +static const struct dsa_device_ops rtl4a_netdev_ops = { + .name = "rtl4a", + .proto = DSA_TAG_PROTO_RTL4_A, + .xmit = rtl4a_tag_xmit, + .rcv = rtl4a_tag_rcv, + .flow_dissect = rtl4a_tag_flow_dissect, + .overhead = RTL4_A_HDR_LEN, +}; +module_dsa_tag_driver(rtl4a_netdev_ops); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RTL4_A); diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 0c2b94f20499..7a849ff22dad 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -6,4 +6,5 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ - channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o + channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ + tunnels.o diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index 7194956aa09e..888f6e101f34 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -58,6 +58,7 @@ int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[ETHTOOL_A_CABLE_TEST_MAX + 1]; struct ethnl_req_info req_info = {}; + const struct ethtool_phy_ops *ops; struct net_device *dev; int ret; @@ -81,11 +82,17 @@ int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info) } rtnl_lock(); + ops = ethtool_phy_ops; + if (!ops || !ops->start_cable_test) { + ret = -EOPNOTSUPP; + goto out_rtnl; + } + ret = ethnl_ops_begin(dev); if (ret < 0) goto out_rtnl; - ret = phy_start_cable_test(dev->phydev, info->extack); + ret = ops->start_cable_test(dev->phydev, info->extack); ethnl_ops_complete(dev); @@ -308,6 +315,7 @@ int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1]; struct ethnl_req_info req_info = {}; + const struct ethtool_phy_ops *ops; struct phy_tdr_config cfg; struct net_device *dev; int ret; @@ -337,11 +345,17 @@ int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) goto out_dev_put; rtnl_lock(); + ops = ethtool_phy_ops; + if (!ops || !ops->start_cable_test_tdr) { + ret = -EOPNOTSUPP; + goto out_rtnl; + } + ret = ethnl_ops_begin(dev); if (ret < 0) goto out_rtnl; - ret = phy_start_cable_test_tdr(dev->phydev, info->extack, &cfg); + ret = ops->start_cable_test_tdr(dev->phydev, info->extack, 
&cfg); ethnl_ops_complete(dev); diff --git a/net/ethtool/common.c b/net/ethtool/common.c index aaecfc916a4d..ed19573fccd7 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-only +#include <linux/ethtool_netlink.h> #include <linux/net_tstamp.h> #include <linux/phy.h> +#include <linux/rtnetlink.h> #include "common.h" @@ -175,6 +177,21 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { __DEFINE_LINK_MODE_NAME(400000, DR8, Full), __DEFINE_LINK_MODE_NAME(400000, CR8, Full), __DEFINE_SPECIAL_MODE_NAME(FEC_LLRS, "LLRS"), + __DEFINE_LINK_MODE_NAME(100000, KR, Full), + __DEFINE_LINK_MODE_NAME(100000, SR, Full), + __DEFINE_LINK_MODE_NAME(100000, LR_ER_FR, Full), + __DEFINE_LINK_MODE_NAME(100000, DR, Full), + __DEFINE_LINK_MODE_NAME(100000, CR, Full), + __DEFINE_LINK_MODE_NAME(200000, KR2, Full), + __DEFINE_LINK_MODE_NAME(200000, SR2, Full), + __DEFINE_LINK_MODE_NAME(200000, LR2_ER2_FR2, Full), + __DEFINE_LINK_MODE_NAME(200000, DR2, Full), + __DEFINE_LINK_MODE_NAME(200000, CR2, Full), + __DEFINE_LINK_MODE_NAME(400000, KR4, Full), + __DEFINE_LINK_MODE_NAME(400000, SR4, Full), + __DEFINE_LINK_MODE_NAME(400000, LR4_ER4_FR4, Full), + __DEFINE_LINK_MODE_NAME(400000, DR4, Full), + __DEFINE_LINK_MODE_NAME(400000, CR4, Full), }; static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); @@ -256,6 +273,14 @@ const char ts_rx_filter_names[][ETH_GSTRING_LEN] = { }; static_assert(ARRAY_SIZE(ts_rx_filter_names) == __HWTSTAMP_FILTER_CNT); +const char udp_tunnel_type_names[][ETH_GSTRING_LEN] = { + [ETHTOOL_UDP_TUNNEL_TYPE_VXLAN] = "vxlan", + [ETHTOOL_UDP_TUNNEL_TYPE_GENEVE] = "geneve", + [ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE] = "vxlan-gpe", +}; +static_assert(ARRAY_SIZE(udp_tunnel_type_names) == + __ETHTOOL_UDP_TUNNEL_TYPE_CNT); + /* return false if legacy contained non-0 deprecated fields * maxtxpkt/maxrxpkt. 
rest of ksettings always updated */ @@ -373,3 +398,13 @@ int __ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info) return 0; } + +const struct ethtool_phy_ops *ethtool_phy_ops; + +void ethtool_set_ethtool_phy_ops(const struct ethtool_phy_ops *ops) +{ + rtnl_lock(); + ethtool_phy_ops = ops; + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(ethtool_set_ethtool_phy_ops); diff --git a/net/ethtool/common.h b/net/ethtool/common.h index a62f68ccc43a..3d9251c95a8b 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -28,6 +28,7 @@ extern const char wol_mode_names[][ETH_GSTRING_LEN]; extern const char sof_timestamping_names[][ETH_GSTRING_LEN]; extern const char ts_tx_type_names[][ETH_GSTRING_LEN]; extern const char ts_rx_filter_names[][ETH_GSTRING_LEN]; +extern const char udp_tunnel_type_names[][ETH_GSTRING_LEN]; int __ethtool_get_link(struct net_device *dev); @@ -37,4 +38,6 @@ bool convert_legacy_settings_to_link_ksettings( int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max); int __ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info); +extern const struct ethtool_phy_ops *ethtool_phy_ops; + #endif /* _ETHTOOL_COMMON_H */ diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 21d5fc0f6bb3..441794e0034f 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -135,6 +135,7 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr) static int __ethtool_get_sset_count(struct net_device *dev, int sset) { + const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops; const struct ethtool_ops *ops = dev->ethtool_ops; if (sset == ETH_SS_FEATURES) @@ -150,8 +151,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset) return ARRAY_SIZE(phy_tunable_strings); if (sset == ETH_SS_PHY_STATS && dev->phydev && - !ops->get_ethtool_phy_stats) - return phy_ethtool_get_sset_count(dev->phydev); + !ops->get_ethtool_phy_stats && + phy_ops && phy_ops->get_sset_count) + return phy_ops->get_sset_count(dev->phydev); if (sset == ETH_SS_LINK_MODES) return __ETHTOOL_LINK_MODE_MASK_NBITS; @@ -165,6 +167,7 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset) static void __ethtool_get_strings(struct net_device *dev, u32 stringset, u8 *data) { + const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops; const struct ethtool_ops *ops = dev->ethtool_ops; if (stringset == ETH_SS_FEATURES) @@ -178,8 +181,9 @@ static void __ethtool_get_strings(struct net_device *dev, else if (stringset == ETH_SS_PHY_TUNABLES) memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings)); else if (stringset == ETH_SS_PHY_STATS && dev->phydev && - !ops->get_ethtool_phy_stats) - phy_ethtool_get_strings(dev->phydev, data); + !ops->get_ethtool_phy_stats && phy_ops && + phy_ops->get_strings) + phy_ops->get_strings(dev->phydev, data); else if (stringset == ETH_SS_LINK_MODES) memcpy(data, link_mode_names, __ETHTOOL_LINK_MODE_MASK_NBITS * ETH_GSTRING_LEN); @@ -1918,7 +1922,7 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) if (copy_to_user(useraddr, &stats, sizeof(stats))) goto out; useraddr += sizeof(stats); - if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64))) + if (n_stats && copy_to_user(useraddr, data, array_size(n_stats, sizeof(u64)))) goto out; ret = 0; @@ -1929,6 +1933,7 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) { + const struct ethtool_phy_ops *phy_ops = 
ethtool_phy_ops; const struct ethtool_ops *ops = dev->ethtool_ops; struct phy_device *phydev = dev->phydev; struct ethtool_stats stats; @@ -1938,8 +1943,9 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) if (!phydev && (!ops->get_ethtool_phy_stats || !ops->get_sset_count)) return -EOPNOTSUPP; - if (dev->phydev && !ops->get_ethtool_phy_stats) - n_stats = phy_ethtool_get_sset_count(dev->phydev); + if (dev->phydev && !ops->get_ethtool_phy_stats && + phy_ops && phy_ops->get_sset_count) + n_stats = phy_ops->get_sset_count(dev->phydev); else n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS); if (n_stats < 0) @@ -1958,8 +1964,9 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) if (!data) return -ENOMEM; - if (dev->phydev && !ops->get_ethtool_phy_stats) { - ret = phy_ethtool_get_stats(dev->phydev, &stats, data); + if (dev->phydev && !ops->get_ethtool_phy_stats && + phy_ops && phy_ops->get_stats) { + ret = phy_ops->get_stats(dev->phydev, &stats, data); if (ret < 0) goto out; } else { @@ -1973,7 +1980,7 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) if (copy_to_user(useraddr, &stats, sizeof(stats))) goto out; useraddr += sizeof(stats); - if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64))) + if (n_stats && copy_to_user(useraddr, data, array_size(n_stats, sizeof(u64)))) goto out; ret = 0; diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index fd4f3e58c6f6..7044a2853886 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -257,6 +257,21 @@ static const struct link_mode_info link_mode_params[] = { __DEFINE_LINK_MODE_PARAMS(400000, DR8, Full), __DEFINE_LINK_MODE_PARAMS(400000, CR8, Full), __DEFINE_SPECIAL_MODE_PARAMS(FEC_LLRS), + __DEFINE_LINK_MODE_PARAMS(100000, KR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, LR_ER_FR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, DR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, SR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, LR2_ER2_FR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, DR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, CR2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, LR4_ER4_FR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, DR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, CR4, Full), }; static const struct nla_policy @@ -406,8 +421,7 @@ int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info) ret = __ethtool_get_link_ksettings(dev, &ksettings); if (ret < 0) { - if (info) - GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); + GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); goto out_ops; } diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c index afe5ac8a0f00..4834091ec24c 100644 --- a/net/ethtool/linkstate.c +++ b/net/ethtool/linkstate.c @@ -9,10 +9,12 @@ struct linkstate_req_info { }; struct linkstate_reply_data { - struct ethnl_reply_data base; - int link; - int sqi; - int sqi_max; + struct ethnl_reply_data base; + int link; + int sqi; + int sqi_max; + bool link_ext_state_provided; + struct ethtool_link_ext_state_info ethtool_link_ext_state_info; }; #define LINKSTATE_REPDATA(__reply_base) \ @@ -25,6 +27,8 @@ linkstate_get_policy[ETHTOOL_A_LINKSTATE_MAX + 1] = { [ETHTOOL_A_LINKSTATE_LINK] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKSTATE_SQI] = { 
.type = NLA_REJECT }, [ETHTOOL_A_LINKSTATE_SQI_MAX] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKSTATE_EXT_STATE] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKSTATE_EXT_SUBSTATE] = { .type = NLA_REJECT }, }; static int linkstate_get_sqi(struct net_device *dev) @@ -61,6 +65,23 @@ static int linkstate_get_sqi_max(struct net_device *dev) mutex_unlock(&phydev->lock); return ret; +}; + +static int linkstate_get_link_ext_state(struct net_device *dev, + struct linkstate_reply_data *data) +{ + int err; + + if (!dev->ethtool_ops->get_link_ext_state) + return -EOPNOTSUPP; + + err = dev->ethtool_ops->get_link_ext_state(dev, &data->ethtool_link_ext_state_info); + if (err) + return err; + + data->link_ext_state_provided = true; + + return 0; } static int linkstate_prepare_data(const struct ethnl_req_info *req_base, @@ -86,6 +107,12 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, goto out; data->sqi_max = ret; + if (dev->flags & IFF_UP) { + ret = linkstate_get_link_ext_state(dev, data); + if (ret < 0 && ret != -EOPNOTSUPP && ret != -ENODATA) + goto out; + } + ret = 0; out: ethnl_ops_complete(dev); @@ -107,6 +134,12 @@ static int linkstate_reply_size(const struct ethnl_req_info *req_base, if (data->sqi_max != -EOPNOTSUPP) len += nla_total_size(sizeof(u32)); + if (data->link_ext_state_provided) + len += nla_total_size(sizeof(u8)); /* LINKSTATE_EXT_STATE */ + + if (data->ethtool_link_ext_state_info.__link_ext_substate) + len += nla_total_size(sizeof(u8)); /* LINKSTATE_EXT_SUBSTATE */ + return len; } @@ -128,6 +161,17 @@ static int linkstate_fill_reply(struct sk_buff *skb, nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX, data->sqi_max)) return -EMSGSIZE; + if (data->link_ext_state_provided) { + if (nla_put_u8(skb, ETHTOOL_A_LINKSTATE_EXT_STATE, + data->ethtool_link_ext_state_info.link_ext_state)) + return -EMSGSIZE; + + if (data->ethtool_link_ext_state_info.__link_ext_substate && + nla_put_u8(skb, ETHTOOL_A_LINKSTATE_EXT_SUBSTATE, + data->ethtool_link_ext_state_info.__link_ext_substate)) + return -EMSGSIZE; + } + return 0; } diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index dd8a1c1dc07d..5c2072765be7 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -181,6 +181,12 @@ err: return NULL; } +void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd) +{ + return genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &ethtool_genl_family, 0, cmd); +} + void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd) { return genlmsg_put(skb, 0, ++ethnl_bcast_seq, &ethtool_genl_family, 0, @@ -848,6 +854,12 @@ static const struct genl_ops ethtool_genl_ops[] = { .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_act_cable_test_tdr, }, + { + .cmd = ETHTOOL_MSG_TUNNEL_INFO_GET, + .doit = ethnl_tunnel_info_doit, + .start = ethnl_tunnel_info_start, + .dumpit = ethnl_tunnel_info_dumpit, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 9a96b6e90dc2..e2085005caac 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -19,6 +19,7 @@ int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev, struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd, u16 hdr_attrtype, struct genl_info *info, void **ehdrp); +void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd); void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd); int ethnl_multicast(struct sk_buff *skb, struct net_device *dev); @@ -361,5 +362,8 @@ int
ethnl_set_pause(struct sk_buff *skb, struct genl_info *info); int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info); +int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info); +int ethnl_tunnel_info_start(struct netlink_callback *cb); +int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); #endif /* _NET_ETHTOOL_NETLINK_H */ diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index 0eed4e4909ab..82707b662fe4 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -75,6 +75,11 @@ static const struct strset_info info_template[] = { .count = __HWTSTAMP_FILTER_CNT, .strings = ts_rx_filter_names, }, + [ETH_SS_UDP_TUNNEL_TYPES] = { + .per_dev = false, + .count = __ETHTOOL_UDP_TUNNEL_TYPE_CNT, + .strings = udp_tunnel_type_names, + }, }; struct strset_req_info { @@ -209,13 +214,15 @@ static void strset_cleanup_data(struct ethnl_reply_data *reply_base) static int strset_prepare_set(struct strset_info *info, struct net_device *dev, unsigned int id, bool counts_only) { + const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops; const struct ethtool_ops *ops = dev->ethtool_ops; void *strings; int count, ret; if (id == ETH_SS_PHY_STATS && dev->phydev && - !ops->get_ethtool_phy_stats) - ret = phy_ethtool_get_sset_count(dev->phydev); + !ops->get_ethtool_phy_stats && phy_ops && + phy_ops->get_sset_count) + ret = phy_ops->get_sset_count(dev->phydev); else if (ops->get_sset_count && ops->get_strings) ret = ops->get_sset_count(dev, id); else @@ -231,8 +238,9 @@ static int strset_prepare_set(struct strset_info *info, struct net_device *dev, if (!strings) return -ENOMEM; if (id == ETH_SS_PHY_STATS && dev->phydev && - !ops->get_ethtool_phy_stats) - phy_ethtool_get_strings(dev->phydev, strings); + !ops->get_ethtool_phy_stats && phy_ops && + phy_ops->get_strings) + phy_ops->get_strings(dev->phydev, strings); else ops->get_strings(dev, id, strings); info->strings = strings; diff --git a/net/ethtool/tunnels.c b/net/ethtool/tunnels.c new file mode 100644 index 000000000000..84f23289475b --- /dev/null +++ b/net/ethtool/tunnels.c @@ -0,0 +1,312 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/ethtool_netlink.h> +#include <net/udp_tunnel.h> +#include <net/vxlan.h> + +#include "bitset.h" +#include "common.h" +#include "netlink.h" + +static const struct nla_policy +ethtool_tunnel_info_policy[ETHTOOL_A_TUNNEL_INFO_MAX + 1] = { + [ETHTOOL_A_TUNNEL_INFO_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_TUNNEL_INFO_HEADER] = { .type = NLA_NESTED }, +}; + +static_assert(ETHTOOL_UDP_TUNNEL_TYPE_VXLAN == ilog2(UDP_TUNNEL_TYPE_VXLAN)); +static_assert(ETHTOOL_UDP_TUNNEL_TYPE_GENEVE == ilog2(UDP_TUNNEL_TYPE_GENEVE)); +static_assert(ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE == + ilog2(UDP_TUNNEL_TYPE_VXLAN_GPE)); + +static ssize_t ethnl_udp_table_reply_size(unsigned int types, bool compact) +{ + ssize_t size; + + size = ethnl_bitset32_size(&types, NULL, __ETHTOOL_UDP_TUNNEL_TYPE_CNT, + udp_tunnel_type_names, compact); + if (size < 0) + return size; + + return size + + nla_total_size(0) + /* _UDP_TABLE */ + nla_total_size(sizeof(u32)); /* _UDP_TABLE_SIZE */ +} + +static ssize_t +ethnl_tunnel_info_reply_size(const struct ethnl_req_info *req_base, + struct netlink_ext_ack *extack) +{ + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct udp_tunnel_nic_info *info; + unsigned int i; + ssize_t ret; + 
size_t size; + + info = req_base->dev->udp_tunnel_nic_info; + if (!info) { + NL_SET_ERR_MSG(extack, + "device does not report tunnel offload info"); + return -EOPNOTSUPP; + } + + size = nla_total_size(0); /* _INFO_UDP_PORTS */ + + for (i = 0; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { + if (!info->tables[i].n_entries) + break; + + ret = ethnl_udp_table_reply_size(info->tables[i].tunnel_types, + compact); + if (ret < 0) + return ret; + size += ret; + + size += udp_tunnel_nic_dump_size(req_base->dev, i); + } + + if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN) { + ret = ethnl_udp_table_reply_size(0, compact); + if (ret < 0) + return ret; + size += ret; + + size += nla_total_size(0) + /* _TABLE_ENTRY */ + nla_total_size(sizeof(__be16)) + /* _ENTRY_PORT */ + nla_total_size(sizeof(u32)); /* _ENTRY_TYPE */ + } + + return size; +} + +static int +ethnl_tunnel_info_fill_reply(const struct ethnl_req_info *req_base, + struct sk_buff *skb) +{ + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct udp_tunnel_nic_info *info; + struct nlattr *ports, *table, *entry; + unsigned int i; + + info = req_base->dev->udp_tunnel_nic_info; + if (!info) + return -EOPNOTSUPP; + + ports = nla_nest_start(skb, ETHTOOL_A_TUNNEL_INFO_UDP_PORTS); + if (!ports) + return -EMSGSIZE; + + for (i = 0; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { + if (!info->tables[i].n_entries) + break; + + table = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE); + if (!table) + goto err_cancel_ports; + + if (nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, + info->tables[i].n_entries)) + goto err_cancel_table; + + if (ethnl_put_bitset32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES, + &info->tables[i].tunnel_types, NULL, + __ETHTOOL_UDP_TUNNEL_TYPE_CNT, + udp_tunnel_type_names, compact)) + goto err_cancel_table; + + if (udp_tunnel_nic_dump_write(req_base->dev, i, skb)) + goto err_cancel_table; + + nla_nest_end(skb, table); + } + + if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN) { + u32 zero = 0; + + table = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE); + if (!table) + goto err_cancel_ports; + + if (nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, 1)) + goto err_cancel_table; + + if (ethnl_put_bitset32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES, + &zero, NULL, + __ETHTOOL_UDP_TUNNEL_TYPE_CNT, + udp_tunnel_type_names, compact)) + goto err_cancel_table; + + entry = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY); + + if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, + htons(IANA_VXLAN_UDP_PORT)) || + nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, + ilog2(UDP_TUNNEL_TYPE_VXLAN))) + goto err_cancel_entry; + + nla_nest_end(skb, entry); + nla_nest_end(skb, table); + } + + nla_nest_end(skb, ports); + + return 0; + +err_cancel_entry: + nla_nest_cancel(skb, entry); +err_cancel_table: + nla_nest_cancel(skb, table); +err_cancel_ports: + nla_nest_cancel(skb, ports); + return -EMSGSIZE; +} + +static int +ethnl_tunnel_info_req_parse(struct ethnl_req_info *req_info, + const struct nlmsghdr *nlhdr, struct net *net, + struct netlink_ext_ack *extack, bool require_dev) +{ + struct nlattr *tb[ETHTOOL_A_TUNNEL_INFO_MAX + 1]; + int ret; + + ret = nlmsg_parse(nlhdr, GENL_HDRLEN, tb, ETHTOOL_A_TUNNEL_INFO_MAX, + ethtool_tunnel_info_policy, extack); + if (ret < 0) + return ret; + + return ethnl_parse_header_dev_get(req_info, + tb[ETHTOOL_A_TUNNEL_INFO_HEADER], + net, extack, require_dev); +} + +int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct ethnl_req_info req_info = {}; + struct sk_buff *rskb; + 
void *reply_payload; + int reply_len; + int ret; + + ret = ethnl_tunnel_info_req_parse(&req_info, info->nlhdr, + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + + rtnl_lock(); + ret = ethnl_tunnel_info_reply_size(&req_info, info->extack); + if (ret < 0) + goto err_unlock_rtnl; + reply_len = ret + ethnl_reply_header_size(); + + rskb = ethnl_reply_init(reply_len, req_info.dev, + ETHTOOL_MSG_TUNNEL_INFO_GET, + ETHTOOL_A_TUNNEL_INFO_HEADER, + info, &reply_payload); + if (!rskb) { + ret = -ENOMEM; + goto err_unlock_rtnl; + } + + ret = ethnl_tunnel_info_fill_reply(&req_info, rskb); + if (ret) + goto err_free_msg; + rtnl_unlock(); + dev_put(req_info.dev); + genlmsg_end(rskb, reply_payload); + + return genlmsg_reply(rskb, info); + +err_free_msg: + nlmsg_free(rskb); +err_unlock_rtnl: + rtnl_unlock(); + dev_put(req_info.dev); + return ret; +} + +struct ethnl_tunnel_info_dump_ctx { + struct ethnl_req_info req_info; + int pos_hash; + int pos_idx; +}; + +int ethnl_tunnel_info_start(struct netlink_callback *cb) +{ + struct ethnl_tunnel_info_dump_ctx *ctx = (void *)cb->ctx; + int ret; + + BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); + + memset(ctx, 0, sizeof(*ctx)); + + ret = ethnl_tunnel_info_req_parse(&ctx->req_info, cb->nlh, + sock_net(cb->skb->sk), cb->extack, + false); + if (ctx->req_info.dev) { + dev_put(ctx->req_info.dev); + ctx->req_info.dev = NULL; + } + + return ret; +} + +int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ethnl_tunnel_info_dump_ctx *ctx = (void *)cb->ctx; + struct net *net = sock_net(skb->sk); + int s_idx = ctx->pos_idx; + int h, idx = 0; + int ret = 0; + void *ehdr; + + rtnl_lock(); + cb->seq = net->dev_base_seq; + for (h = ctx->pos_hash; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + struct hlist_head *head; + struct net_device *dev; + + head = &net->dev_index_head[h]; + idx = 0; + hlist_for_each_entry(dev, head, index_hlist) { + if (idx < s_idx) + goto cont; + + ehdr = ethnl_dump_put(skb, cb, + ETHTOOL_MSG_TUNNEL_INFO_GET); + if (!ehdr) { + ret = -EMSGSIZE; + goto out; + } + + ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_TUNNEL_INFO_HEADER); + if (ret < 0) { + genlmsg_cancel(skb, ehdr); + goto out; + } + + ctx->req_info.dev = dev; + ret = ethnl_tunnel_info_fill_reply(&ctx->req_info, skb); + ctx->req_info.dev = NULL; + if (ret < 0) { + genlmsg_cancel(skb, ehdr); + if (ret == -EOPNOTSUPP) + goto cont; + goto out; + } + genlmsg_end(skb, ehdr); +cont: + idx++; + } + } +out: + rtnl_unlock(); + + ctx->pos_hash = h; + ctx->pos_idx = idx; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + + if (ret == -EMSGSIZE && skb->len) + return skb->len; + return ret; +} diff --git a/net/hsr/Kconfig b/net/hsr/Kconfig index 8095b034e76e..1b048c17b6c8 100644 --- a/net/hsr/Kconfig +++ b/net/hsr/Kconfig @@ -4,24 +4,35 @@ # config HSR - tristate "High-availability Seamless Redundancy (HSR)" + tristate "High-availability Seamless Redundancy (HSR & PRP)" help + This enables IEC 62439 defined High-availability Seamless + Redundancy (HSR) and Parallel Redundancy Protocol (PRP). + If you say Y here, then your Linux box will be able to act as a - DANH ("Doubly attached node implementing HSR"). For this to work, - your Linux box needs (at least) two physical Ethernet interfaces, - and it must be connected as a node in a ring network together with - other HSR capable nodes. + DANH ("Doubly attached node implementing HSR") or DANP ("Doubly + attached node implementing PRP"). 
For this to work, your Linux box + needs (at least) two physical Ethernet interfaces. + + For DANH, it must be connected as a node in a ring network together + with other HSR capable nodes. All Ethernet frames sent over the HSR + device will be sent in both directions on the ring (over both slave + ports), giving a redundant, instant fail-over network. Each HSR node + in the ring acts like a bridge for HSR frames, but filters frames + that have been forwarded earlier. - All Ethernet frames sent over the hsr device will be sent in both - directions on the ring (over both slave ports), giving a redundant, - instant fail-over network. Each HSR node in the ring acts like a - bridge for HSR frames, but filters frames that have been forwarded - earlier. + For DANP, it must be connected as a node connecting to two + separate networks over the two slave interfaces. Like HSR, Ethernet + frames sent over the PRP device will be sent to both networks, giving + a redundant, instant fail-over network. Unlike HSR, PRP networks + can have Singly Attached Nodes (SANs) such as PCs, printers and + bridges, and these will be able to communicate with DANP nodes. This code is a "best effort" to comply with the HSR standard as described in IEC 62439-3:2010 (HSRv0) and IEC 62439-3:2012 (HSRv1), - but no compliancy tests have been made. Use iproute2 to select - the version you desire. + and the PRP standard described in IEC 62439-4:2012 (PRP), but no + compliance tests have been made. Use iproute2 to select the protocol + you would like to use. You need to perform any and all necessary tests yourself before relying on this code in a safety critical system! diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c index 9787ef11ca71..7e11a6c35bc3 100644 --- a/net/hsr/hsr_debugfs.c +++ b/net/hsr/hsr_debugfs.c @@ -1,5 +1,5 @@ /* - * hsr_debugfs code + * debugfs code for HSR & PRP * Copyright (C) 2019 Texas Instruments Incorporated * * Author(s): @@ -22,12 +22,6 @@ static struct dentry *hsr_debugfs_root_dir; -static void print_mac_address(struct seq_file *sfp, unsigned char *mac) -{ - seq_printf(sfp, "%02x:%02x:%02x:%02x:%02x:%02x:", - mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); -} - /* hsr_node_table_show - Formats and prints node_table entries */ static int hsr_node_table_show(struct seq_file *sfp, void *data) @@ -35,20 +29,32 @@ hsr_node_table_show(struct seq_file *sfp, void *data) struct hsr_priv *priv = (struct hsr_priv *)sfp->private; struct hsr_node *node; - seq_puts(sfp, "Node Table entries\n"); - seq_puts(sfp, "MAC-Address-A, MAC-Address-B, time_in[A], "); - seq_puts(sfp, "time_in[B], Address-B port\n"); + seq_printf(sfp, "Node Table entries for (%s) device\n", + (priv->prot_version == PRP_V1 ?
"PRP" : "HSR")); + seq_puts(sfp, "MAC-Address-A, MAC-Address-B, time_in[A], "); + seq_puts(sfp, "time_in[B], Address-B port, "); + if (priv->prot_version == PRP_V1) + seq_puts(sfp, "SAN-A, SAN-B, DAN-P\n"); + else + seq_puts(sfp, "DAN-H\n"); + rcu_read_lock(); list_for_each_entry_rcu(node, &priv->node_db, mac_list) { /* skip self node */ if (hsr_addr_is_self(priv, node->macaddress_A)) continue; - print_mac_address(sfp, &node->macaddress_A[0]); - seq_puts(sfp, " "); - print_mac_address(sfp, &node->macaddress_B[0]); - seq_printf(sfp, "0x%lx, ", node->time_in[HSR_PT_SLAVE_A]); - seq_printf(sfp, "0x%lx ", node->time_in[HSR_PT_SLAVE_B]); - seq_printf(sfp, "0x%x\n", node->addr_B_port); + seq_printf(sfp, "%pM ", &node->macaddress_A[0]); + seq_printf(sfp, "%pM ", &node->macaddress_B[0]); + seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_A]); + seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_B]); + seq_printf(sfp, "%14x, ", node->addr_B_port); + + if (priv->prot_version == PRP_V1) + seq_printf(sfp, "%5x, %5x, %5x\n", + node->san_a, node->san_b, + (node->san_a == 0 && node->san_b == 0)); + else + seq_printf(sfp, "%5x\n", 1); } rcu_read_unlock(); return 0; @@ -57,7 +63,8 @@ hsr_node_table_show(struct seq_file *sfp, void *data) /* hsr_node_table_open - Open the node_table file * * Description: - * This routine opens a debugfs file node_table of specific hsr device + * This routine opens a debugfs file node_table of specific hsr + * or prp device */ static int hsr_node_table_open(struct inode *inode, struct file *filp) diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index a6f4e9f65b14..ab953a1a0d6c 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -3,9 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se - * * This file contains device methods for creating, using and destroying - * virtual HSR devices. + * virtual HSR or PRP devices. */ #include <linux/netdevice.h> @@ -210,7 +209,7 @@ static netdev_features_t hsr_fix_features(struct net_device *dev, return hsr_features_recompute(hsr, features); } -static int hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) { struct hsr_priv *hsr = netdev_priv(dev); struct hsr_port *master; @@ -231,66 +230,103 @@ static const struct header_ops hsr_header_ops = { .parse = eth_header_parse, }; -static void send_hsr_supervision_frame(struct hsr_port *master, - u8 type, u8 hsr_ver) +static struct sk_buff *hsr_init_skb(struct hsr_port *master, u16 proto) { + struct hsr_priv *hsr = master->hsr; struct sk_buff *skb; int hlen, tlen; - struct hsr_tag *hsr_tag; - struct hsr_sup_tag *hsr_stag; - struct hsr_sup_payload *hsr_sp; - unsigned long irqflags; hlen = LL_RESERVED_SPACE(master->dev); tlen = master->dev->needed_tailroom; + /* skb size is same for PRP/HSR frames, only difference + * being, for PRP it is a trailer and for HSR it is a + * header + */ skb = dev_alloc_skb(sizeof(struct hsr_tag) + sizeof(struct hsr_sup_tag) + sizeof(struct hsr_sup_payload) + hlen + tlen); if (!skb) - return; + return skb; skb_reserve(skb, hlen); - skb->dev = master->dev; - skb->protocol = htons(hsr_ver ? ETH_P_HSR : ETH_P_PRP); + skb->protocol = htons(proto); skb->priority = TC_PRIO_CONTROL; - if (dev_hard_header(skb, skb->dev, (hsr_ver ? 
ETH_P_HSR : ETH_P_PRP), - master->hsr->sup_multicast_addr, + if (dev_hard_header(skb, skb->dev, proto, + hsr->sup_multicast_addr, skb->dev->dev_addr, skb->len) <= 0) goto out; + skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_transport_header(skb); - if (hsr_ver > 0) { + return skb; +out: + kfree_skb(skb); + + return NULL; +} + +static void send_hsr_supervision_frame(struct hsr_port *master, + unsigned long *interval) +{ + struct hsr_priv *hsr = master->hsr; + __u8 type = HSR_TLV_LIFE_CHECK; + struct hsr_tag *hsr_tag = NULL; + struct hsr_sup_payload *hsr_sp; + struct hsr_sup_tag *hsr_stag; + unsigned long irqflags; + struct sk_buff *skb; + u16 proto; + + *interval = msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL); + if (hsr->announce_count < 3 && hsr->prot_version == 0) { + type = HSR_TLV_ANNOUNCE; + *interval = msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL); + hsr->announce_count++; + } + + if (!hsr->prot_version) + proto = ETH_P_PRP; + else + proto = ETH_P_HSR; + + skb = hsr_init_skb(master, proto); + if (!skb) { + WARN_ONCE(1, "HSR: Could not send supervision frame\n"); + return; + } + + if (hsr->prot_version > 0) { hsr_tag = skb_put(skb, sizeof(struct hsr_tag)); hsr_tag->encap_proto = htons(ETH_P_PRP); set_hsr_tag_LSDU_size(hsr_tag, HSR_V1_SUP_LSDUSIZE); } hsr_stag = skb_put(skb, sizeof(struct hsr_sup_tag)); - set_hsr_stag_path(hsr_stag, (hsr_ver ? 0x0 : 0xf)); - set_hsr_stag_HSR_ver(hsr_stag, hsr_ver); + set_hsr_stag_path(hsr_stag, (hsr->prot_version ? 0x0 : 0xf)); + set_hsr_stag_HSR_ver(hsr_stag, hsr->prot_version); /* From HSRv1 on we have separate supervision sequence numbers. */ spin_lock_irqsave(&master->hsr->seqnr_lock, irqflags); - if (hsr_ver > 0) { - hsr_stag->sequence_nr = htons(master->hsr->sup_sequence_nr); - hsr_tag->sequence_nr = htons(master->hsr->sequence_nr); - master->hsr->sup_sequence_nr++; - master->hsr->sequence_nr++; + if (hsr->prot_version > 0) { + hsr_stag->sequence_nr = htons(hsr->sup_sequence_nr); + hsr->sup_sequence_nr++; + hsr_tag->sequence_nr = htons(hsr->sequence_nr); + hsr->sequence_nr++; } else { - hsr_stag->sequence_nr = htons(master->hsr->sequence_nr); - master->hsr->sequence_nr++; + hsr_stag->sequence_nr = htons(hsr->sequence_nr); + hsr->sequence_nr++; } spin_unlock_irqrestore(&master->hsr->seqnr_lock, irqflags); hsr_stag->HSR_TLV_type = type; /* TODO: Why 12 in HSRv0? */ - hsr_stag->HSR_TLV_length = - hsr_ver ? sizeof(struct hsr_sup_payload) : 12; + hsr_stag->HSR_TLV_length = hsr->prot_version ? + sizeof(struct hsr_sup_payload) : 12; /* Payload: MacAddressA */ hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload)); @@ -300,11 +336,57 @@ static void send_hsr_supervision_frame(struct hsr_port *master, return; hsr_forward_skb(skb, master); + return; +} -out: - WARN_ONCE(1, "HSR: Could not send supervision frame\n"); - kfree_skb(skb); +static void send_prp_supervision_frame(struct hsr_port *master, + unsigned long *interval) +{ + struct hsr_priv *hsr = master->hsr; + struct hsr_sup_payload *hsr_sp; + struct hsr_sup_tag *hsr_stag; + unsigned long irqflags; + struct sk_buff *skb; + struct prp_rct *rct; + u8 *tail; + + skb = hsr_init_skb(master, ETH_P_PRP); + if (!skb) { + WARN_ONCE(1, "PRP: Could not send supervision frame\n"); + return; + } + + *interval = msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL); + hsr_stag = skb_put(skb, sizeof(struct hsr_sup_tag)); + set_hsr_stag_path(hsr_stag, (hsr->prot_version ? 0x0 : 0xf)); + set_hsr_stag_HSR_ver(hsr_stag, (hsr->prot_version ? 1 : 0)); + + /* From HSRv1 on we have separate supervision sequence numbers. 
*/ + spin_lock_irqsave(&master->hsr->seqnr_lock, irqflags); + hsr_stag->sequence_nr = htons(hsr->sup_sequence_nr); + hsr->sup_sequence_nr++; + hsr_stag->HSR_TLV_type = PRP_TLV_LIFE_CHECK_DD; + hsr_stag->HSR_TLV_length = sizeof(struct hsr_sup_payload); + + /* Payload: MacAddressA */ + hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload)); + ether_addr_copy(hsr_sp->macaddress_A, master->dev->dev_addr); + + if (skb_put_padto(skb, ETH_ZLEN + HSR_HLEN)) { + spin_unlock_irqrestore(&master->hsr->seqnr_lock, irqflags); + return; + } + + tail = skb_tail_pointer(skb) - HSR_HLEN; + rct = (struct prp_rct *)tail; + rct->PRP_suffix = htons(ETH_P_PRP); + set_prp_LSDU_size(rct, HSR_V1_SUP_LSDUSIZE); + rct->sequence_nr = htons(hsr->sequence_nr); + hsr->sequence_nr++; + spin_unlock_irqrestore(&master->hsr->seqnr_lock, irqflags); + + hsr_forward_skb(skb, master); } /* Announce (supervision frame) timer function @@ -319,19 +401,7 @@ static void hsr_announce(struct timer_list *t) rcu_read_lock(); master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); - - if (hsr->announce_count < 3 && hsr->prot_version == 0) { - send_hsr_supervision_frame(master, HSR_TLV_ANNOUNCE, - hsr->prot_version); - hsr->announce_count++; - - interval = msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL); - } else { - send_hsr_supervision_frame(master, HSR_TLV_LIFE_CHECK, - hsr->prot_version); - - interval = msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL); - } + hsr->proto_ops->send_sv_frame(master, &interval); if (is_admin_up(master->dev)) mod_timer(&hsr->announce_timer, jiffies + interval); @@ -368,6 +438,24 @@ static struct device_type hsr_type = { .name = "hsr", }; +static struct hsr_proto_ops hsr_ops = { + .send_sv_frame = send_hsr_supervision_frame, + .create_tagged_frame = hsr_create_tagged_frame, + .get_untagged_frame = hsr_get_untagged_frame, + .fill_frame_info = hsr_fill_frame_info, + .invalid_dan_ingress_frame = hsr_invalid_dan_ingress_frame, +}; + +static struct hsr_proto_ops prp_ops = { + .send_sv_frame = send_prp_supervision_frame, + .create_tagged_frame = prp_create_tagged_frame, + .get_untagged_frame = prp_get_untagged_frame, + .drop_frame = prp_drop_frame, + .fill_frame_info = prp_fill_frame_info, + .handle_san_frame = prp_handle_san_frame, + .update_san_info = prp_update_san_info, +}; + void hsr_dev_setup(struct net_device *dev) { eth_hw_addr_random(dev); @@ -427,6 +515,17 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], ether_addr_copy(hsr_dev->dev_addr, slave[0]->dev_addr); + /* initialize protocol specific functions */ + if (protocol_version == PRP_V1) { + /* For PRP, lan_id has most significant 3 bits holding + * the net_id of PRP_LAN_ID + */ + hsr->net_id = PRP_LAN_ID << 1; + hsr->proto_ops = &prp_ops; + } else { + hsr->proto_ops = &hsr_ops; + } + /* Make sure we recognize frames from ourselves in hsr_rcv() */ res = hsr_create_self_node(hsr, hsr_dev->dev_addr, slave[1]->dev_addr); diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h index b8f9262ed101..868373822ee4 100644 --- a/net/hsr/hsr_device.h +++ b/net/hsr/hsr_device.h @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_DEVICE_H diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index 1ea17752fffc..cadfccd7876e 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * Frame router for HSR and PRP. 
*/ #include "hsr_forward.h" @@ -15,18 +17,6 @@ struct hsr_node; -struct hsr_frame_info { - struct sk_buff *skb_std; - struct sk_buff *skb_hsr; - struct hsr_port *port_rcv; - struct hsr_node *node_src; - u16 sequence_nr; - bool is_supervision; - bool is_vlan; - bool is_local_dest; - bool is_local_exclusive; -}; - /* The uses I can see for these HSR supervision frames are: * 1) Use the frames that are sent after node initialization ("HSR_TLV.Type = * 22") to reset any sequence_nr counters belonging to that node. Useful if @@ -74,7 +64,9 @@ static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb) } if (hsr_sup_tag->HSR_TLV_type != HSR_TLV_ANNOUNCE && - hsr_sup_tag->HSR_TLV_type != HSR_TLV_LIFE_CHECK) + hsr_sup_tag->HSR_TLV_type != HSR_TLV_LIFE_CHECK && + hsr_sup_tag->HSR_TLV_type != PRP_TLV_LIFE_CHECK_DD && + hsr_sup_tag->HSR_TLV_type != PRP_TLV_LIFE_CHECK_DA) return false; if (hsr_sup_tag->HSR_TLV_length != 12 && hsr_sup_tag->HSR_TLV_length != sizeof(struct hsr_sup_payload)) @@ -83,8 +75,8 @@ static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb) return true; } -static struct sk_buff *create_stripped_skb(struct sk_buff *skb_in, - struct hsr_frame_info *frame) +static struct sk_buff *create_stripped_skb_hsr(struct sk_buff *skb_in, + struct hsr_frame_info *frame) { struct sk_buff *skb; int copylen; @@ -112,38 +104,123 @@ static struct sk_buff *create_stripped_skb(struct sk_buff *skb_in, return skb; } -static struct sk_buff *frame_get_stripped_skb(struct hsr_frame_info *frame, - struct hsr_port *port) +struct sk_buff *hsr_get_untagged_frame(struct hsr_frame_info *frame, + struct hsr_port *port) { - if (!frame->skb_std) - frame->skb_std = create_stripped_skb(frame->skb_hsr, frame); + if (!frame->skb_std) { + if (frame->skb_hsr) { + frame->skb_std = + create_stripped_skb_hsr(frame->skb_hsr, frame); + } else { + /* Unexpected */ + WARN_ONCE(1, "%s:%d: Unexpected frame received (port_src %s)\n", + __FILE__, __LINE__, port->dev->name); + return NULL; + } + } + return skb_clone(frame->skb_std, GFP_ATOMIC); } +struct sk_buff *prp_get_untagged_frame(struct hsr_frame_info *frame, + struct hsr_port *port) +{ + if (!frame->skb_std) { + if (frame->skb_prp) { + /* trim the skb by len - HSR_HLEN to exclude RCT */ + skb_trim(frame->skb_prp, + frame->skb_prp->len - HSR_HLEN); + frame->skb_std = + __pskb_copy(frame->skb_prp, + skb_headroom(frame->skb_prp), + GFP_ATOMIC); + } else { + /* Unexpected */ + WARN_ONCE(1, "%s:%d: Unexpected frame received (port_src %s)\n", + __FILE__, __LINE__, port->dev->name); + return NULL; + } + } + + return skb_clone(frame->skb_std, GFP_ATOMIC); +} + +static void prp_set_lan_id(struct prp_rct *trailer, + struct hsr_port *port) +{ + int lane_id; + + if (port->type == HSR_PT_SLAVE_A) + lane_id = 0; + else + lane_id = 1; + + /* Add net_id in the upper 3 bits of lane_id */ + lane_id |= port->hsr->net_id; + set_prp_lan_id(trailer, lane_id); +} + +/* Tailroom for PRP rct should have been created before calling this */ +static struct sk_buff *prp_fill_rct(struct sk_buff *skb, + struct hsr_frame_info *frame, + struct hsr_port *port) +{ + struct prp_rct *trailer; + int min_size = ETH_ZLEN; + int lsdu_size; + + if (!skb) + return skb; + + if (frame->is_vlan) + min_size = VLAN_ETH_ZLEN; + + if (skb_put_padto(skb, min_size)) + return NULL; + + trailer = (struct prp_rct *)skb_put(skb, HSR_HLEN); + lsdu_size = skb->len - 14; + if (frame->is_vlan) + lsdu_size -= 4; + prp_set_lan_id(trailer, port); + set_prp_LSDU_size(trailer, lsdu_size); + 
trailer->sequence_nr = htons(frame->sequence_nr); + trailer->PRP_suffix = htons(ETH_P_PRP); + + return skb; +} + +static void hsr_set_path_id(struct hsr_ethhdr *hsr_ethhdr, + struct hsr_port *port) +{ + int path_id; + + if (port->type == HSR_PT_SLAVE_A) + path_id = 0; + else + path_id = 1; + + set_hsr_tag_path(&hsr_ethhdr->hsr_tag, path_id); +} + static struct sk_buff *hsr_fill_tag(struct sk_buff *skb, struct hsr_frame_info *frame, struct hsr_port *port, u8 proto_version) { struct hsr_ethhdr *hsr_ethhdr; - int lane_id; int lsdu_size; /* pad to minimum packet size which is 60 + 6 (HSR tag) */ if (skb_put_padto(skb, ETH_ZLEN + HSR_HLEN)) return NULL; - if (port->type == HSR_PT_SLAVE_A) - lane_id = 0; - else - lane_id = 1; - lsdu_size = skb->len - 14; if (frame->is_vlan) lsdu_size -= 4; hsr_ethhdr = (struct hsr_ethhdr *)skb_mac_header(skb); - set_hsr_tag_path(&hsr_ethhdr->hsr_tag, lane_id); + hsr_set_path_id(hsr_ethhdr, port); set_hsr_tag_LSDU_size(&hsr_ethhdr->hsr_tag, lsdu_size); hsr_ethhdr->hsr_tag.sequence_nr = htons(frame->sequence_nr); hsr_ethhdr->hsr_tag.encap_proto = hsr_ethhdr->ethhdr.h_proto; @@ -153,16 +230,28 @@ static struct sk_buff *hsr_fill_tag(struct sk_buff *skb, return skb; } -static struct sk_buff *create_tagged_skb(struct sk_buff *skb_o, - struct hsr_frame_info *frame, - struct hsr_port *port) +/* If the original frame was an HSR tagged frame, just clone it to be sent + * unchanged. Otherwise, create a private frame especially tagged for 'port'. + */ +struct sk_buff *hsr_create_tagged_frame(struct hsr_frame_info *frame, + struct hsr_port *port) { - int movelen; unsigned char *dst, *src; struct sk_buff *skb; + int movelen; + + if (frame->skb_hsr) { + struct hsr_ethhdr *hsr_ethhdr = + (struct hsr_ethhdr *)skb_mac_header(frame->skb_hsr); + + /* set the lane id properly */ + hsr_set_path_id(hsr_ethhdr, port); + return skb_clone(frame->skb_hsr, GFP_ATOMIC); + } /* Create the new skb with enough headroom to fit the HSR tag */ - skb = __pskb_copy(skb_o, skb_headroom(skb_o) + HSR_HLEN, GFP_ATOMIC); + skb = __pskb_copy(frame->skb_std, + skb_headroom(frame->skb_std) + HSR_HLEN, GFP_ATOMIC); if (!skb) return NULL; skb_reset_mac_header(skb); @@ -185,21 +274,29 @@ static struct sk_buff *create_tagged_skb(struct sk_buff *skb_o, return hsr_fill_tag(skb, frame, port, port->hsr->prot_version); } -/* If the original frame was an HSR tagged frame, just clone it to be sent - * unchanged. Otherwise, create a private frame especially tagged for 'port'. 
- */
-static struct sk_buff *frame_get_tagged_skb(struct hsr_frame_info *frame,
- struct hsr_port *port)
+struct sk_buff *prp_create_tagged_frame(struct hsr_frame_info *frame,
+ struct hsr_port *port)
 {
- if (frame->skb_hsr)
- return skb_clone(frame->skb_hsr, GFP_ATOMIC);
+ struct sk_buff *skb;
- if (port->type != HSR_PT_SLAVE_A && port->type != HSR_PT_SLAVE_B) {
- WARN_ONCE(1, "HSR: Bug: trying to create a tagged frame for a non-ring port");
- return NULL;
+ if (frame->skb_prp) {
+ struct prp_rct *trailer = skb_get_PRP_rct(frame->skb_prp);
+
+ if (trailer) {
+ prp_set_lan_id(trailer, port);
+ } else {
+ WARN_ONCE(!trailer, "errored PRP skb");
+ return NULL;
+ }
+ return skb_clone(frame->skb_prp, GFP_ATOMIC);
 }
- return create_tagged_skb(frame->skb_std, frame, port);
+ skb = skb_copy_expand(frame->skb_std, 0,
+ skb_tailroom(frame->skb_std) + HSR_HLEN,
+ GFP_ATOMIC);
+ prp_fill_rct(skb, frame, port);
+
+ return skb;
 }
 static void hsr_deliver_master(struct sk_buff *skb, struct net_device *dev,
@@ -236,9 +333,18 @@ static int hsr_xmit(struct sk_buff *skb, struct hsr_port *port,
 return dev_queue_xmit(skb);
 }
+bool prp_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port)
+{
+ return ((frame->port_rcv->type == HSR_PT_SLAVE_A &&
+ port->type == HSR_PT_SLAVE_B) ||
+ (frame->port_rcv->type == HSR_PT_SLAVE_B &&
+ port->type == HSR_PT_SLAVE_A));
+}
+
 /* Forward the frame through all devices except:
 * - Back through the receiving device
 * - If it's a HSR frame: through a device where it has passed before
+ * - If it's a PRP frame: through another PRP slave device (no bridge)
 * - To the local HSR master only if the frame is directly addressed to it, or
 * a non-supervision multicast or broadcast frame.
 *
@@ -253,6 +359,7 @@ static void hsr_forward_do(struct hsr_frame_info *frame)
 struct sk_buff *skb;
 hsr_for_each_port(frame->port_rcv->hsr, port) {
+ struct hsr_priv *hsr = port->hsr;
 /* Don't send frame back the way it came */
 if (port == frame->port_rcv)
 continue;
@@ -265,24 +372,33 @@ static void hsr_forward_do(struct hsr_frame_info *frame)
 if (port->type != HSR_PT_MASTER && frame->is_local_exclusive)
 continue;
- /* Don't send frame over port where it has been sent before */
- if (hsr_register_frame_out(port, frame->node_src,
+ /* Don't send frame over port where it has been sent before.
+ * Also for SAN, this shouldn't be done.
+ */
+ if (!frame->is_from_san &&
+ hsr_register_frame_out(port, frame->node_src,
 frame->sequence_nr))
 continue;
 if (frame->is_supervision && port->type == HSR_PT_MASTER) {
- hsr_handle_sup_frame(frame->skb_hsr,
- frame->node_src,
- frame->port_rcv);
+ hsr_handle_sup_frame(frame);
 continue;
 }
+ /* Check if frame is to be dropped. E.g., for PRP there is no
+ * forwarding between the slave ports.
+ */
+ if (hsr->proto_ops->drop_frame &&
+ hsr->proto_ops->drop_frame(frame, port))
+ continue;
+
 if (port->type != HSR_PT_MASTER)
- skb = frame_get_tagged_skb(frame, port);
+ skb = hsr->proto_ops->create_tagged_frame(frame, port);
 else
- skb = frame_get_stripped_skb(frame, port);
+ skb = hsr->proto_ops->get_untagged_frame(frame, port);
+
 if (!skb) {
- /* FIXME: Record the dropped frame? 
*/ + frame->port_rcv->dev->stats.rx_dropped++; continue; } @@ -313,40 +429,95 @@ static void check_local_dest(struct hsr_priv *hsr, struct sk_buff *skb, } } -static int hsr_fill_frame_info(struct hsr_frame_info *frame, - struct sk_buff *skb, struct hsr_port *port) +static void handle_std_frame(struct sk_buff *skb, + struct hsr_frame_info *frame) { - struct ethhdr *ethhdr; + struct hsr_port *port = frame->port_rcv; + struct hsr_priv *hsr = port->hsr; unsigned long irqflags; + frame->skb_hsr = NULL; + frame->skb_prp = NULL; + frame->skb_std = skb; + + if (port->type != HSR_PT_MASTER) { + frame->is_from_san = true; + } else { + /* Sequence nr for the master node */ + spin_lock_irqsave(&hsr->seqnr_lock, irqflags); + frame->sequence_nr = hsr->sequence_nr; + hsr->sequence_nr++; + spin_unlock_irqrestore(&hsr->seqnr_lock, irqflags); + } +} + +void hsr_fill_frame_info(__be16 proto, struct sk_buff *skb, + struct hsr_frame_info *frame) +{ + if (proto == htons(ETH_P_PRP) || + proto == htons(ETH_P_HSR)) { + /* HSR tagged frame :- Data or Supervision */ + frame->skb_std = NULL; + frame->skb_prp = NULL; + frame->skb_hsr = skb; + frame->sequence_nr = hsr_get_skb_sequence_nr(skb); + return; + } + + /* Standard frame or PRP from master port */ + handle_std_frame(skb, frame); +} + +void prp_fill_frame_info(__be16 proto, struct sk_buff *skb, + struct hsr_frame_info *frame) +{ + /* Supervision frame */ + struct prp_rct *rct = skb_get_PRP_rct(skb); + + if (rct && + prp_check_lsdu_size(skb, rct, frame->is_supervision)) { + frame->skb_hsr = NULL; + frame->skb_std = NULL; + frame->skb_prp = skb; + frame->sequence_nr = prp_get_skb_sequence_nr(rct); + return; + } + handle_std_frame(skb, frame); +} + +static int fill_frame_info(struct hsr_frame_info *frame, + struct sk_buff *skb, struct hsr_port *port) +{ + struct hsr_priv *hsr = port->hsr; + struct hsr_vlan_ethhdr *vlan_hdr; + struct ethhdr *ethhdr; + __be16 proto; + + memset(frame, 0, sizeof(*frame)); frame->is_supervision = is_supervision_frame(port->hsr, skb); - frame->node_src = hsr_get_node(port, skb, frame->is_supervision); + frame->node_src = hsr_get_node(port, &hsr->node_db, skb, + frame->is_supervision, + port->type); if (!frame->node_src) return -1; /* Unknown node and !is_supervision, or no mem */ ethhdr = (struct ethhdr *)skb_mac_header(skb); frame->is_vlan = false; - if (ethhdr->h_proto == htons(ETH_P_8021Q)) { + proto = ethhdr->h_proto; + + if (proto == htons(ETH_P_8021Q)) frame->is_vlan = true; + + if (frame->is_vlan) { + vlan_hdr = (struct hsr_vlan_ethhdr *)ethhdr; + proto = vlan_hdr->vlanhdr.h_vlan_encapsulated_proto; /* FIXME: */ netdev_warn_once(skb->dev, "VLAN not yet supported"); } - if (ethhdr->h_proto == htons(ETH_P_PRP) || - ethhdr->h_proto == htons(ETH_P_HSR)) { - frame->skb_std = NULL; - frame->skb_hsr = skb; - frame->sequence_nr = hsr_get_skb_sequence_nr(skb); - } else { - frame->skb_std = skb; - frame->skb_hsr = NULL; - /* Sequence nr for the master node */ - spin_lock_irqsave(&port->hsr->seqnr_lock, irqflags); - frame->sequence_nr = port->hsr->sequence_nr; - port->hsr->sequence_nr++; - spin_unlock_irqrestore(&port->hsr->seqnr_lock, irqflags); - } + frame->is_from_san = false; frame->port_rcv = port; + hsr->proto_ops->fill_frame_info(proto, skb, frame); check_local_dest(port->hsr, skb, frame); return 0; @@ -363,8 +534,9 @@ void hsr_forward_skb(struct sk_buff *skb, struct hsr_port *port) goto out_drop; } - if (hsr_fill_frame_info(&frame, skb, port) < 0) + if (fill_frame_info(&frame, skb, port) < 0) goto out_drop; + 
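
	/* A condensed sketch of what follows, restating the hunks above:
	 * the frame is registered against its source node, then
	 * hsr_forward_do() walks every port and per port either skips it
	 * (already sent, or proto_ops->drop_frame() for PRP cross-LAN
	 * traffic), tags a copy for a slave port via
	 * proto_ops->create_tagged_frame(), or strips the tag for the
	 * master via proto_ops->get_untagged_frame().
	 */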
hsr_register_frame_in(frame.node_src, port, frame.sequence_nr); hsr_forward_do(&frame); /* Gets called for ingress frames as well as egress from master port. @@ -375,10 +547,9 @@ void hsr_forward_skb(struct sk_buff *skb, struct hsr_port *port) port->dev->stats.tx_bytes += skb->len; } - if (frame.skb_hsr) - kfree_skb(frame.skb_hsr); - if (frame.skb_std) - kfree_skb(frame.skb_std); + kfree_skb(frame.skb_hsr); + kfree_skb(frame.skb_prp); + kfree_skb(frame.skb_std); return; out_drop: diff --git a/net/hsr/hsr_forward.h b/net/hsr/hsr_forward.h index 51a69295566c..618140d484ad 100644 --- a/net/hsr/hsr_forward.h +++ b/net/hsr/hsr_forward.h @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_FORWARD_H @@ -12,5 +14,17 @@ #include "hsr_main.h" void hsr_forward_skb(struct sk_buff *skb, struct hsr_port *port); - +struct sk_buff *prp_create_tagged_frame(struct hsr_frame_info *frame, + struct hsr_port *port); +struct sk_buff *hsr_create_tagged_frame(struct hsr_frame_info *frame, + struct hsr_port *port); +struct sk_buff *hsr_get_untagged_frame(struct hsr_frame_info *frame, + struct hsr_port *port); +struct sk_buff *prp_get_untagged_frame(struct hsr_frame_info *frame, + struct hsr_port *port); +bool prp_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port); +void prp_fill_frame_info(__be16 proto, struct sk_buff *skb, + struct hsr_frame_info *frame); +void hsr_fill_frame_info(__be16 proto, struct sk_buff *skb, + struct hsr_frame_info *frame); #endif /* __HSR_FORWARD_H */ diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 530de24b1fb5..5c97de459905 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -8,6 +8,7 @@ * interface. A frame is identified by its source MAC address and its HSR * sequence number. This code keeps track of senders and their sequence numbers * to allow filtering of duplicate frames, and to detect HSR ring errors. + * Same code handles filtering of duplicates for PRP as well. */ #include <linux/if_ether.h> @@ -126,6 +127,19 @@ void hsr_del_nodes(struct list_head *node_db) kfree(node); } +void prp_handle_san_frame(bool san, enum hsr_port_type port, + struct hsr_node *node) +{ + /* Mark if the SAN node is over LAN_A or LAN_B */ + if (port == HSR_PT_SLAVE_A) { + node->san_a = true; + return; + } + + if (port == HSR_PT_SLAVE_B) + node->san_b = true; +} + /* Allocate an hsr_node and add it to node_db. 'addr' is the node's address_A; * seq_out is used to initialize filtering of outgoing duplicate frames * originating from the newly added node. @@ -133,7 +147,8 @@ void hsr_del_nodes(struct list_head *node_db) static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, struct list_head *node_db, unsigned char addr[], - u16 seq_out) + u16 seq_out, bool san, + enum hsr_port_type rx_port) { struct hsr_node *new_node, *node; unsigned long now; @@ -154,6 +169,9 @@ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, for (i = 0; i < HSR_PT_PORTS; i++) new_node->seq_out[i] = seq_out; + if (san && hsr->proto_ops->handle_san_frame) + hsr->proto_ops->handle_san_frame(san, rx_port, new_node); + spin_lock_bh(&hsr->list_lock); list_for_each_entry_rcu(node, node_db, mac_list, lockdep_is_held(&hsr->list_lock)) { @@ -171,15 +189,26 @@ out: return node; } +void prp_update_san_info(struct hsr_node *node, bool is_sup) +{ + if (!is_sup) + return; + + node->san_a = false; + node->san_b = false; +} + /* Get the hsr_node from which 'skb' was sent. 
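 *
 * For PRP this lookup is also where single attached nodes (SANs) are
 * detected; condensed from the logic added below, for a frame carrying
 * neither an HSR tag nor a valid RCT:
 *
 *	rct = skb_get_PRP_rct(skb);
 *	if (!rct || !prp_check_lsdu_size(skb, rct, is_sup)) {
 *		if (rx_port != HSR_PT_MASTER)
 *			san = true;
 *		seq_out = HSR_SEQNR_START;
 *	}
 *
 * The san flag then makes hsr_add_node() call the protocol's
 * handle_san_frame() hook, which marks the node as SAN on LAN A or B.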
*/ -struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb, - bool is_sup) +struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db, + struct sk_buff *skb, bool is_sup, + enum hsr_port_type rx_port) { - struct list_head *node_db = &port->hsr->node_db; struct hsr_priv *hsr = port->hsr; struct hsr_node *node; struct ethhdr *ethhdr; + struct prp_rct *rct; + bool san = false; u16 seq_out; if (!skb_mac_header_was_set(skb)) @@ -188,14 +217,21 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb, ethhdr = (struct ethhdr *)skb_mac_header(skb); list_for_each_entry_rcu(node, node_db, mac_list) { - if (ether_addr_equal(node->macaddress_A, ethhdr->h_source)) + if (ether_addr_equal(node->macaddress_A, ethhdr->h_source)) { + if (hsr->proto_ops->update_san_info) + hsr->proto_ops->update_san_info(node, is_sup); return node; - if (ether_addr_equal(node->macaddress_B, ethhdr->h_source)) + } + if (ether_addr_equal(node->macaddress_B, ethhdr->h_source)) { + if (hsr->proto_ops->update_san_info) + hsr->proto_ops->update_san_info(node, is_sup); return node; + } } - /* Everyone may create a node entry, connected node to a HSR device. */ - + /* Everyone may create a node entry, connected node to a HSR/PRP + * device. + */ if (ethhdr->h_proto == htons(ETH_P_PRP) || ethhdr->h_proto == htons(ETH_P_HSR)) { /* Use the existing sequence_nr from the tag as starting point @@ -203,31 +239,47 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb, */ seq_out = hsr_get_skb_sequence_nr(skb) - 1; } else { - /* this is called also for frames from master port and - * so warn only for non master ports - */ - if (port->type != HSR_PT_MASTER) - WARN_ONCE(1, "%s: Non-HSR frame\n", __func__); - seq_out = HSR_SEQNR_START; + rct = skb_get_PRP_rct(skb); + if (rct && prp_check_lsdu_size(skb, rct, is_sup)) { + seq_out = prp_get_skb_sequence_nr(rct); + } else { + if (rx_port != HSR_PT_MASTER) + san = true; + seq_out = HSR_SEQNR_START; + } } - return hsr_add_node(hsr, node_db, ethhdr->h_source, seq_out); + return hsr_add_node(hsr, node_db, ethhdr->h_source, seq_out, + san, rx_port); } /* Use the Supervision frame's info about an eventual macaddress_B for merging * nodes that has previously had their macaddress_B registered as a separate * node. */ -void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, - struct hsr_port *port_rcv) +void hsr_handle_sup_frame(struct hsr_frame_info *frame) { + struct hsr_node *node_curr = frame->node_src; + struct hsr_port *port_rcv = frame->port_rcv; struct hsr_priv *hsr = port_rcv->hsr; struct hsr_sup_payload *hsr_sp; struct hsr_node *node_real; + struct sk_buff *skb = NULL; struct list_head *node_db; struct ethhdr *ethhdr; int i; + /* Here either frame->skb_hsr or frame->skb_prp should be + * valid as supervision frame always will have protocol + * header info. + */ + if (frame->skb_hsr) + skb = frame->skb_hsr; + else if (frame->skb_prp) + skb = frame->skb_prp; + if (!skb) + return; + ethhdr = (struct ethhdr *)skb_mac_header(skb); /* Leave the ethernet header. 
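 *
 * At the 'done:' label below, the supervision header is pushed back on,
 * and its size now depends on the protocol, since PRP reuses the v0
 * layout:
 *
 *	if (ethhdr->h_proto == htons(ETH_P_HSR))
 *		skb_push(skb, sizeof(struct hsrv1_ethhdr_sp));
 *	else
 *		skb_push(skb, sizeof(struct hsrv0_ethhdr_sp));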
*/ @@ -248,7 +300,8 @@ void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, if (!node_real) /* No frame received from AddrA of this node yet */ node_real = hsr_add_node(hsr, node_db, hsr_sp->macaddress_A, - HSR_SEQNR_START - 1); + HSR_SEQNR_START - 1, true, + port_rcv->type); if (!node_real) goto done; /* No mem */ if (node_real == node_curr) @@ -274,7 +327,11 @@ void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, kfree_rcu(node_curr, rcu_head); done: - skb_push(skb, sizeof(struct hsrv1_ethhdr_sp)); + /* PRP uses v0 header */ + if (ethhdr->h_proto == htons(ETH_P_HSR)) + skb_push(skb, sizeof(struct hsrv1_ethhdr_sp)); + else + skb_push(skb, sizeof(struct hsrv0_ethhdr_sp)); } /* 'skb' is a frame meant for this host, that is to be passed to upper layers. diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index 0f0fa12b4329..86b43f539f2c 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_FRAMEREG_H @@ -12,12 +14,26 @@ struct hsr_node; +struct hsr_frame_info { + struct sk_buff *skb_std; + struct sk_buff *skb_hsr; + struct sk_buff *skb_prp; + struct hsr_port *port_rcv; + struct hsr_node *node_src; + u16 sequence_nr; + bool is_supervision; + bool is_vlan; + bool is_local_dest; + bool is_local_exclusive; + bool is_from_san; +}; + void hsr_del_self_node(struct hsr_priv *hsr); void hsr_del_nodes(struct list_head *node_db); -struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb, - bool is_sup); -void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, - struct hsr_port *port); +struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db, + struct sk_buff *skb, bool is_sup, + enum hsr_port_type rx_port); +void hsr_handle_sup_frame(struct hsr_frame_info *frame); bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr); void hsr_addr_subst_source(struct hsr_node *node, struct sk_buff *skb); @@ -47,6 +63,10 @@ int hsr_get_node_data(struct hsr_priv *hsr, int *if2_age, u16 *if2_seq); +void prp_handle_san_frame(bool san, enum hsr_port_type port, + struct hsr_node *node); +void prp_update_san_info(struct hsr_node *node, bool is_sup); + struct hsr_node { struct list_head mac_list; unsigned char macaddress_A[ETH_ALEN]; @@ -55,6 +75,9 @@ struct hsr_node { enum hsr_port_type addr_B_port; unsigned long time_in[HSR_PT_PORTS]; bool time_in_stale[HSR_PT_PORTS]; + /* if the node is a SAN */ + bool san_a; + bool san_b; u16 seq_out[HSR_PT_PORTS]; struct rcu_head rcu_head; }; diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 144da15f0a81..2fd1976e5b1c 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * Event handling for HSR and PRP devices. */ #include <linux/netdevice.h> diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index f74193465bf5..7dc92ce5a134 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_PRIVATE_H @@ -10,6 +12,7 @@ #include <linux/netdevice.h> #include <linux/list.h> +#include <linux/if_vlan.h> /* Time constants as specified in the HSR specification (IEC-62439-3 2010) * Table 8. 
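 *
 * Supervision TLV type values used by the two protocols (checked in
 * is_supervision_frame() in hsr_forward.c):
 *
 *	20	PRP_TLV_LIFE_CHECK_DD	PRP, duplicate discard mode
 *	21	PRP_TLV_LIFE_CHECK_DA	PRP, duplicate accept mode
 *	22	HSR_TLV_ANNOUNCE	HSR node announcement
 *	23	HSR_TLV_LIFE_CHECK	HSR life check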
@@ -33,6 +36,10 @@ #define HSR_TLV_ANNOUNCE 22 #define HSR_TLV_LIFE_CHECK 23
+/* PRP V1 life check for Duplicate Discard */
+#define PRP_TLV_LIFE_CHECK_DD 20
+/* PRP V1 life check for Duplicate Accept */
+#define PRP_TLV_LIFE_CHECK_DA 21
 /* HSR Tag.
 * As defined in IEC-62439-3:2010, the HSR tag is really { ethertype = 0x88FB,
@@ -80,7 +87,12 @@ struct hsr_ethhdr {
 struct hsr_tag hsr_tag;
 } __packed;
-/* HSR Supervision Frame data types.
+struct hsr_vlan_ethhdr {
+ struct vlan_ethhdr vlanhdr;
+ struct hsr_tag hsr_tag;
+} __packed;
+
+/* HSR/PRP Supervision Frame data types.
 * Field names as defined in the IEC:2010 standard for HSR.
 */
 struct hsr_sup_tag {
@@ -124,6 +136,34 @@ enum hsr_port_type {
 HSR_PT_PORTS, /* This must be the last item in the enum */
 };
+/* PRP Redundancy Control Trailer (RCT).
+ * As defined in IEC-62439-4:2012, the PRP RCT is really { sequence Nr,
+ * LAN identifier (LanId), LSDU_size and PRP_suffix = 0x88FB }.
+ *
+ * Field names as defined in the IEC:2012 standard for PRP.
+ */
+struct prp_rct {
+ __be16 sequence_nr;
+ __be16 lan_id_and_LSDU_size;
+ __be16 PRP_suffix;
+} __packed;
+
+static inline u16 get_prp_LSDU_size(struct prp_rct *rct)
+{
+ return ntohs(rct->lan_id_and_LSDU_size) & 0x0FFF;
+}
+
+static inline void set_prp_lan_id(struct prp_rct *rct, u16 lan_id)
+{
+ rct->lan_id_and_LSDU_size = htons((ntohs(rct->lan_id_and_LSDU_size) &
+ 0x0FFF) | (lan_id << 12));
+}
+static inline void set_prp_LSDU_size(struct prp_rct *rct, u16 LSDU_size)
+{
+ rct->lan_id_and_LSDU_size = htons((ntohs(rct->lan_id_and_LSDU_size) &
+ 0xF000) | (LSDU_size & 0x0FFF));
+}
+
 struct hsr_port {
 struct list_head port_list;
 struct net_device *dev;
@@ -131,6 +171,32 @@ struct hsr_port {
 enum hsr_port_type type;
 };
+/* used by driver internally to differentiate various protocols */
+enum hsr_version {
+ HSR_V0 = 0,
+ HSR_V1,
+ PRP_V1,
+};
+
+struct hsr_frame_info;
+struct hsr_node;
+
+struct hsr_proto_ops {
+ /* format and send supervision frame */
+ void (*send_sv_frame)(struct hsr_port *port, unsigned long *interval);
+ void (*handle_san_frame)(bool san, enum hsr_port_type port,
+ struct hsr_node *node);
+ bool (*drop_frame)(struct hsr_frame_info *frame, struct hsr_port *port);
+ struct sk_buff * (*get_untagged_frame)(struct hsr_frame_info *frame,
+ struct hsr_port *port);
+ struct sk_buff * (*create_tagged_frame)(struct hsr_frame_info *frame,
+ struct hsr_port *port);
+ void (*fill_frame_info)(__be16 proto, struct sk_buff *skb,
+ struct hsr_frame_info *frame);
+ bool (*invalid_dan_ingress_frame)(__be16 protocol);
+ void (*update_san_info)(struct hsr_node *node, bool is_sup);
+};
+
 struct hsr_priv {
 struct rcu_head rcu_head;
 struct list_head ports;
@@ -141,9 +207,16 @@ struct hsr_priv {
 int announce_count;
 u16 sequence_nr;
 u16 sup_sequence_nr; /* For HSRv1 separate seq_nr for supervision */
- u8 prot_version; /* Indicate if HSRv0 or HSRv1. */
+ enum hsr_version prot_version; /* Indicate if HSRv0, HSRv1 or PRPv1 */
 spinlock_t seqnr_lock; /* locking for sequence_nr */
 spinlock_t list_lock; /* locking for node list */
+ struct hsr_proto_ops *proto_ops;
+#define PRP_LAN_ID 0x5 /* 0x1010 for A and 0x1011 for B. 
Bit 0 is set + * based on SLAVE_A or SLAVE_B + */ + u8 net_id; /* for PRP, it occupies most significant 3 bits + * of lan_id + */ unsigned char sup_multicast_addr[ETH_ALEN]; #ifdef CONFIG_DEBUG_FS struct dentry *node_tbl_root; @@ -164,6 +237,49 @@ static inline u16 hsr_get_skb_sequence_nr(struct sk_buff *skb) return ntohs(hsr_ethhdr->hsr_tag.sequence_nr); } +static inline struct prp_rct *skb_get_PRP_rct(struct sk_buff *skb) +{ + unsigned char *tail = skb_tail_pointer(skb) - HSR_HLEN; + + struct prp_rct *rct = (struct prp_rct *)tail; + + if (rct->PRP_suffix == htons(ETH_P_PRP)) + return rct; + + return NULL; +} + +/* Assume caller has confirmed this skb is PRP suffixed */ +static inline u16 prp_get_skb_sequence_nr(struct prp_rct *rct) +{ + return ntohs(rct->sequence_nr); +} + +static inline u16 get_prp_lan_id(struct prp_rct *rct) +{ + return ntohs(rct->lan_id_and_LSDU_size) >> 12; +} + +/* assume there is a valid rct */ +static inline bool prp_check_lsdu_size(struct sk_buff *skb, + struct prp_rct *rct, + bool is_sup) +{ + struct ethhdr *ethhdr; + int expected_lsdu_size; + + if (is_sup) { + expected_lsdu_size = HSR_V1_SUP_LSDUSIZE; + } else { + ethhdr = (struct ethhdr *)skb_mac_header(skb); + expected_lsdu_size = skb->len - 14; + if (ethhdr->h_proto == htons(ETH_P_8021Q)) + expected_lsdu_size -= 4; + } + + return (expected_lsdu_size == get_prp_LSDU_size(rct)); +} + #if IS_ENABLED(CONFIG_DEBUG_FS) void hsr_debugfs_rename(struct net_device *dev); void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev); diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 6e14b7d22639..06c3cd988760 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -4,7 +4,7 @@ * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * - * Routines for handling Netlink messages for HSR. + * Routines for handling Netlink messages for HSR and PRP. 
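 *
 * With the IFLA_HSR_PROTOCOL attribute added below, user space selects
 * the protocol at link creation time. As a sketch, assuming an iproute2
 * version that exposes the attribute as a "proto" keyword:
 *
 *	ip link add name prp0 type hsr slave1 eth0 slave2 eth1 proto 1
 *
 * where 0 selects HSR_PROTOCOL_HSR (the default) and 1 selects
 * HSR_PROTOCOL_PRP; for PRP an explicit IFLA_HSR_VERSION is rejected.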
*/
 #include "hsr_netlink.h"
@@ -22,6 +22,7 @@ static const struct nla_policy hsr_policy[IFLA_HSR_MAX + 1] = {
 [IFLA_HSR_VERSION] = { .type = NLA_U8 },
 [IFLA_HSR_SUPERVISION_ADDR] = { .len = ETH_ALEN },
 [IFLA_HSR_SEQ_NR] = { .type = NLA_U16 },
+ [IFLA_HSR_PROTOCOL] = { .type = NLA_U8 },
 };
 /* Here, it seems a netdevice has already been allocated for us, and the
@@ -31,8 +32,10 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev,
 struct nlattr *tb[], struct nlattr *data[],
 struct netlink_ext_ack *extack)
 {
+ enum hsr_version proto_version;
+ unsigned char multicast_spec;
+ u8 proto = HSR_PROTOCOL_HSR;
 struct net_device *link[2];
- unsigned char multicast_spec, hsr_version;
 if (!data) {
 NL_SET_ERR_MSG_MOD(extack, "No slave devices specified");
@@ -69,18 +72,34 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev,
 else
 multicast_spec = nla_get_u8(data[IFLA_HSR_MULTICAST_SPEC]);
+ if (data[IFLA_HSR_PROTOCOL])
+ proto = nla_get_u8(data[IFLA_HSR_PROTOCOL]);
+
+ if (proto >= HSR_PROTOCOL_MAX) {
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported protocol\n");
+ return -EINVAL;
+ }
+
 if (!data[IFLA_HSR_VERSION]) {
- hsr_version = 0;
+ proto_version = HSR_V0;
 } else {
- hsr_version = nla_get_u8(data[IFLA_HSR_VERSION]);
- if (hsr_version > 1) {
+ if (proto == HSR_PROTOCOL_PRP) {
+ NL_SET_ERR_MSG_MOD(extack, "PRP version unsupported\n");
+ return -EINVAL;
+ }
+
+ proto_version = nla_get_u8(data[IFLA_HSR_VERSION]);
+ if (proto_version > HSR_V1) {
 NL_SET_ERR_MSG_MOD(extack,
- "Only versions 0..1 are supported");
+ "Only HSR version 0/1 supported\n");
 return -EINVAL;
 }
 }
- return hsr_dev_finalize(dev, link, multicast_spec, hsr_version, extack);
+ if (proto == HSR_PROTOCOL_PRP)
+ proto_version = PRP_V1;
+
+ return hsr_dev_finalize(dev, link, multicast_spec, proto_version, extack);
 }
 static void hsr_dellink(struct net_device *dev, struct list_head *head)
@@ -102,6 +121,7 @@ static void hsr_dellink(struct net_device *dev, struct list_head *head)
 static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev)
 {
 struct hsr_priv *hsr = netdev_priv(dev);
+ u8 proto = HSR_PROTOCOL_HSR;
 struct hsr_port *port;
 port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A);
@@ -120,6 +140,10 @@ static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev)
 hsr->sup_multicast_addr) ||
 nla_put_u16(skb, IFLA_HSR_SEQ_NR, hsr->sequence_nr))
 goto nla_put_failure;
+ if (hsr->prot_version == PRP_V1)
+ proto = HSR_PROTOCOL_PRP;
+ if (nla_put_u8(skb, IFLA_HSR_PROTOCOL, proto))
+ goto nla_put_failure;
 return 0;
diff --git a/net/hsr/hsr_netlink.h b/net/hsr/hsr_netlink.h index 1121bb192a18..501552d9753b 100644 --- a/net/hsr/hsr_netlink.h +++ b/net/hsr/hsr_netlink.h @@ -3,6 +3,8 @@
 *
 * Author(s):
 * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * include file for HSR and PRP.
 */
 #ifndef __HSR_NETLINK_H
diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index 25b6ffba26cd..36d5fcf09c61 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -3,6 +3,8 @@
 *
 * Author(s):
 * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * Frame handler and other utility functions for HSR and PRP. 
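 *
 * The resulting ingress policy, condensed from hsr_handle_frame()
 * below: hsr_ops wires up invalid_dan_ingress_frame(), so an HSR device
 * still passes any frame that is neither ETH_P_HSR nor ETH_P_PRP up the
 * stack untouched, while prp_ops leaves that hook NULL, so non-tagged
 * frames from single attached nodes (SANs) are accepted and forwarded.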
*/
 #include "hsr_slave.h"
@@ -14,12 +16,22 @@
 #include "hsr_forward.h"
 #include "hsr_framereg.h"
+bool hsr_invalid_dan_ingress_frame(__be16 protocol)
+{
+ return (protocol != htons(ETH_P_PRP) && protocol != htons(ETH_P_HSR));
+}
+
 static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb)
 {
 struct sk_buff *skb = *pskb;
 struct hsr_port *port;
+ struct hsr_priv *hsr;
 __be16 protocol;
+ /* Packets from dev_loopback_xmit() do not have an L2 header, bail out */
+ if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
+ return RX_HANDLER_PASS;
+
 if (!skb_mac_header_was_set(skb)) {
 WARN_ONCE(1, "%s: skb invalid", __func__);
 return RX_HANDLER_PASS;
@@ -28,6 +40,7 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb)
 port = hsr_port_get_rcu(skb->dev);
 if (!port)
 goto finish_pass;
+ hsr = port->hsr;
 if (hsr_addr_is_self(port->hsr, eth_hdr(skb)->h_source)) {
 /* Directly kill frames sent by ourselves */
@@ -35,12 +48,23 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb)
 goto finish_consume;
 }
+ /* For HSR, only tagged frames are expected, but for PRP
+ * there could be non-tagged frames as well from single
+ * attached nodes (SANs).
+ */
 protocol = eth_hdr(skb)->h_proto;
- if (protocol != htons(ETH_P_PRP) && protocol != htons(ETH_P_HSR))
+ if (hsr->proto_ops->invalid_dan_ingress_frame &&
+ hsr->proto_ops->invalid_dan_ingress_frame(protocol))
 goto finish_pass;
 skb_push(skb, ETH_HLEN);
+ if (skb_mac_header(skb) != skb->data) {
+ WARN_ONCE(1, "%s:%d: Malformed frame at source port %s\n",
+ __func__, __LINE__, port->dev->name);
+ goto finish_consume;
+ }
+
 hsr_forward_skb(skb, port);
 finish_consume:
diff --git a/net/hsr/hsr_slave.h b/net/hsr/hsr_slave.h index 8953ea279ce9..edc4612bb009 100644 --- a/net/hsr/hsr_slave.h +++ b/net/hsr/hsr_slave.h @@ -2,6 +2,8 @@ /* Copyright 2011-2014 Autronica Fire and Security AS
 *
 * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * include file for HSR and PRP. 
*/ #ifndef __HSR_SLAVE_H @@ -30,4 +32,6 @@ static inline struct hsr_port *hsr_port_get_rcu(const struct net_device *dev) rcu_dereference(dev->rx_handler_data) : NULL; } +bool hsr_invalid_dan_ingress_frame(__be16 protocol); + #endif /* __HSR_SLAVE_H */ diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index d93d4531aa9b..a45a0401adc5 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -382,7 +382,7 @@ static int raw_getsockopt(struct sock *sk, int level, int optname, } static int raw_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { return -EOPNOTSUPP; } @@ -423,10 +423,6 @@ static const struct proto_ops ieee802154_raw_ops = { .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, -#endif }; /* DGRAM Sockets (802.15.4 dataframes) */ @@ -876,7 +872,7 @@ static int dgram_getsockopt(struct sock *sk, int level, int optname, } static int dgram_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct dgram_sock *ro = dgram_sk(sk); struct net *net = sock_net(sk); @@ -886,7 +882,7 @@ static int dgram_setsockopt(struct sock *sk, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; lock_sock(sk); @@ -986,10 +982,6 @@ static const struct proto_ops ieee802154_dgram_ops = { .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, -#endif }; /* Create a socket. Initialise the socket, blank the addresses diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index e64e59b536d3..60db5a6487cc 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -10,7 +10,7 @@ config IP_MULTICAST intend to participate in the MBONE, a high bandwidth network on top of the Internet which carries audio and video broadcasts. More information about the MBONE is on the WWW at - <http://www.savetz.com/mbone/>. For most people, it's safe to say N. + <https://www.savetz.com/mbone/>. For most people, it's safe to say N. config IP_ADVANCED_ROUTER bool "IP: advanced router" @@ -73,7 +73,7 @@ config IP_MULTIPLE_TABLES If you need more information, see the Linux Advanced Routing and Traffic Control documentation at - <http://lartc.org/howto/lartc.rpdb.html> + <https://lartc.org/howto/lartc.rpdb.html> If unsure, say N. @@ -280,7 +280,7 @@ config SYN_COOKIES continue to connect, even when your machine is under attack. There is no need for the legitimate users to change their TCP/IP software; SYN cookies work transparently to them. For technical information - about SYN cookies, check out <http://cr.yp.to/syncookies.html>. + about SYN cookies, check out <https://cr.yp.to/syncookies.html>. If you are SYN flooded, the source address reported by the kernel is likely to have been forged by the attacker; it is only reported as @@ -525,7 +525,7 @@ config TCP_CONG_HSTCP A modification to TCP's congestion control mechanism for use with large congestion windows. A table indicates how much to increase the congestion window by when an ACK is received. 
- For more detail see http://www.icir.org/floyd/hstcp.html + For more detail see https://www.icir.org/floyd/hstcp.html config TCP_CONG_HYBLA tristate "TCP-Hybla congestion control algorithm" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 9e1a186a3671..5b77a46885b9 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -14,7 +14,7 @@ obj-y := route.o inetpeer.o protocol.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \ - metrics.o netlink.o nexthop.o + metrics.o netlink.o nexthop.o udp_tunnel_stub.o obj-$(CONFIG_BPFILTER) += bpfilter/ @@ -29,6 +29,7 @@ gre-y := gre_demux.o obj-$(CONFIG_NET_FOU) += fou.o obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o obj-$(CONFIG_NET_IPGRE) += ip_gre.o +udp_tunnel-y := udp_tunnel_core.o udp_tunnel_nic.o obj-$(CONFIG_NET_UDP_TUNNEL) += udp_tunnel.o obj-$(CONFIG_NET_IPVTI) += ip_vti.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 02aa5cb3a4fd..4307503a6f0b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -411,6 +411,9 @@ int inet_release(struct socket *sock) if (sk) { long timeout; + if (!sk->sk_kern_sock) + BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk); + /* Applications forget to leave groups before exiting */ ip_mc_drop_socket(sk); @@ -1040,8 +1043,6 @@ const struct proto_ops inet_stream_ops = { .sendpage_locked = tcp_sendpage_locked, .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, .compat_ioctl = inet_compat_ioctl, #endif .set_rcvlowat = tcp_set_rcvlowat, @@ -1070,8 +1071,6 @@ const struct proto_ops inet_dgram_ops = { .sendpage = inet_sendpage, .set_peek_off = sk_set_peek_off, #ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, .compat_ioctl = inet_compat_ioctl, #endif }; @@ -1102,8 +1101,6 @@ static const struct proto_ops inet_sockraw_ops = { .mmap = sock_no_mmap, .sendpage = inet_sendpage, #ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, .compat_ioctl = inet_compat_ioctl, #endif }; @@ -1432,10 +1429,6 @@ static struct sk_buff *ipip_gso_segment(struct sk_buff *skb, return inet_gso_segment(skb, features); } -INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *, - struct sk_buff *)); -INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *, - struct sk_buff *)); struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) { const struct net_offload *ops; @@ -1608,8 +1601,6 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) return -EINVAL; } -INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *, int)); -INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int)); int inet_gro_complete(struct sk_buff *skb, int nhoff) { __be16 newlen = htons(skb->len - nhoff); diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c index 9063c6767d34..545b2640f019 100644 --- a/net/ipv4/bpfilter/sockopt.c +++ b/net/ipv4/bpfilter/sockopt.c @@ -21,8 +21,7 @@ void bpfilter_umh_cleanup(struct umd_info *info) } EXPORT_SYMBOL_GPL(bpfilter_umh_cleanup); -static int bpfilter_mbox_request(struct sock *sk, int optname, - char __user *optval, +static int bpfilter_mbox_request(struct sock *sk, int optname, 
sockptr_t optval, unsigned int optlen, bool is_set) { int err; @@ -52,20 +51,23 @@ out: return err; } -int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval, +int bpfilter_ip_set_sockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { return bpfilter_mbox_request(sk, optname, optval, optlen, true); } -int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, - int __user *optlen) +int bpfilter_ip_get_sockopt(struct sock *sk, int optname, + char __user *user_optval, int __user *optlen) { - int len; + sockptr_t optval; + int err, len; if (get_user(len, optlen)) return -EFAULT; - + err = init_user_sockptr(&optval, user_optval, len); + if (err) + return err; return bpfilter_mbox_request(sk, optname, optval, len, false); } diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index a23094b050f8..2eb71579f4d2 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -10,9 +10,9 @@ * * The CIPSO draft specification can be found in the kernel's Documentation * directory as well as the following URL: - * http://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt + * https://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt * The FIPS-188 specification can be found at the following URL: - * http://www.itl.nist.gov/fipspubs/fip188.htm + * https://www.itl.nist.gov/fipspubs/fip188.htm * * Author: Paul Moore <paul.moore@hp.com> */ @@ -283,7 +283,7 @@ static int cipso_v4_cache_check(const unsigned char *key, /** * cipso_v4_cache_add - Add an entry to the CIPSO cache - * @skb: the packet + * @cipso_ptr: pointer to CIPSO IP option * @secattr: the packet's security attributes * * Description: @@ -1535,6 +1535,7 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb) /** * cipso_v4_validate - Validate a CIPSO option + * @skb: the packet * @option: the start of the option, on error it is set to point to the error * * Description: @@ -2066,7 +2067,7 @@ void cipso_v4_sock_delattr(struct sock *sk) /** * cipso_v4_req_delattr - Delete the CIPSO option from a request socket - * @reg: the request socket + * @req: the request socket * * Description: * Removes the CIPSO option from a request socket, if present. 
@@ -2158,6 +2159,7 @@ int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) /** * cipso_v4_skbuff_setattr - Set the CIPSO option on a packet * @skb: the packet + * @doi_def: the DOI structure * @secattr: the security attributes * * Description: diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index f99e3bac5cab..ce54a30c2ef1 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -29,6 +29,7 @@ #include <net/ip_fib.h> #include <net/nexthop.h> #include <net/fib_rules.h> +#include <linux/indirect_call_wrapper.h> struct fib4_rule { struct fib_rule common; @@ -103,8 +104,9 @@ int __fib_lookup(struct net *net, struct flowi4 *flp, } EXPORT_SYMBOL_GPL(__fib_lookup); -static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, - int flags, struct fib_lookup_arg *arg) +INDIRECT_CALLABLE_SCOPE int fib4_rule_action(struct fib_rule *rule, + struct flowi *flp, int flags, + struct fib_lookup_arg *arg) { int err = -EAGAIN; struct fib_table *tbl; @@ -138,7 +140,8 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, return err; } -static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) +INDIRECT_CALLABLE_SCOPE bool fib4_rule_suppress(struct fib_rule *rule, + struct fib_lookup_arg *arg) { struct fib_result *result = (struct fib_result *) arg->result; struct net_device *dev = NULL; @@ -169,7 +172,8 @@ suppress_route: return true; } -static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) +INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule, + struct flowi *fl, int flags) { struct fib4_rule *r = (struct fib4_rule *) rule; struct flowi4 *fl4 = &fl->u.ip4; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 3c65f71d0e82..c89b46fec153 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -13,7 +13,7 @@ * * An experimental study of compression methods for dynamic tries * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. - * http://www.csc.kth.se/~snilsson/software/dyntrie2/ + * https://www.csc.kth.se/~snilsson/software/dyntrie2/ * * IP-address lookup using LC-tries. 
Stefan Nilsson and Gunnar Karlsson * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 2e6d1b7a7bc9..e0a246575887 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -15,12 +15,12 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, netdev_features_t features) { int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); + bool need_csum, need_recompute_csum, gso_partial; struct sk_buff *segs = ERR_PTR(-EINVAL); u16 mac_offset = skb->mac_header; __be16 protocol = skb->protocol; u16 mac_len = skb->mac_len; int gre_offset, outer_hlen; - bool need_csum, gso_partial; if (!skb->encapsulation) goto out; @@ -41,6 +41,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, skb->protocol = skb->inner_protocol; need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM); + need_recompute_csum = skb->csum_not_inet; skb->encap_hdr_csum = need_csum; features &= skb->dev->hw_enc_features; @@ -98,7 +99,15 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, } *(pcsum + 1) = 0; - *pcsum = gso_make_checksum(skb, 0); + if (need_recompute_csum && !skb_is_gso(skb)) { + __wsum csum; + + csum = skb_checksum(skb, gre_offset, + skb->len - gre_offset, 0); + *pcsum = csum_fold(csum); + } else { + *pcsum = gso_make_checksum(skb, 0); + } } while ((skb = skb->next)); out: return segs; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index e30515f89802..cf36f955bfe6 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1116,6 +1116,65 @@ error: goto drop; } +static bool ip_icmp_error_rfc4884_validate(const struct sk_buff *skb, int off) +{ + struct icmp_extobj_hdr *objh, _objh; + struct icmp_ext_hdr *exth, _exth; + u16 olen; + + exth = skb_header_pointer(skb, off, sizeof(_exth), &_exth); + if (!exth) + return false; + if (exth->version != 2) + return true; + + if (exth->checksum && + csum_fold(skb_checksum(skb, off, skb->len - off, 0))) + return false; + + off += sizeof(_exth); + while (off < skb->len) { + objh = skb_header_pointer(skb, off, sizeof(_objh), &_objh); + if (!objh) + return false; + + olen = ntohs(objh->length); + if (olen < sizeof(_objh)) + return false; + + off += olen; + if (off > skb->len) + return false; + } + + return true; +} + +void ip_icmp_error_rfc4884(const struct sk_buff *skb, + struct sock_ee_data_rfc4884 *out, + int thlen, int off) +{ + int hlen; + + /* original datagram headers: end of icmph to payload (skb->data) */ + hlen = -skb_transport_offset(skb) - thlen; + + /* per rfc 4884: minimal datagram length of 128 bytes */ + if (off < 128 || off < hlen) + return; + + /* kernel has stripped headers: return payload offset in bytes */ + off -= hlen; + if (off + sizeof(struct icmp_ext_hdr) > skb->len) + return; + + out->len = off; + + if (!ip_icmp_error_rfc4884_validate(skb, off)) + out->flags |= SO_EE_RFC4884_FLAG_INVALID; +} +EXPORT_SYMBOL_GPL(ip_icmp_error_rfc4884); + int icmp_err(struct sk_buff *skb, u32 info) { struct iphdr *iph = (struct iphdr *)skb->data; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index afaf582a5aa9..d1a3913eebe0 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -648,20 +648,19 @@ no_route: EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); /* Decide when to expire the request and when to resend SYN-ACK */ -static inline void syn_ack_recalc(struct request_sock *req, const int thresh, - const int max_retries, - const u8 rskq_defer_accept, - int *expire, int 
*resend) +static void syn_ack_recalc(struct request_sock *req, + const int max_syn_ack_retries, + const u8 rskq_defer_accept, + int *expire, int *resend) { if (!rskq_defer_accept) { - *expire = req->num_timeout >= thresh; + *expire = req->num_timeout >= max_syn_ack_retries; *resend = 1; return; } - *expire = req->num_timeout >= thresh && - (!inet_rsk(req)->acked || req->num_timeout >= max_retries); - /* - * Do not resend while waiting for data after ACK, + *expire = req->num_timeout >= max_syn_ack_retries && + (!inet_rsk(req)->acked || req->num_timeout >= rskq_defer_accept); + /* Do not resend while waiting for data after ACK, * start to resend on end of deferring period to give * last chance for data or ACK to create established socket. */ @@ -720,15 +719,12 @@ static void reqsk_timer_handler(struct timer_list *t) struct net *net = sock_net(sk_listener); struct inet_connection_sock *icsk = inet_csk(sk_listener); struct request_sock_queue *queue = &icsk->icsk_accept_queue; - int qlen, expire = 0, resend = 0; - int max_retries, thresh; - u8 defer_accept; + int max_syn_ack_retries, qlen, expire = 0, resend = 0; if (inet_sk_state_load(sk_listener) != TCP_LISTEN) goto drop; - max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries; - thresh = max_retries; + max_syn_ack_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries; /* Normally all the openreqs are young and become mature * (i.e. converted to established socket) for first timeout. * If synack was not acknowledged for 1 second, it means @@ -750,17 +746,14 @@ static void reqsk_timer_handler(struct timer_list *t) if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) { int young = reqsk_queue_len_young(queue) << 1; - while (thresh > 2) { + while (max_syn_ack_retries > 2) { if (qlen < young) break; - thresh--; + max_syn_ack_retries--; young <<= 1; } } - defer_accept = READ_ONCE(queue->rskq_defer_accept); - if (defer_accept) - max_retries = defer_accept; - syn_ack_recalc(req, thresh, max_retries, defer_accept, + syn_ack_recalc(req, max_syn_ack_retries, READ_ONCE(queue->rskq_defer_accept), &expire, &resend); req->rsk_ops->syn_ack_timeout(req); if (!expire && @@ -1064,34 +1057,6 @@ void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) } EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); -#ifdef CONFIG_COMPAT -int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - - if (icsk->icsk_af_ops->compat_getsockopt) - return icsk->icsk_af_ops->compat_getsockopt(sk, level, optname, - optval, optlen); - return icsk->icsk_af_ops->getsockopt(sk, level, optname, - optval, optlen); -} -EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt); - -int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - - if (icsk->icsk_af_ops->compat_setsockopt) - return icsk->icsk_af_ops->compat_setsockopt(sk, level, optname, - optval, optlen); - return icsk->icsk_af_ops->setsockopt(sk, level, optname, - optval, optlen); -} -EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt); -#endif - static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl) { const struct inet_sock *inet = inet_sk(sk); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 125f4f8a36b4..4a98dd736270 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -52,6 +52,11 @@ static 
DEFINE_MUTEX(inet_diag_table_mutex); static const struct inet_diag_handler *inet_diag_lock_handler(int proto) { + if (proto < 0 || proto >= IPPROTO_MAX) { + mutex_lock(&inet_diag_table_mutex); + return ERR_PTR(-ENOENT); + } + if (!inet_diag_table[proto]) sock_load_diag_module(AF_INET, proto); @@ -181,6 +186,28 @@ errout: } EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill); +static void inet_diag_parse_attrs(const struct nlmsghdr *nlh, int hdrlen, + struct nlattr **req_nlas) +{ + struct nlattr *nla; + int remaining; + + nlmsg_for_each_attr(nla, nlh, hdrlen, remaining) { + int type = nla_type(nla); + + if (type < __INET_DIAG_REQ_MAX) + req_nlas[type] = nla; + } +} + +static int inet_diag_get_protocol(const struct inet_diag_req_v2 *req, + const struct inet_diag_dump_data *data) +{ + if (data->req_nlas[INET_DIAG_REQ_PROTOCOL]) + return nla_get_u32(data->req_nlas[INET_DIAG_REQ_PROTOCOL]); + return req->sdiag_protocol; +} + #define MAX_DUMP_ALLOC_SIZE (KMALLOC_MAX_SIZE - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, @@ -198,7 +225,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, void *info = NULL; cb_data = cb->data; - handler = inet_diag_table[req->sdiag_protocol]; + handler = inet_diag_table[inet_diag_get_protocol(req, cb_data)]; BUG_ON(!handler); nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, @@ -539,20 +566,25 @@ EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk); static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb, const struct nlmsghdr *nlh, + int hdrlen, const struct inet_diag_req_v2 *req) { const struct inet_diag_handler *handler; - int err; + struct inet_diag_dump_data dump_data; + int err, protocol; - handler = inet_diag_lock_handler(req->sdiag_protocol); + memset(&dump_data, 0, sizeof(dump_data)); + inet_diag_parse_attrs(nlh, hdrlen, dump_data.req_nlas); + protocol = inet_diag_get_protocol(req, &dump_data); + + handler = inet_diag_lock_handler(protocol); if (IS_ERR(handler)) { err = PTR_ERR(handler); } else if (cmd == SOCK_DIAG_BY_FAMILY) { - struct inet_diag_dump_data empty_dump_data = {}; struct netlink_callback cb = { .nlh = nlh, .skb = in_skb, - .data = &empty_dump_data, + .data = &dump_data, }; err = handler->dump_one(&cb, req); } else if (cmd == SOCK_DESTROY && handler->destroy) { @@ -1103,13 +1135,16 @@ EXPORT_SYMBOL_GPL(inet_diag_dump_icsk); static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { + struct inet_diag_dump_data *cb_data = cb->data; const struct inet_diag_handler *handler; u32 prev_min_dump_alloc; - int err = 0; + int protocol, err = 0; + + protocol = inet_diag_get_protocol(r, cb_data); again: prev_min_dump_alloc = cb->min_dump_alloc; - handler = inet_diag_lock_handler(r->sdiag_protocol); + handler = inet_diag_lock_handler(protocol); if (!IS_ERR(handler)) handler->dump(skb, cb, r); else @@ -1139,19 +1174,13 @@ static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen) struct inet_diag_dump_data *cb_data; struct sk_buff *skb = cb->skb; struct nlattr *nla; - int rem, err; + int err; cb_data = kzalloc(sizeof(*cb_data), GFP_KERNEL); if (!cb_data) return -ENOMEM; - nla_for_each_attr(nla, nlmsg_attrdata(nlh, hdrlen), - nlmsg_attrlen(nlh, hdrlen), rem) { - int type = nla_type(nla); - - if (type < __INET_DIAG_REQ_MAX) - cb_data->req_nlas[type] = nla; - } + inet_diag_parse_attrs(nlh, hdrlen, cb_data->req_nlas); nla = cb_data->inet_diag_nla_bc; if (nla) { @@ -1237,7 +1266,8 @@ 
static int inet_diag_get_exact_compat(struct sk_buff *in_skb, req.idiag_states = rc->idiag_states; req.id = rc->id; - return inet_diag_cmd_exact(SOCK_DIAG_BY_FAMILY, in_skb, nlh, &req); + return inet_diag_cmd_exact(SOCK_DIAG_BY_FAMILY, in_skb, nlh, + sizeof(struct inet_diag_req), &req); } static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) @@ -1279,7 +1309,8 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h) return netlink_dump_start(net->diag_nlsk, skb, h, &c); } - return inet_diag_cmd_exact(h->nlmsg_type, skb, h, nlmsg_data(h)); + return inet_diag_cmd_exact(h->nlmsg_type, skb, h, hdrlen, + nlmsg_data(h)); } static diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 2bbaaf0c7176..4eb4cd8d20dd 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -246,6 +246,21 @@ static inline int compute_score(struct sock *sk, struct net *net, return score; } +static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk, + struct sk_buff *skb, int doff, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned short hnum) +{ + struct sock *reuse_sk = NULL; + u32 phash; + + if (sk->sk_reuseport) { + phash = inet_ehashfn(net, daddr, hnum, saddr, sport); + reuse_sk = reuseport_select_sock(sk, phash, skb, doff); + } + return reuse_sk; +} + /* * Here are some nice properties to exploit here. The BSD API * does not allow a listening sock to specify the remote port nor the @@ -265,21 +280,17 @@ static struct sock *inet_lhash2_lookup(struct net *net, struct inet_connection_sock *icsk; struct sock *sk, *result = NULL; int score, hiscore = 0; - u32 phash = 0; inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { sk = (struct sock *)icsk; score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif); if (score > hiscore) { - if (sk->sk_reuseport) { - phash = inet_ehashfn(net, daddr, hnum, - saddr, sport); - result = reuseport_select_sock(sk, phash, - skb, doff); - if (result) - return result; - } + result = lookup_reuseport(net, sk, skb, doff, + saddr, sport, daddr, hnum); + if (result) + return result; + result = sk; hiscore = score; } @@ -288,6 +299,29 @@ static struct sock *inet_lhash2_lookup(struct net *net, return result; } +static inline struct sock *inet_lookup_run_bpf(struct net *net, + struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, + __be32 saddr, __be16 sport, + __be32 daddr, u16 hnum) +{ + struct sock *sk, *reuse_sk; + bool no_reuseport; + + if (hashinfo != &tcp_hashinfo) + return NULL; /* only TCP is supported */ + + no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP, + saddr, sport, daddr, hnum, &sk); + if (no_reuseport || IS_ERR_OR_NULL(sk)) + return sk; + + reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); + if (reuse_sk) + sk = reuse_sk; + return sk; +} + struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -299,6 +333,14 @@ struct sock *__inet_lookup_listener(struct net *net, struct sock *result = NULL; unsigned int hash2; + /* Lookup redirect from BPF */ + if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { + result = inet_lookup_run_bpf(net, hashinfo, skb, doff, + saddr, sport, daddr, hnum); + if (result) + goto done; + } + hash2 = ipv4_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index ddaa01ec2bce..948747aac4e2 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -519,15 
+519,20 @@ void ip_options_undo(struct ip_options *opt) } } -static struct ip_options_rcu *ip_options_get_alloc(const int optlen) +int ip_options_get(struct net *net, struct ip_options_rcu **optp, + sockptr_t data, int optlen) { - return kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3), + struct ip_options_rcu *opt; + + opt = kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3), GFP_KERNEL); -} + if (!opt) + return -ENOMEM; + if (optlen && copy_from_sockptr(opt->opt.__data, data, optlen)) { + kfree(opt); + return -EFAULT; + } -static int ip_options_get_finish(struct net *net, struct ip_options_rcu **optp, - struct ip_options_rcu *opt, int optlen) -{ while (optlen & 3) opt->opt.__data[optlen++] = IPOPT_END; opt->opt.optlen = optlen; @@ -540,32 +545,6 @@ static int ip_options_get_finish(struct net *net, struct ip_options_rcu **optp, return 0; } -int ip_options_get_from_user(struct net *net, struct ip_options_rcu **optp, - unsigned char __user *data, int optlen) -{ - struct ip_options_rcu *opt = ip_options_get_alloc(optlen); - - if (!opt) - return -ENOMEM; - if (optlen && copy_from_user(opt->opt.__data, data, optlen)) { - kfree(opt); - return -EFAULT; - } - return ip_options_get_finish(net, optp, opt, optlen); -} - -int ip_options_get(struct net *net, struct ip_options_rcu **optp, - unsigned char *data, int optlen) -{ - struct ip_options_rcu *opt = ip_options_get_alloc(optlen); - - if (!opt) - return -ENOMEM; - if (optlen) - memcpy(opt->opt.__data, data, optlen); - return ip_options_get_finish(net, optp, opt, optlen); -} - void ip_forward_options(struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 17206677d503..61f802d5350c 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -539,6 +539,12 @@ no_route: } EXPORT_SYMBOL(__ip_queue_xmit); +int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) +{ + return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos); +} +EXPORT_SYMBOL(ip_queue_xmit); + static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) { to->pkt_type = from->pkt_type; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 84ec3703c909..d2c223554ff7 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -280,7 +280,8 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, err = cmsg->cmsg_len - sizeof(struct cmsghdr); /* Our caller is responsible for freeing ipc->opt */ - err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg), + err = ip_options_get(net, &ipc->opt, + KERNEL_SOCKPTR(CMSG_DATA(cmsg)), err < 40 ? 
err : 40); if (err) return err; @@ -389,6 +390,18 @@ int ip_ra_control(struct sock *sk, unsigned char on, return 0; } +static void ipv4_icmp_error_rfc4884(const struct sk_buff *skb, + struct sock_ee_data_rfc4884 *out) +{ + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + case ICMP_TIME_EXCEEDED: + case ICMP_PARAMETERPROB: + ip_icmp_error_rfc4884(skb, out, sizeof(struct icmphdr), + icmp_hdr(skb)->un.reserved[1] * 4); + } +} + void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) { @@ -411,6 +424,9 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, serr->port = port; if (skb_pull(skb, payload - skb->data)) { + if (inet_sk(sk)->recverr_rfc4884) + ipv4_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884); + skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb) == 0) return; @@ -679,20 +695,48 @@ Eaddrnotavail: return -EADDRNOTAVAIL; } +static int copy_group_source_from_sockptr(struct group_source_req *greqs, + sockptr_t optval, int optlen) +{ + if (in_compat_syscall()) { + struct compat_group_source_req gr32; + + if (optlen != sizeof(gr32)) + return -EINVAL; + if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) + return -EFAULT; + greqs->gsr_interface = gr32.gsr_interface; + greqs->gsr_group = gr32.gsr_group; + greqs->gsr_source = gr32.gsr_source; + } else { + if (optlen != sizeof(*greqs)) + return -EINVAL; + if (copy_from_sockptr(greqs, optval, sizeof(*greqs))) + return -EFAULT; + } + + return 0; +} + static int do_mcast_group_source(struct sock *sk, int optname, - struct group_source_req *greqs) + sockptr_t optval, int optlen) { + struct group_source_req greqs; struct ip_mreq_source mreqs; struct sockaddr_in *psin; int omode, add, err; - if (greqs->gsr_group.ss_family != AF_INET || - greqs->gsr_source.ss_family != AF_INET) + err = copy_group_source_from_sockptr(&greqs, optval, optlen); + if (err) + return err; + + if (greqs.gsr_group.ss_family != AF_INET || + greqs.gsr_source.ss_family != AF_INET) return -EADDRNOTAVAIL; - psin = (struct sockaddr_in *)&greqs->gsr_group; + psin = (struct sockaddr_in *)&greqs.gsr_group; mreqs.imr_multiaddr = psin->sin_addr.s_addr; - psin = (struct sockaddr_in *)&greqs->gsr_source; + psin = (struct sockaddr_in *)&greqs.gsr_source; mreqs.imr_sourceaddr = psin->sin_addr.s_addr; mreqs.imr_interface = 0; /* use index for mc_source */ @@ -705,25 +749,145 @@ static int do_mcast_group_source(struct sock *sk, int optname, } else if (optname == MCAST_JOIN_SOURCE_GROUP) { struct ip_mreqn mreq; - psin = (struct sockaddr_in *)&greqs->gsr_group; + psin = (struct sockaddr_in *)&greqs.gsr_group; mreq.imr_multiaddr = psin->sin_addr; mreq.imr_address.s_addr = 0; - mreq.imr_ifindex = greqs->gsr_interface; + mreq.imr_ifindex = greqs.gsr_interface; err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); if (err && err != -EADDRINUSE) return err; - greqs->gsr_interface = mreq.imr_ifindex; + greqs.gsr_interface = mreq.imr_ifindex; omode = MCAST_INCLUDE; add = 1; } else /* MCAST_LEAVE_SOURCE_GROUP */ { omode = MCAST_INCLUDE; add = 0; } - return ip_mc_source(add, omode, sk, &mreqs, greqs->gsr_interface); + return ip_mc_source(add, omode, sk, &mreqs, greqs.gsr_interface); +} + +static int ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) +{ + struct group_filter *gsf = NULL; + int err; + + if (optlen < GROUP_FILTER_SIZE(0)) + return -EINVAL; + if (optlen > sysctl_optmem_max) + return -ENOBUFS; + + gsf = memdup_sockptr(optval, optlen); + if (IS_ERR(gsf)) + return PTR_ERR(gsf); + + /* numsrc 
>= (4G-140)/128 overflow in 32 bits */ + err = -ENOBUFS; + if (gsf->gf_numsrc >= 0x1ffffff || + gsf->gf_numsrc > sock_net(sk)->ipv4.sysctl_igmp_max_msf) + goto out_free_gsf; + + err = -EINVAL; + if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) + goto out_free_gsf; + + err = set_mcast_msfilter(sk, gsf->gf_interface, gsf->gf_numsrc, + gsf->gf_fmode, &gsf->gf_group, gsf->gf_slist); +out_free_gsf: + kfree(gsf); + return err; +} + +static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, + int optlen) +{ + const int size0 = offsetof(struct compat_group_filter, gf_slist); + struct compat_group_filter *gf32; + unsigned int n; + void *p; + int err; + + if (optlen < size0) + return -EINVAL; + if (optlen > sysctl_optmem_max - 4) + return -ENOBUFS; + + p = kmalloc(optlen + 4, GFP_KERNEL); + if (!p) + return -ENOMEM; + gf32 = p + 4; /* we want ->gf_group and ->gf_slist aligned */ + + err = -EFAULT; + if (copy_from_sockptr(gf32, optval, optlen)) + goto out_free_gsf; + + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + n = gf32->gf_numsrc; + err = -ENOBUFS; + if (n >= 0x1ffffff) + goto out_free_gsf; + + err = -EINVAL; + if (offsetof(struct compat_group_filter, gf_slist[n]) > optlen) + goto out_free_gsf; + + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + err = -ENOBUFS; + if (n > sock_net(sk)->ipv4.sysctl_igmp_max_msf) + goto out_free_gsf; + err = set_mcast_msfilter(sk, gf32->gf_interface, n, gf32->gf_fmode, + &gf32->gf_group, gf32->gf_slist); +out_free_gsf: + kfree(p); + return err; } -static int do_ip_setsockopt(struct sock *sk, int level, - int optname, char __user *optval, unsigned int optlen) +static int ip_mcast_join_leave(struct sock *sk, int optname, + sockptr_t optval, int optlen) +{ + struct ip_mreqn mreq = { }; + struct sockaddr_in *psin; + struct group_req greq; + + if (optlen < sizeof(struct group_req)) + return -EINVAL; + if (copy_from_sockptr(&greq, optval, sizeof(greq))) + return -EFAULT; + + psin = (struct sockaddr_in *)&greq.gr_group; + if (psin->sin_family != AF_INET) + return -EINVAL; + mreq.imr_multiaddr = psin->sin_addr; + mreq.imr_ifindex = greq.gr_interface; + if (optname == MCAST_JOIN_GROUP) + return ip_mc_join_group(sk, &mreq); + return ip_mc_leave_group(sk, &mreq); +} + +static int compat_ip_mcast_join_leave(struct sock *sk, int optname, + sockptr_t optval, int optlen) +{ + struct compat_group_req greq; + struct ip_mreqn mreq = { }; + struct sockaddr_in *psin; + + if (optlen < sizeof(struct compat_group_req)) + return -EINVAL; + if (copy_from_sockptr(&greq, optval, sizeof(greq))) + return -EFAULT; + + psin = (struct sockaddr_in *)&greq.gr_group; + if (psin->sin_family != AF_INET) + return -EINVAL; + mreq.imr_multiaddr = psin->sin_addr; + mreq.imr_ifindex = greq.gr_interface; + + if (optname == MCAST_JOIN_GROUP) + return ip_mc_join_group(sk, &mreq); + return ip_mc_leave_group(sk, &mreq); +} + +static int do_ip_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); @@ -755,13 +919,14 @@ static int do_ip_setsockopt(struct sock *sk, int level, case IP_RECVORIGDSTADDR: case IP_CHECKSUM: case IP_RECVFRAGSIZE: + case IP_RECVERR_RFC4884: if (optlen >= sizeof(int)) { - if (get_user(val, (int __user *) optval)) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; } else if (optlen >= sizeof(char)) { unsigned char ucval; - if (get_user(ucval, (unsigned char __user *) optval)) + if (copy_from_sockptr(&ucval, optval, sizeof(ucval))) return 
-EFAULT; val = (int) ucval; } @@ -786,8 +951,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, if (optlen > 40) goto e_inval; - err = ip_options_get_from_user(sock_net(sk), &opt, - optval, optlen); + err = ip_options_get(sock_net(sk), &opt, optval, optlen); if (err) break; old = rcu_dereference_protected(inet->inet_opt, @@ -914,6 +1078,11 @@ static int do_ip_setsockopt(struct sock *sk, int level, if (!val) skb_queue_purge(&sk->sk_error_queue); break; + case IP_RECVERR_RFC4884: + if (val < 0 || val > 1) + goto e_inval; + inet->recverr_rfc4884 = !!val; + break; case IP_MULTICAST_TTL: if (sk->sk_type == SOCK_STREAM) goto e_inval; @@ -980,17 +1149,17 @@ static int do_ip_setsockopt(struct sock *sk, int level, err = -EFAULT; if (optlen >= sizeof(struct ip_mreqn)) { - if (copy_from_user(&mreq, optval, sizeof(mreq))) + if (copy_from_sockptr(&mreq, optval, sizeof(mreq))) break; } else { memset(&mreq, 0, sizeof(mreq)); if (optlen >= sizeof(struct ip_mreq)) { - if (copy_from_user(&mreq, optval, - sizeof(struct ip_mreq))) + if (copy_from_sockptr(&mreq, optval, + sizeof(struct ip_mreq))) break; } else if (optlen >= sizeof(struct in_addr)) { - if (copy_from_user(&mreq.imr_address, optval, - sizeof(struct in_addr))) + if (copy_from_sockptr(&mreq.imr_address, optval, + sizeof(struct in_addr))) break; } } @@ -1042,11 +1211,12 @@ static int do_ip_setsockopt(struct sock *sk, int level, goto e_inval; err = -EFAULT; if (optlen >= sizeof(struct ip_mreqn)) { - if (copy_from_user(&mreq, optval, sizeof(mreq))) + if (copy_from_sockptr(&mreq, optval, sizeof(mreq))) break; } else { memset(&mreq, 0, sizeof(mreq)); - if (copy_from_user(&mreq, optval, sizeof(struct ip_mreq))) + if (copy_from_sockptr(&mreq, optval, + sizeof(struct ip_mreq))) break; } @@ -1066,7 +1236,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, err = -ENOBUFS; break; } - msf = memdup_user(optval, optlen); + msf = memdup_sockptr(optval, optlen); if (IS_ERR(msf)) { err = PTR_ERR(msf); break; @@ -1097,7 +1267,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, if (optlen != sizeof(struct ip_mreq_source)) goto e_inval; - if (copy_from_user(&mreqs, optval, sizeof(mreqs))) { + if (copy_from_sockptr(&mreqs, optval, sizeof(mreqs))) { err = -EFAULT; break; } @@ -1127,77 +1297,24 @@ static int do_ip_setsockopt(struct sock *sk, int level, } case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: - { - struct group_req greq; - struct sockaddr_in *psin; - struct ip_mreqn mreq; - - if (optlen < sizeof(struct group_req)) - goto e_inval; - err = -EFAULT; - if (copy_from_user(&greq, optval, sizeof(greq))) - break; - psin = (struct sockaddr_in *)&greq.gr_group; - if (psin->sin_family != AF_INET) - goto e_inval; - memset(&mreq, 0, sizeof(mreq)); - mreq.imr_multiaddr = psin->sin_addr; - mreq.imr_ifindex = greq.gr_interface; - - if (optname == MCAST_JOIN_GROUP) - err = ip_mc_join_group(sk, &mreq); + if (in_compat_syscall()) + err = compat_ip_mcast_join_leave(sk, optname, optval, + optlen); else - err = ip_mc_leave_group(sk, &mreq); + err = ip_mcast_join_leave(sk, optname, optval, optlen); break; - } case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: - { - struct group_source_req greqs; - - if (optlen != sizeof(struct group_source_req)) - goto e_inval; - if (copy_from_user(&greqs, optval, sizeof(greqs))) { - err = -EFAULT; - break; - } - err = do_mcast_group_source(sk, optname, &greqs); + err = do_mcast_group_source(sk, optname, optval, optlen); break; - } case MCAST_MSFILTER: - { - 
struct group_filter *gsf = NULL; - - if (optlen < GROUP_FILTER_SIZE(0)) - goto e_inval; - if (optlen > sysctl_optmem_max) { - err = -ENOBUFS; - break; - } - gsf = memdup_user(optval, optlen); - if (IS_ERR(gsf)) { - err = PTR_ERR(gsf); - break; - } - /* numsrc >= (4G-140)/128 overflow in 32 bits */ - if (gsf->gf_numsrc >= 0x1ffffff || - gsf->gf_numsrc > net->ipv4.sysctl_igmp_max_msf) { - err = -ENOBUFS; - goto mc_msf_out; - } - if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) { - err = -EINVAL; - goto mc_msf_out; - } - err = set_mcast_msfilter(sk, gsf->gf_interface, - gsf->gf_numsrc, gsf->gf_fmode, - &gsf->gf_group, gsf->gf_slist); -mc_msf_out: - kfree(gsf); + if (in_compat_syscall()) + err = compat_ip_set_mcast_msfilter(sk, optval, optlen); + else + err = ip_set_mcast_msfilter(sk, optval, optlen); break; - } case IP_MULTICAST_ALL: if (optlen < 1) goto e_inval; @@ -1296,8 +1413,8 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) skb_dst_drop(skb); } -int ip_setsockopt(struct sock *sk, int level, - int optname, char __user *optval, unsigned int optlen) +int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen) { int err; @@ -1322,138 +1439,6 @@ int ip_setsockopt(struct sock *sk, int level, } EXPORT_SYMBOL(ip_setsockopt); -#ifdef CONFIG_COMPAT -int compat_ip_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) -{ - int err; - - if (level != SOL_IP) - return -ENOPROTOOPT; - - switch (optname) { - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - { - struct compat_group_req __user *gr32 = (void __user *)optval; - struct group_req greq; - struct sockaddr_in *psin = (struct sockaddr_in *)&greq.gr_group; - struct ip_mreqn mreq; - - if (optlen < sizeof(struct compat_group_req)) - return -EINVAL; - - if (get_user(greq.gr_interface, &gr32->gr_interface) || - copy_from_user(&greq.gr_group, &gr32->gr_group, - sizeof(greq.gr_group))) - return -EFAULT; - - if (psin->sin_family != AF_INET) - return -EINVAL; - - memset(&mreq, 0, sizeof(mreq)); - mreq.imr_multiaddr = psin->sin_addr; - mreq.imr_ifindex = greq.gr_interface; - - rtnl_lock(); - lock_sock(sk); - if (optname == MCAST_JOIN_GROUP) - err = ip_mc_join_group(sk, &mreq); - else - err = ip_mc_leave_group(sk, &mreq); - release_sock(sk); - rtnl_unlock(); - return err; - } - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - { - struct compat_group_source_req __user *gsr32 = (void __user *)optval; - struct group_source_req greqs; - - if (optlen != sizeof(struct compat_group_source_req)) - return -EINVAL; - - if (get_user(greqs.gsr_interface, &gsr32->gsr_interface) || - copy_from_user(&greqs.gsr_group, &gsr32->gsr_group, - sizeof(greqs.gsr_group)) || - copy_from_user(&greqs.gsr_source, &gsr32->gsr_source, - sizeof(greqs.gsr_source))) - return -EFAULT; - - rtnl_lock(); - lock_sock(sk); - err = do_mcast_group_source(sk, optname, &greqs); - release_sock(sk); - rtnl_unlock(); - return err; - } - case MCAST_MSFILTER: - { - const int size0 = offsetof(struct compat_group_filter, gf_slist); - struct compat_group_filter *gf32; - unsigned int n; - void *p; - - if (optlen < size0) - return -EINVAL; - if (optlen > sysctl_optmem_max - 4) - return -ENOBUFS; - - p = kmalloc(optlen + 4, GFP_KERNEL); - if (!p) - return -ENOMEM; - gf32 = p + 4; /* we want ->gf_group and ->gf_slist aligned */ - if (copy_from_user(gf32, optval, optlen)) { - err = -EFAULT; - goto mc_msf_out; - } - - n = gf32->gf_numsrc; 
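
The compat_ip_setsockopt() removal below relies on the in_compat_syscall() dispatch visible in the MCAST_MSFILTER and MCAST_JOIN_GROUP cases above: one native entry point, with the 32-bit layout handled where the structure is parsed. A minimal sketch of that pattern; all foo_* names are hypothetical stand-ins for helpers such as ip_set_mcast_msfilter()/compat_ip_set_mcast_msfilter():

        #include <linux/compat.h>
        #include <linux/errno.h>
        #include <linux/in.h>
        #include <linux/sockptr.h>

        struct sock;

        /* hypothetical per-layout parsers */
        static int foo_set_msfilter(struct sock *sk, sockptr_t optval,
                                    unsigned int optlen);
        static int compat_foo_set_msfilter(struct sock *sk, sockptr_t optval,
                                           unsigned int optlen);

        static int foo_setsockopt(struct sock *sk, int optname,
                                  sockptr_t optval, unsigned int optlen)
        {
                switch (optname) {
                case MCAST_MSFILTER:
                        /* single native entry point; the 32-bit layout is
                         * picked where the structure is actually parsed
                         */
                        if (in_compat_syscall())
                                return compat_foo_set_msfilter(sk, optval,
                                                               optlen);
                        return foo_set_msfilter(sk, optval, optlen);
                }
                return -ENOPROTOOPT;
        }
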
- /* numsrc >= (4G-140)/128 overflow in 32 bits */ - if (n >= 0x1ffffff) { - err = -ENOBUFS; - goto mc_msf_out; - } - if (offsetof(struct compat_group_filter, gf_slist[n]) > optlen) { - err = -EINVAL; - goto mc_msf_out; - } - - rtnl_lock(); - lock_sock(sk); - /* numsrc >= (4G-140)/128 overflow in 32 bits */ - if (n > sock_net(sk)->ipv4.sysctl_igmp_max_msf) - err = -ENOBUFS; - else - err = set_mcast_msfilter(sk, gf32->gf_interface, - n, gf32->gf_fmode, - &gf32->gf_group, gf32->gf_slist); - release_sock(sk); - rtnl_unlock(); -mc_msf_out: - kfree(p); - return err; - } - } - - err = do_ip_setsockopt(sk, level, optname, optval, optlen); -#ifdef CONFIG_NETFILTER - /* we need to exclude all possible ENOPROTOOPTs except default case */ - if (err == -ENOPROTOOPT && optname != IP_HDRINCL && - optname != IP_IPSEC_POLICY && - optname != IP_XFRM_POLICY && - !ip_mroute_opt(optname)) - err = compat_nf_setsockopt(sk, PF_INET, optname, optval, - optlen); -#endif - return err; -} -EXPORT_SYMBOL(compat_ip_setsockopt); -#endif - /* * Get the options. Note for future reference. The GET of IP options gets * the _received_ ones. The set sets the _sent_ ones. @@ -1469,8 +1454,67 @@ static bool getsockopt_needs_rtnl(int optname) return false; } +static int ip_get_mcast_msfilter(struct sock *sk, void __user *optval, + int __user *optlen, int len) +{ + const int size0 = offsetof(struct group_filter, gf_slist); + struct group_filter __user *p = optval; + struct group_filter gsf; + int num; + int err; + + if (len < size0) + return -EINVAL; + if (copy_from_user(&gsf, p, size0)) + return -EFAULT; + + num = gsf.gf_numsrc; + err = ip_mc_gsfget(sk, &gsf, p->gf_slist); + if (err) + return err; + if (gsf.gf_numsrc < num) + num = gsf.gf_numsrc; + if (put_user(GROUP_FILTER_SIZE(num), optlen) || + copy_to_user(p, &gsf, size0)) + return -EFAULT; + return 0; +} + +static int compat_ip_get_mcast_msfilter(struct sock *sk, void __user *optval, + int __user *optlen, int len) +{ + const int size0 = offsetof(struct compat_group_filter, gf_slist); + struct compat_group_filter __user *p = optval; + struct compat_group_filter gf32; + struct group_filter gf; + int num; + int err; + + if (len < size0) + return -EINVAL; + if (copy_from_user(&gf32, p, size0)) + return -EFAULT; + + gf.gf_interface = gf32.gf_interface; + gf.gf_fmode = gf32.gf_fmode; + num = gf.gf_numsrc = gf32.gf_numsrc; + gf.gf_group = gf32.gf_group; + + err = ip_mc_gsfget(sk, &gf, p->gf_slist); + if (err) + return err; + if (gf.gf_numsrc < num) + num = gf.gf_numsrc; + len = GROUP_FILTER_SIZE(num) - (sizeof(gf) - sizeof(gf32)); + if (put_user(len, optlen) || + put_user(gf.gf_fmode, &p->gf_fmode) || + put_user(gf.gf_numsrc, &p->gf_numsrc)) + return -EFAULT; + return 0; +} + static int do_ip_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen, unsigned int flags) + char __user *optval, int __user *optlen) { struct inet_sock *inet = inet_sk(sk); bool needs_rtnl = getsockopt_needs_rtnl(optname); @@ -1588,6 +1632,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_RECVERR: val = inet->recverr; break; + case IP_RECVERR_RFC4884: + val = inet->recverr_rfc4884; + break; case IP_MULTICAST_TTL: val = inet->mc_ttl; break; @@ -1627,31 +1674,12 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, goto out; } case MCAST_MSFILTER: - { - struct group_filter __user *p = (void __user *)optval; - struct group_filter gsf; - const int size0 = offsetof(struct group_filter, gf_slist); - int num; - - if (len < 
size0) { - err = -EINVAL; - goto out; - } - if (copy_from_user(&gsf, p, size0)) { - err = -EFAULT; - goto out; - } - num = gsf.gf_numsrc; - err = ip_mc_gsfget(sk, &gsf, p->gf_slist); - if (err) - goto out; - if (gsf.gf_numsrc < num) - num = gsf.gf_numsrc; - if (put_user(GROUP_FILTER_SIZE(num), optlen) || - copy_to_user(p, &gsf, size0)) - err = -EFAULT; + if (in_compat_syscall()) + err = compat_ip_get_mcast_msfilter(sk, optval, optlen, + len); + else + err = ip_get_mcast_msfilter(sk, optval, optlen, len); goto out; - } case IP_MULTICAST_ALL: val = inet->mc_all; break; @@ -1667,7 +1695,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, msg.msg_control_is_user = true; msg.msg_control_user = optval; msg.msg_controllen = len; - msg.msg_flags = flags; + msg.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0; if (inet->cmsg_flags & IP_CMSG_PKTINFO) { struct in_pktinfo info; @@ -1731,7 +1759,8 @@ int ip_getsockopt(struct sock *sk, int level, { int err; - err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0); + err = do_ip_getsockopt(sk, level, optname, optval, optlen); + #if IS_ENABLED(CONFIG_BPFILTER_UMH) if (optname >= BPFILTER_IPT_SO_GET_INFO && optname < BPFILTER_IPT_GET_MAX) @@ -1755,79 +1784,3 @@ int ip_getsockopt(struct sock *sk, int level, return err; } EXPORT_SYMBOL(ip_getsockopt); - -#ifdef CONFIG_COMPAT -int compat_ip_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - int err; - - if (optname == MCAST_MSFILTER) { - const int size0 = offsetof(struct compat_group_filter, gf_slist); - struct compat_group_filter __user *p = (void __user *)optval; - struct compat_group_filter gf32; - struct group_filter gf; - int ulen, err; - int num; - - if (level != SOL_IP) - return -EOPNOTSUPP; - - if (get_user(ulen, optlen)) - return -EFAULT; - - if (ulen < size0) - return -EINVAL; - - if (copy_from_user(&gf32, p, size0)) - return -EFAULT; - - gf.gf_interface = gf32.gf_interface; - gf.gf_fmode = gf32.gf_fmode; - num = gf.gf_numsrc = gf32.gf_numsrc; - gf.gf_group = gf32.gf_group; - - rtnl_lock(); - lock_sock(sk); - err = ip_mc_gsfget(sk, &gf, p->gf_slist); - release_sock(sk); - rtnl_unlock(); - if (err) - return err; - if (gf.gf_numsrc < num) - num = gf.gf_numsrc; - ulen = GROUP_FILTER_SIZE(num) - (sizeof(gf) - sizeof(gf32)); - if (put_user(ulen, optlen) || - put_user(gf.gf_fmode, &p->gf_fmode) || - put_user(gf.gf_numsrc, &p->gf_numsrc)) - return -EFAULT; - return 0; - } - - err = do_ip_getsockopt(sk, level, optname, optval, optlen, - MSG_CMSG_COMPAT); - -#if IS_ENABLED(CONFIG_BPFILTER_UMH) - if (optname >= BPFILTER_IPT_SO_GET_INFO && - optname < BPFILTER_IPT_GET_MAX) - err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen); -#endif -#ifdef CONFIG_NETFILTER - /* we need to exclude all possible ENOPROTOOPTs except default case */ - if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && - !ip_mroute_opt(optname)) { - int len; - - if (get_user(len, optlen)) - return -EFAULT; - - err = compat_nf_getsockopt(sk, PF_INET, optname, optval, &len); - if (err >= 0) - err = put_user(len, optlen); - return err; - } -#endif - return err; -} -EXPORT_SYMBOL(compat_ip_getsockopt); -#endif diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index f8b419e2475c..75c6013ff9a4 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -25,6 +25,7 @@ #include <net/protocol.h> #include <net/ip_tunnels.h> #include <net/ip6_tunnel.h> +#include <net/ip6_checksum.h> #include <net/arp.h> #include <net/checksum.h> 
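
The <net/ip6_checksum.h> include added above supplies csum_ipv6_magic(), which the new PMTUD helpers further down use to checksum the ICMPv6 Packet Too Big reply they build by hand. Reduced to its core, the pattern looks as follows; foo_icmp6_csum() is a hypothetical wrapper and assumes icmp6_cksum was zeroed when the header was filled in:

        #include <linux/icmpv6.h>
        #include <linux/ipv6.h>
        #include <net/ip6_checksum.h>

        static void foo_icmp6_csum(struct ipv6hdr *nip6h,
                                   struct icmp6hdr *icmp6h, int len)
        {
                /* sum the ICMPv6 header and payload (icmp6_cksum still 0)... */
                __wsum csum = csum_partial(icmp6h, len, 0);

                /* ...then fold in the IPv6 pseudo-header (RFC 8200, sec. 8.1) */
                icmp6h->icmp6_cksum = csum_ipv6_magic(&nip6h->saddr,
                                                      &nip6h->daddr, len,
                                                      IPPROTO_ICMPV6, csum);
        }
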
#include <net/dsfield.h> @@ -184,6 +185,250 @@ int iptunnel_handle_offloads(struct sk_buff *skb, } EXPORT_SYMBOL_GPL(iptunnel_handle_offloads); +/** + * iptunnel_pmtud_build_icmp() - Build ICMP error message for PMTUD + * @skb: Original packet with L2 header + * @mtu: MTU value for ICMP error + * + * Return: length on success, negative error code if message couldn't be built. + */ +static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu) +{ + const struct iphdr *iph = ip_hdr(skb); + struct icmphdr *icmph; + struct iphdr *niph; + struct ethhdr eh; + int len, err; + + if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr))) + return -EINVAL; + + skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN); + pskb_pull(skb, ETH_HLEN); + skb_reset_network_header(skb); + + err = pskb_trim(skb, 576 - sizeof(*niph) - sizeof(*icmph)); + if (err) + return err; + + len = skb->len + sizeof(*icmph); + err = skb_cow(skb, sizeof(*niph) + sizeof(*icmph) + ETH_HLEN); + if (err) + return err; + + icmph = skb_push(skb, sizeof(*icmph)); + *icmph = (struct icmphdr) { + .type = ICMP_DEST_UNREACH, + .code = ICMP_FRAG_NEEDED, + .checksum = 0, + .un.frag.__unused = 0, + .un.frag.mtu = ntohs(mtu), + }; + icmph->checksum = ip_compute_csum(icmph, len); + skb_reset_transport_header(skb); + + niph = skb_push(skb, sizeof(*niph)); + *niph = (struct iphdr) { + .ihl = sizeof(*niph) / 4u, + .version = 4, + .tos = 0, + .tot_len = htons(len + sizeof(*niph)), + .id = 0, + .frag_off = htons(IP_DF), + .ttl = iph->ttl, + .protocol = IPPROTO_ICMP, + .saddr = iph->daddr, + .daddr = iph->saddr, + }; + ip_send_check(niph); + skb_reset_network_header(skb); + + skb->ip_summed = CHECKSUM_NONE; + + eth_header(skb, skb->dev, htons(eh.h_proto), eh.h_source, eh.h_dest, 0); + skb_reset_mac_header(skb); + + return skb->len; +} + +/** + * iptunnel_pmtud_check_icmp() - Trigger ICMP reply if needed and allowed + * @skb: Buffer being sent by encapsulation, L2 headers expected + * @mtu: Network MTU for path + * + * Return: 0 for no ICMP reply, length if built, negative value on error. + */ +static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu) +{ + const struct icmphdr *icmph = icmp_hdr(skb); + const struct iphdr *iph = ip_hdr(skb); + + if (mtu <= 576 || iph->frag_off != htons(IP_DF)) + return 0; + + if (ipv4_is_lbcast(iph->daddr) || ipv4_is_multicast(iph->daddr) || + ipv4_is_zeronet(iph->saddr) || ipv4_is_loopback(iph->saddr) || + ipv4_is_lbcast(iph->saddr) || ipv4_is_multicast(iph->saddr)) + return 0; + + if (iph->protocol == IPPROTO_ICMP && icmp_is_err(icmph->type)) + return 0; + + return iptunnel_pmtud_build_icmp(skb, mtu); +} + +#if IS_ENABLED(CONFIG_IPV6) +/** + * iptunnel_pmtud_build_icmpv6() - Build ICMPv6 error message for PMTUD + * @skb: Original packet with L2 header + * @mtu: MTU value for ICMPv6 error + * + * Return: length on success, negative error code if message couldn't be built. 
+ */ +static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct icmp6hdr *icmp6h; + struct ipv6hdr *nip6h; + struct ethhdr eh; + int len, err; + __wsum csum; + + if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr))) + return -EINVAL; + + skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN); + pskb_pull(skb, ETH_HLEN); + skb_reset_network_header(skb); + + err = pskb_trim(skb, IPV6_MIN_MTU - sizeof(*nip6h) - sizeof(*icmp6h)); + if (err) + return err; + + len = skb->len + sizeof(*icmp6h); + err = skb_cow(skb, sizeof(*nip6h) + sizeof(*icmp6h) + ETH_HLEN); + if (err) + return err; + + icmp6h = skb_push(skb, sizeof(*icmp6h)); + *icmp6h = (struct icmp6hdr) { + .icmp6_type = ICMPV6_PKT_TOOBIG, + .icmp6_code = 0, + .icmp6_cksum = 0, + .icmp6_mtu = htonl(mtu), + }; + skb_reset_transport_header(skb); + + nip6h = skb_push(skb, sizeof(*nip6h)); + *nip6h = (struct ipv6hdr) { + .priority = 0, + .version = 6, + .flow_lbl = { 0 }, + .payload_len = htons(len), + .nexthdr = IPPROTO_ICMPV6, + .hop_limit = ip6h->hop_limit, + .saddr = ip6h->daddr, + .daddr = ip6h->saddr, + }; + skb_reset_network_header(skb); + + csum = csum_partial(icmp6h, len, 0); + icmp6h->icmp6_cksum = csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr, len, + IPPROTO_ICMPV6, csum); + + skb->ip_summed = CHECKSUM_NONE; + + eth_header(skb, skb->dev, htons(eh.h_proto), eh.h_source, eh.h_dest, 0); + skb_reset_mac_header(skb); + + return skb->len; +} + +/** + * iptunnel_pmtud_check_icmpv6() - Trigger ICMPv6 reply if needed and allowed + * @skb: Buffer being sent by encapsulation, L2 headers expected + * @mtu: Network MTU for path + * + * Return: 0 for no ICMPv6 reply, length if built, negative value on error. + */ +static int iptunnel_pmtud_check_icmpv6(struct sk_buff *skb, int mtu) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + int stype = ipv6_addr_type(&ip6h->saddr); + u8 proto = ip6h->nexthdr; + __be16 frag_off; + int offset; + + if (mtu <= IPV6_MIN_MTU) + return 0; + + if (stype == IPV6_ADDR_ANY || stype == IPV6_ADDR_MULTICAST || + stype == IPV6_ADDR_LOOPBACK) + return 0; + + offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &proto, + &frag_off); + if (offset < 0 || (frag_off & htons(~0x7))) + return 0; + + if (proto == IPPROTO_ICMPV6) { + struct icmp6hdr *icmp6h; + + if (!pskb_may_pull(skb, skb_network_header(skb) + + offset + 1 - skb->data)) + return 0; + + icmp6h = (struct icmp6hdr *)(skb_network_header(skb) + offset); + if (icmpv6_is_err(icmp6h->icmp6_type) || + icmp6h->icmp6_type == NDISC_REDIRECT) + return 0; + } + + return iptunnel_pmtud_build_icmpv6(skb, mtu); +} +#endif /* IS_ENABLED(CONFIG_IPV6) */ + +/** + * skb_tunnel_check_pmtu() - Check, update PMTU and trigger ICMP reply as needed + * @skb: Buffer being sent by encapsulation, L2 headers expected + * @encap_dst: Destination for tunnel encapsulation (outer IP) + * @headroom: Encapsulation header size, bytes + * @reply: Build matching ICMP or ICMPv6 message as a result + * + * L2 tunnel implementations that can carry IP and can be directly bridged + * (currently UDP tunnels) can't always rely on IP forwarding paths to handle + * PMTU discovery. In the bridged case, ICMP or ICMPv6 messages need to be built + * based on payload and sent back by the encapsulation itself. + * + * For routable interfaces, we just need to update the PMTU for the destination. 
+ * + * Return: 0 if ICMP error not needed, length if built, negative value on error + */ +int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, + int headroom, bool reply) +{ + u32 mtu = dst_mtu(encap_dst) - headroom; + + if ((skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) || + (!skb_is_gso(skb) && (skb->len - skb_mac_header_len(skb)) <= mtu)) + return 0; + + skb_dst_update_pmtu_no_confirm(skb, mtu); + + if (!reply || skb->pkt_type == PACKET_HOST) + return 0; + + if (skb->protocol == htons(ETH_P_IP)) + return iptunnel_pmtud_check_icmp(skb, mtu); + +#if IS_ENABLED(CONFIG_IPV6) + if (skb->protocol == htons(ETH_P_IPV6)) + return iptunnel_pmtud_check_icmpv6(skb, mtu); +#endif + return 0; +} +EXPORT_SYMBOL(skb_tunnel_check_pmtu); + /* Often modified stats are per cpu, other are shared (netdev->stats) */ void ip_tunnel_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *tot) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 460ca1099e8a..49daaed89764 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -91,32 +91,6 @@ static int vti_rcv_proto(struct sk_buff *skb) return vti_rcv(skb, 0, false); } -static int vti_rcv_tunnel(struct sk_buff *skb) -{ - struct ip_tunnel_net *itn = net_generic(dev_net(skb->dev), vti_net_id); - const struct iphdr *iph = ip_hdr(skb); - struct ip_tunnel *tunnel; - - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, - iph->saddr, iph->daddr, 0); - if (tunnel) { - struct tnl_ptk_info tpi = { - .proto = htons(ETH_P_IP), - }; - - if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) - goto drop; - if (iptunnel_pull_header(skb, 0, tpi.proto, false)) - goto drop; - return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, false); - } - - return -EINVAL; -drop: - kfree_skb(skb); - return 0; -} - static int vti_rcv_cb(struct sk_buff *skb, int err) { unsigned short family; @@ -244,12 +218,15 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, } dst_hold(dst); - dst = xfrm_lookup(tunnel->net, dst, fl, NULL, 0); + dst = xfrm_lookup_route(tunnel->net, dst, fl, NULL, 0); if (IS_ERR(dst)) { dev->stats.tx_carrier_errors++; goto tx_error_icmp; } + if (dst->flags & DST_XFRM_QUEUE) + goto queued; + if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) { dev->stats.tx_carrier_errors++; dst_release(dst); @@ -281,6 +258,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } +queued: skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev))); skb_dst_set(skb, dst); skb->dev = skb_dst(skb)->dev; @@ -496,11 +474,29 @@ static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = { .priority = 100, }; -static struct xfrm_tunnel ipip_handler __read_mostly = { +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) +static int vti_rcv_tunnel(struct sk_buff *skb) +{ + XFRM_SPI_SKB_CB(skb)->family = AF_INET; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + + return vti_input(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr, 0, false); +} + +static struct xfrm_tunnel vti_ipip_handler __read_mostly = { + .handler = vti_rcv_tunnel, + .cb_handler = vti_rcv_cb, + .err_handler = vti4_err, + .priority = 0, +}; + +static struct xfrm_tunnel vti_ipip6_handler __read_mostly = { .handler = vti_rcv_tunnel, + .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 0, }; +#endif static int __net_init vti_init_net(struct net *net) { @@ -670,10 +666,17 @@ static int __init vti_init(void) if (err < 0) goto xfrm_proto_comp_failed; +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) msg = 
"ipip tunnel"; - err = xfrm4_tunnel_register(&ipip_handler, AF_INET); + err = xfrm4_tunnel_register(&vti_ipip_handler, AF_INET); + if (err < 0) + goto xfrm_tunnel_ipip_failed; +#if IS_ENABLED(CONFIG_IPV6) + err = xfrm4_tunnel_register(&vti_ipip6_handler, AF_INET6); if (err < 0) - goto xfrm_tunnel_failed; + goto xfrm_tunnel_ipip6_failed; +#endif +#endif msg = "netlink interface"; err = rtnl_link_register(&vti_link_ops); @@ -683,8 +686,14 @@ static int __init vti_init(void) return err; rtnl_link_failed: - xfrm4_tunnel_deregister(&ipip_handler, AF_INET); -xfrm_tunnel_failed: +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) +#if IS_ENABLED(CONFIG_IPV6) + xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6); +xfrm_tunnel_ipip6_failed: +#endif + xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET); +xfrm_tunnel_ipip_failed: +#endif xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); xfrm_proto_comp_failed: xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); @@ -700,7 +709,12 @@ pernet_dev_failed: static void __exit vti_fini(void) { rtnl_link_unregister(&vti_link_ops); - xfrm4_tunnel_deregister(&ipip_handler, AF_INET); +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) +#if IS_ENABLED(CONFIG_IPV6) + xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6); +#endif + xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET); +#endif xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 59bfa3825810..b42683212c65 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -72,6 +72,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) t->props.flags = x->props.flags; t->props.extra_flags = x->props.extra_flags; memcpy(&t->mark, &x->mark, sizeof(t->mark)); + t->if_id = x->if_id; if (xfrm_init_state(t)) goto error; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index f5c7a58844a4..876fd6ff1ff9 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -636,7 +636,10 @@ static int call_ipmr_mfc_entry_notifiers(struct net *net, /** * vif_delete - Delete a VIF entry + * @mrt: Table to delete from + * @vifi: VIF identifier to delete * @notify: Set to 1, if the caller is a notifier_call + * @head: if unregistering the VIF, place it on this queue */ static int vif_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head) @@ -1338,7 +1341,7 @@ static void mrtsock_destruct(struct sock *sk) * MOSPF/PIM router set up we can clean this up. 
*/ -int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, +int ip_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { struct net *net = sock_net(sk); @@ -1410,7 +1413,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, ret = -EINVAL; break; } - if (copy_from_user(&vif, optval, sizeof(vif))) { + if (copy_from_sockptr(&vif, optval, sizeof(vif))) { ret = -EFAULT; break; } @@ -1438,7 +1441,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, ret = -EINVAL; break; } - if (copy_from_user(&mfc, optval, sizeof(mfc))) { + if (copy_from_sockptr(&mfc, optval, sizeof(mfc))) { ret = -EFAULT; break; } @@ -1456,7 +1459,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, ret = -EINVAL; break; } - if (get_user(val, (int __user *)optval)) { + if (copy_from_sockptr(&val, optval, sizeof(val))) { ret = -EFAULT; break; } @@ -1468,7 +1471,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, ret = -EINVAL; break; } - if (get_user(val, (int __user *)optval)) { + if (copy_from_sockptr(&val, optval, sizeof(val))) { ret = -EFAULT; break; } @@ -1483,7 +1486,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, ret = -EINVAL; break; } - if (get_user(val, (int __user *)optval)) { + if (copy_from_sockptr(&val, optval, sizeof(val))) { ret = -EFAULT; break; } @@ -1505,7 +1508,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, ret = -EINVAL; break; } - if (get_user(uval, (u32 __user *)optval)) { + if (copy_from_sockptr(&uval, optval, sizeof(uval))) { ret = -EFAULT; break; } diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index b167f4a5b684..d1e04d2b5170 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -787,8 +787,7 @@ static int compat_table_info(const struct xt_table_info *info, } #endif -static int get_info(struct net *net, void __user *user, - const int *len, int compat) +static int get_info(struct net *net, void __user *user, const int *len) { char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; @@ -802,7 +801,7 @@ static int get_info(struct net *net, void __user *user, name[XT_TABLE_MAXNAMELEN-1] = '\0'; #ifdef CONFIG_COMPAT - if (compat) + if (in_compat_syscall()) xt_compat_lock(NFPROTO_ARP); #endif t = xt_request_find_table_lock(net, NFPROTO_ARP, name); @@ -812,7 +811,7 @@ static int get_info(struct net *net, void __user *user, #ifdef CONFIG_COMPAT struct xt_table_info tmp; - if (compat) { + if (in_compat_syscall()) { ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(NFPROTO_ARP); private = &tmp; @@ -837,7 +836,7 @@ static int get_info(struct net *net, void __user *user, } else ret = PTR_ERR(t); #ifdef CONFIG_COMPAT - if (compat) + if (in_compat_syscall()) xt_compat_unlock(NFPROTO_ARP); #endif return ret; @@ -948,8 +947,7 @@ static int __do_replace(struct net *net, const char *name, return ret; } -static int do_replace(struct net *net, const void __user *user, - unsigned int len) +static int do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct arpt_replace tmp; @@ -957,7 +955,7 @@ static int do_replace(struct net *net, const void __user *user, void *loc_cpu_entry; struct arpt_entry *iter; - if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ @@ -973,8 +971,8 @@ static int do_replace(struct net *net, 
const void __user *user, return -ENOMEM; loc_cpu_entry = newinfo->entries; - if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), - tmp.size) != 0) { + if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), + tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } @@ -997,8 +995,7 @@ static int do_replace(struct net *net, const void __user *user, return ret; } -static int do_add_counters(struct net *net, const void __user *user, - unsigned int len, int compat) +static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len) { unsigned int i; struct xt_counters_info tmp; @@ -1009,7 +1006,7 @@ static int do_add_counters(struct net *net, const void __user *user, struct arpt_entry *iter; unsigned int addend; - paddc = xt_copy_counters_from_user(user, len, &tmp, compat); + paddc = xt_copy_counters(arg, len, &tmp); if (IS_ERR(paddc)) return PTR_ERR(paddc); @@ -1246,8 +1243,7 @@ out_unlock: return ret; } -static int compat_do_replace(struct net *net, void __user *user, - unsigned int len) +static int compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct compat_arpt_replace tmp; @@ -1255,7 +1251,7 @@ static int compat_do_replace(struct net *net, void __user *user, void *loc_cpu_entry; struct arpt_entry *iter; - if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ @@ -1271,7 +1267,8 @@ static int compat_do_replace(struct net *net, void __user *user, return -ENOMEM; loc_cpu_entry = newinfo->entries; - if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { + if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), + tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } @@ -1294,30 +1291,6 @@ static int compat_do_replace(struct net *net, void __user *user, return ret; } -static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, - unsigned int len) -{ - int ret; - - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - switch (cmd) { - case ARPT_SO_SET_REPLACE: - ret = compat_do_replace(sock_net(sk), user, len); - break; - - case ARPT_SO_SET_ADD_COUNTERS: - ret = do_add_counters(sock_net(sk), user, len, 1); - break; - - default: - ret = -EINVAL; - } - - return ret; -} - static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr, compat_uint_t *size, struct xt_counters *counters, @@ -1425,32 +1398,10 @@ static int compat_get_entries(struct net *net, xt_compat_unlock(NFPROTO_ARP); return ret; } - -static int do_arpt_get_ctl(struct sock *, int, void __user *, int *); - -static int compat_do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, - int *len) -{ - int ret; - - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - switch (cmd) { - case ARPT_SO_GET_INFO: - ret = get_info(sock_net(sk), user, len, 1); - break; - case ARPT_SO_GET_ENTRIES: - ret = compat_get_entries(sock_net(sk), user, len); - break; - default: - ret = do_arpt_get_ctl(sk, cmd, user, len); - } - return ret; -} #endif -static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +static int do_arpt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, + unsigned int len) { int ret; @@ -1459,11 +1410,16 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned switch (cmd) { case ARPT_SO_SET_REPLACE: - ret = do_replace(sock_net(sk), user, len); +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = compat_do_replace(sock_net(sk), arg, 
len); + else +#endif + ret = do_replace(sock_net(sk), arg, len); break; case ARPT_SO_SET_ADD_COUNTERS: - ret = do_add_counters(sock_net(sk), user, len, 0); + ret = do_add_counters(sock_net(sk), arg, len); break; default: @@ -1482,11 +1438,16 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len switch (cmd) { case ARPT_SO_GET_INFO: - ret = get_info(sock_net(sk), user, len, 0); + ret = get_info(sock_net(sk), user, len); break; case ARPT_SO_GET_ENTRIES: - ret = get_entries(sock_net(sk), user, len); +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = compat_get_entries(sock_net(sk), user, len); + else +#endif + ret = get_entries(sock_net(sk), user, len); break; case ARPT_SO_GET_REVISION_TARGET: { @@ -1610,15 +1571,9 @@ static struct nf_sockopt_ops arpt_sockopts = { .set_optmin = ARPT_BASE_CTL, .set_optmax = ARPT_SO_SET_MAX+1, .set = do_arpt_set_ctl, -#ifdef CONFIG_COMPAT - .compat_set = compat_do_arpt_set_ctl, -#endif .get_optmin = ARPT_BASE_CTL, .get_optmax = ARPT_SO_GET_MAX+1, .get = do_arpt_get_ctl, -#ifdef CONFIG_COMPAT - .compat_get = compat_do_arpt_get_ctl, -#endif .owner = THIS_MODULE, }; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 5bf9fa06aee0..f15bc21d7301 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -944,8 +944,7 @@ static int compat_table_info(const struct xt_table_info *info, } #endif -static int get_info(struct net *net, void __user *user, - const int *len, int compat) +static int get_info(struct net *net, void __user *user, const int *len) { char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; @@ -959,7 +958,7 @@ static int get_info(struct net *net, void __user *user, name[XT_TABLE_MAXNAMELEN-1] = '\0'; #ifdef CONFIG_COMPAT - if (compat) + if (in_compat_syscall()) xt_compat_lock(AF_INET); #endif t = xt_request_find_table_lock(net, AF_INET, name); @@ -969,7 +968,7 @@ static int get_info(struct net *net, void __user *user, #ifdef CONFIG_COMPAT struct xt_table_info tmp; - if (compat) { + if (in_compat_syscall()) { ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(AF_INET); private = &tmp; @@ -995,7 +994,7 @@ static int get_info(struct net *net, void __user *user, } else ret = PTR_ERR(t); #ifdef CONFIG_COMPAT - if (compat) + if (in_compat_syscall()) xt_compat_unlock(AF_INET); #endif return ret; @@ -1103,7 +1102,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, } static int -do_replace(struct net *net, const void __user *user, unsigned int len) +do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct ipt_replace tmp; @@ -1111,7 +1110,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len) void *loc_cpu_entry; struct ipt_entry *iter; - if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ @@ -1127,8 +1126,8 @@ do_replace(struct net *net, const void __user *user, unsigned int len) return -ENOMEM; loc_cpu_entry = newinfo->entries; - if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), - tmp.size) != 0) { + if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), + tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } @@ -1152,8 +1151,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len) } static int -do_add_counters(struct net *net, const void __user *user, - unsigned int len, int compat) +do_add_counters(struct net *net, sockptr_t arg, unsigned int len) { unsigned int i; struct 
xt_counters_info tmp; @@ -1164,7 +1162,7 @@ do_add_counters(struct net *net, const void __user *user, struct ipt_entry *iter; unsigned int addend; - paddc = xt_copy_counters_from_user(user, len, &tmp, compat); + paddc = xt_copy_counters(arg, len, &tmp); if (IS_ERR(paddc)) return PTR_ERR(paddc); @@ -1486,7 +1484,7 @@ out_unlock: } static int -compat_do_replace(struct net *net, void __user *user, unsigned int len) +compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct compat_ipt_replace tmp; @@ -1494,7 +1492,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) void *loc_cpu_entry; struct ipt_entry *iter; - if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ @@ -1510,8 +1508,8 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return -ENOMEM; loc_cpu_entry = newinfo->entries; - if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), - tmp.size) != 0) { + if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), + tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } @@ -1534,31 +1532,6 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return ret; } -static int -compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, - unsigned int len) -{ - int ret; - - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - switch (cmd) { - case IPT_SO_SET_REPLACE: - ret = compat_do_replace(sock_net(sk), user, len); - break; - - case IPT_SO_SET_ADD_COUNTERS: - ret = do_add_counters(sock_net(sk), user, len, 1); - break; - - default: - ret = -EINVAL; - } - - return ret; -} - struct compat_ipt_get_entries { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; @@ -1634,33 +1607,10 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, xt_compat_unlock(AF_INET); return ret; } - -static int do_ipt_get_ctl(struct sock *, int, void __user *, int *); - -static int -compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) -{ - int ret; - - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - switch (cmd) { - case IPT_SO_GET_INFO: - ret = get_info(sock_net(sk), user, len, 1); - break; - case IPT_SO_GET_ENTRIES: - ret = compat_get_entries(sock_net(sk), user, len); - break; - default: - ret = do_ipt_get_ctl(sk, cmd, user, len); - } - return ret; -} #endif static int -do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +do_ipt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len) { int ret; @@ -1669,11 +1619,16 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) switch (cmd) { case IPT_SO_SET_REPLACE: - ret = do_replace(sock_net(sk), user, len); +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = compat_do_replace(sock_net(sk), arg, len); + else +#endif + ret = do_replace(sock_net(sk), arg, len); break; case IPT_SO_SET_ADD_COUNTERS: - ret = do_add_counters(sock_net(sk), user, len, 0); + ret = do_add_counters(sock_net(sk), arg, len); break; default: @@ -1693,11 +1648,16 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) switch (cmd) { case IPT_SO_GET_INFO: - ret = get_info(sock_net(sk), user, len, 0); + ret = get_info(sock_net(sk), user, len); break; case IPT_SO_GET_ENTRIES: - ret = get_entries(sock_net(sk), user, len); +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = compat_get_entries(sock_net(sk), user, len); + else +#endif + 
ret = get_entries(sock_net(sk), user, len); break; case IPT_SO_GET_REVISION_MATCH: @@ -1886,15 +1846,9 @@ static struct nf_sockopt_ops ipt_sockopts = { .set_optmin = IPT_BASE_CTL, .set_optmax = IPT_SO_SET_MAX+1, .set = do_ipt_set_ctl, -#ifdef CONFIG_COMPAT - .compat_set = compat_do_ipt_set_ctl, -#endif .get_optmin = IPT_BASE_CTL, .get_optmax = IPT_SO_GET_MAX+1, .get = do_ipt_get_ctl, -#ifdef CONFIG_COMPAT - .compat_get = compat_do_ipt_get_ctl, -#endif .owner = THIS_MODULE, }; diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index f8755a4ae9d4..a8b980ad11d4 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -3,7 +3,7 @@ * (C) 2003-2004 by Harald Welte <laforge@netfilter.org> * based on ideas of Fabio Olive Leite <olive@unixforge.org> * - * Development of this code funded by SuSE Linux AG, http://www.suse.com/ + * Development of this code funded by SuSE Linux AG, https://www.suse.com/ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 2361fdac2c43..9dcfa4e461b6 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -96,6 +96,21 @@ void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, } EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put); +static int nf_reject_fill_skb_dst(struct sk_buff *skb_in) +{ + struct dst_entry *dst = NULL; + struct flowi fl; + + memset(&fl, 0, sizeof(struct flowi)); + fl.u.ip4.daddr = ip_hdr(skb_in)->saddr; + nf_ip_route(dev_net(skb_in->dev), &dst, &fl, false); + if (!dst) + return -1; + + skb_dst_set(skb_in, dst); + return 0; +} + /* Send RST reply */ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) { @@ -109,6 +124,9 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) if (!oth) return; + if (hook == NF_INET_PRE_ROUTING && nf_reject_fill_skb_dst(oldskb)) + return; + if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) return; @@ -175,6 +193,9 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook) if (iph->frag_off & htons(IP_OFFSET)) return; + if (hook == NF_INET_PRE_ROUTING && nf_reject_fill_skb_dst(skb_in)) + return; + if (skb_csum_unnecessary(skb_in) || !nf_reject_verify_csum(proto)) { icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); return; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 75545a829a2b..1074df726ec0 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -292,6 +292,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY), SNMP_MIB_ITEM("TcpTimeoutRehash", LINUX_MIB_TCPTIMEOUTREHASH), SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH), + SNMP_MIB_ITEM("TCPDSACKRecvSegs", LINUX_MIB_TCPDSACKRECVSEGS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 47665919048f..6fd4330287c2 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -809,11 +809,11 @@ static int raw_sk_init(struct sock *sk) return 0; } -static int raw_seticmpfilter(struct sock *sk, char __user *optval, int optlen) +static int raw_seticmpfilter(struct sock *sk, sockptr_t optval, int optlen) { if (optlen > sizeof(struct icmp_filter)) optlen = sizeof(struct icmp_filter); - if (copy_from_user(&raw_sk(sk)->filter, optval, optlen)) + if (copy_from_sockptr(&raw_sk(sk)->filter, optval, optlen)) return -EFAULT; return 0; } @@ -838,7 +838,7 @@ out: return ret; } 
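
raw_seticmpfilter() above is the simplest shape of the sockptr_t conversion: a fixed-size object copied with copy_from_sockptr() in place of copy_from_user(), so the same handler serves user-space setsockopt() and in-kernel callers. A minimal standalone sketch of that shape, with a hypothetical foo_filter standing in for struct icmp_filter:

        #include <linux/errno.h>
        #include <linux/sockptr.h>
        #include <linux/types.h>

        /* hypothetical fixed-size filter, shaped like struct icmp_filter */
        struct foo_filter {
                __u32 data;
        };

        static int foo_setfilter(struct foo_filter *flt, sockptr_t optval,
                                 int optlen)
        {
                if (optlen > sizeof(struct foo_filter))
                        optlen = sizeof(struct foo_filter);
                /* copy_from_sockptr() handles user and kernel sources alike */
                if (copy_from_sockptr(flt, optval, optlen))
                        return -EFAULT;
                return 0;
        }
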
static int do_raw_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { if (optname == ICMP_FILTER) { if (inet_sk(sk)->inet_num != IPPROTO_ICMP) @@ -850,23 +850,13 @@ static int do_raw_setsockopt(struct sock *sk, int level, int optname, } static int raw_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { if (level != SOL_RAW) return ip_setsockopt(sk, level, optname, optval, optlen); return do_raw_setsockopt(sk, level, optname, optval, optlen); } -#ifdef CONFIG_COMPAT -static int compat_raw_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) -{ - if (level != SOL_RAW) - return compat_ip_setsockopt(sk, level, optname, optval, optlen); - return do_raw_setsockopt(sk, level, optname, optval, optlen); -} -#endif - static int do_raw_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -887,16 +877,6 @@ static int raw_getsockopt(struct sock *sk, int level, int optname, return do_raw_getsockopt(sk, level, optname, optval, optlen); } -#ifdef CONFIG_COMPAT -static int compat_raw_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - if (level != SOL_RAW) - return compat_ip_getsockopt(sk, level, optname, optval, optlen); - return do_raw_getsockopt(sk, level, optname, optval, optlen); -} -#endif - static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg) { switch (cmd) { @@ -980,8 +960,6 @@ struct proto raw_prot = { .usersize = sizeof_field(struct raw_sock, filter), .h.raw_hash = &raw_v4_hashinfo, #ifdef CONFIG_COMPAT - .compat_setsockopt = compat_raw_setsockopt, - .compat_getsockopt = compat_raw_getsockopt, .compat_ioctl = compat_raw_ioctl, #endif .diag_destroy = raw_abort, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a01efa062f6b..8ca6bcab7b03 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1050,6 +1050,11 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct flowi4 fl4; ip_rt_build_flow_key(&fl4, sk, skb); + + /* Don't make lookup fail for bridged encapsulations */ + if (skb && netif_is_any_bridge_port(skb->dev)) + fl4.flowi4_oif = 0; + __ip_rt_update_pmtu(rt, &fl4, mtu); } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 9a4f6b16c9bc..f0794f0232ba 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -212,6 +212,12 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, refcount_set(&req->rsk_refcnt, 1); tcp_sk(child)->tsoffset = tsoff; sock_rps_save_rxhash(child, skb); + + if (rsk_drop_req(req)) { + refcount_set(&req->rsk_refcnt, 2); + return child; + } + if (inet_csk_reqsk_queue_add(sk, req, child)) return child; @@ -276,6 +282,40 @@ bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt, } EXPORT_SYMBOL(cookie_ecn_ok); +struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, + struct sock *sk, + struct sk_buff *skb) +{ + struct request_sock *req; + +#ifdef CONFIG_MPTCP + struct tcp_request_sock *treq; + + if (sk_is_mptcp(sk)) + ops = &mptcp_subflow_request_sock_ops; +#endif + + req = inet_reqsk_alloc(ops, sk, false); + if (!req) + return NULL; + +#if IS_ENABLED(CONFIG_MPTCP) + treq = tcp_rsk(req); + treq->is_mptcp = sk_is_mptcp(sk); + if (treq->is_mptcp) { + int err = mptcp_subflow_init_cookie_req(req, sk, skb); + + if (err) { + reqsk_free(req); + return NULL; + } + } 
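+	/* Illustrative caller sketch (cf. cookie_v4_check() below):
+	 *
+	 *	req = cookie_tcp_reqsk_alloc(&tcp_request_sock_ops, sk, skb);
+	 *	if (!req)
+	 *		goto out;
+	 *
+	 * tcp_rsk(req)->is_mptcp is initialized by this helper, so the
+	 * per-call-site "treq->is_mptcp = 0" seen below can be dropped.
+	 */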
+#endif + + return req; +} +EXPORT_SYMBOL_GPL(cookie_tcp_reqsk_alloc); + /* On input, sk is a listener. * Output is listener if incoming packet would not create a child * NULL if memory could not be allocated. @@ -326,7 +366,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = inet_reqsk_alloc(&tcp_request_sock_ops, sk, false); /* for safety */ + req = cookie_tcp_reqsk_alloc(&tcp_request_sock_ops, sk, skb); if (!req) goto out; @@ -350,9 +390,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) treq->snt_synack = 0; treq->tfo_listener = false; - if (IS_ENABLED(CONFIG_MPTCP)) - treq->is_mptcp = 0; - if (IS_ENABLED(CONFIG_SMC)) ireq->smc_ok = 0; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6f0caf9a866d..c06d2bfd2ec4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2764,7 +2764,7 @@ static inline bool tcp_can_repair_sock(const struct sock *sk) (sk->sk_state != TCP_LISTEN); } -static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len) +static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len) { struct tcp_repair_window opt; @@ -2774,7 +2774,7 @@ static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int l if (len != sizeof(opt)) return -EINVAL; - if (copy_from_user(&opt, optbuf, sizeof(opt))) + if (copy_from_sockptr(&opt, optbuf, sizeof(opt))) return -EFAULT; if (opt.max_window < opt.snd_wnd) @@ -2796,17 +2796,18 @@ static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int l return 0; } -static int tcp_repair_options_est(struct sock *sk, - struct tcp_repair_opt __user *optbuf, unsigned int len) +static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf, + unsigned int len) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_repair_opt opt; + size_t offset = 0; while (len >= sizeof(opt)) { - if (copy_from_user(&opt, optbuf, sizeof(opt))) + if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt))) return -EFAULT; - optbuf++; + offset += sizeof(opt); len -= sizeof(opt); switch (opt.opt_code) { @@ -2960,7 +2961,7 @@ void tcp_sock_set_user_timeout(struct sock *sk, u32 val) } EXPORT_SYMBOL(tcp_sock_set_user_timeout); -static int __tcp_sock_set_keepidle(struct sock *sk, int val) +int tcp_sock_set_keepidle_locked(struct sock *sk, int val) { struct tcp_sock *tp = tcp_sk(sk); @@ -2987,7 +2988,7 @@ int tcp_sock_set_keepidle(struct sock *sk, int val) int err; lock_sock(sk); - err = __tcp_sock_set_keepidle(sk, val); + err = tcp_sock_set_keepidle_locked(sk, val); release_sock(sk); return err; } @@ -3020,8 +3021,8 @@ EXPORT_SYMBOL(tcp_sock_set_keepcnt); /* * Socket option code for TCP. 
*/ -static int do_tcp_setsockopt(struct sock *sk, int level, - int optname, char __user *optval, unsigned int optlen) +static int do_tcp_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -3037,7 +3038,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (optlen < 1) return -EINVAL; - val = strncpy_from_user(name, optval, + val = strncpy_from_sockptr(name, optval, min_t(long, TCP_CA_NAME_MAX-1, optlen)); if (val < 0) return -EFAULT; @@ -3056,7 +3057,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (optlen < 1) return -EINVAL; - val = strncpy_from_user(name, optval, + val = strncpy_from_sockptr(name, optval, min_t(long, TCP_ULP_NAME_MAX - 1, optlen)); if (val < 0) @@ -3079,7 +3080,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, optlen != TCP_FASTOPEN_KEY_BUF_LENGTH) return -EINVAL; - if (copy_from_user(key, optval, optlen)) + if (copy_from_sockptr(key, optval, optlen)) return -EFAULT; if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH) @@ -3095,7 +3096,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); @@ -3174,9 +3175,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (!tp->repair) err = -EINVAL; else if (sk->sk_state == TCP_ESTABLISHED) - err = tcp_repair_options_est(sk, - (struct tcp_repair_opt __user *)optval, - optlen); + err = tcp_repair_options_est(sk, optval, optlen); else err = -EPERM; break; @@ -3186,7 +3185,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_KEEPIDLE: - err = __tcp_sock_set_keepidle(sk, val); + err = tcp_sock_set_keepidle_locked(sk, val); break; case TCP_KEEPINTVL: if (val < 1 || val > MAX_TCP_KEEPINTVL) @@ -3325,7 +3324,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, return err; } -int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, +int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -3337,18 +3336,6 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, } EXPORT_SYMBOL(tcp_setsockopt); -#ifdef CONFIG_COMPAT -int compat_tcp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) -{ - if (level != SOL_TCP) - return inet_csk_compat_setsockopt(sk, level, optname, - optval, optlen); - return do_tcp_setsockopt(sk, level, optname, optval, optlen); -} -EXPORT_SYMBOL(compat_tcp_setsockopt); -#endif - static void tcp_get_info_chrono_stats(const struct tcp_sock *tp, struct tcp_info *info) { @@ -3514,10 +3501,12 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */ nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */ nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */ 0; } -struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) +struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, + const struct sk_buff *orig_skb) { const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *stats; @@ -3571,6 +3560,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash); 
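On the compat_tcp_setsockopt() removal above: with sockptr_t, compat translation happens once at the syscall boundary, and in-kernel callers can hand the same entry point kernel memory directly. A hedged sketch of such a caller (illustrative only; in-tree code generally prefers dedicated helpers like the tcp_sock_set_keepidle() family seen earlier in this patch):

static int set_nodelay_from_kernel(struct socket *sock)
{
	int one = 1;

	/* KERNEL_SOCKPTR() wraps kernel memory in a sockptr_t, so the
	 * same handler serves native, compat and in-kernel callers. */
	return sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY,
				     KERNEL_SOCKPTR(&one), sizeof(one));
}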
nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT, max_t(int, 0, tp->write_seq - tp->snd_nxt)); + nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns, + TCP_NLA_PAD); return stats; } @@ -3896,18 +3887,6 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, } EXPORT_SYMBOL(tcp_getsockopt); -#ifdef CONFIG_COMPAT -int compat_tcp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - if (level != SOL_TCP) - return inet_csk_compat_getsockopt(sk, level, optname, - optval, optlen); - return do_tcp_getsockopt(sk, level, optname, optval, optlen); -} -EXPORT_SYMBOL(compat_tcp_getsockopt); -#endif - #ifdef CONFIG_TCP_MD5SIG static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool); static DEFINE_MUTEX(tcp_md5sig_mutex); diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index bfdfbb972c57..349069d6cd0a 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -2,7 +2,7 @@ /* * Sally Floyd's High Speed TCP (RFC 3649) congestion control * - * See http://www.icir.org/floyd/hstcp.html + * See https://www.icir.org/floyd/hstcp.html * * John Heffner <jheffner@psc.edu> */ diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 88e1f011afe0..55adcfcf96fe 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -4,7 +4,7 @@ * R.N.Shorten, D.J.Leith: * "H-TCP: TCP for high-speed and long-distance networks" * Proc. PFLDnet, Argonne, 2004. - * http://www.hamilton.ie/net/htcp3.pdf + * https://www.hamilton.ie/net/htcp3.pdf */ #include <linux/mm.h> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 518f04355fbf..184ea556f50e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -518,7 +518,7 @@ EXPORT_SYMBOL(tcp_initialize_rcv_mss); * * The algorithm for RTT estimation w/o timestamps is based on * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. - * <http://public.lanl.gov/radiant/pubs.html#DRS> + * <https://public.lanl.gov/radiant/pubs.html#DRS> * * More detail on this code can be found at * <http://staff.psc.edu/jheffner/>, @@ -871,12 +871,41 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } +struct tcp_sacktag_state { + /* Timestamps for earliest and latest never-retransmitted segment + * that was SACKed. RTO needs the earliest RTT to stay conservative, + * but congestion control should still get an accurate delay signal. + */ + u64 first_sackt; + u64 last_sackt; + u32 reord; + u32 sack_delivered; + int flag; + unsigned int mss_now; + struct rate_sample *rate; +}; + /* Take a notice that peer is sending D-SACKs */ -static void tcp_dsack_seen(struct tcp_sock *tp) +static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq, + u32 end_seq, struct tcp_sacktag_state *state) { + u32 seq_len, dup_segs = 1; + + if (before(start_seq, end_seq)) { + seq_len = end_seq - start_seq; + if (seq_len > tp->mss_cache) + dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache); + } + tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; tp->rack.dsack_seen = 1; - tp->dsack_dups++; + tp->dsack_dups += dup_segs; + + state->flag |= FLAG_DSACKING_ACK; + /* A spurious retransmission is delivered */ + state->sack_delivered += dup_segs; + + return dup_segs; } /* It's reordering when higher sequence was delivered (i.e. 
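tcp_dsack_seen() above now estimates how many duplicate segments a D-SACK block covers instead of always counting one, scaling dsack_dups (and the new TCPDSACKRECVSEGS mib) accordingly. A standalone illustration of the arithmetic: with mss_cache = 1460, a block spanning 4380 bytes counts as DIV_ROUND_UP(4380, 1460) = 3 duplicate segments, while any block of at most one MSS still counts as 1.

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static unsigned int dsack_dup_segs(unsigned int start_seq,
				   unsigned int end_seq,
				   unsigned int mss_cache)
{
	unsigned int seq_len = end_seq - start_seq;	/* assumes start before end */

	return seq_len > mss_cache ? DIV_ROUND_UP(seq_len, mss_cache) : 1;
}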
sacked) before @@ -962,6 +991,15 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) } } +/* Updates the delivered and delivered_ce counts */ +static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, + bool ece_ack) +{ + tp->delivered += delivered; + if (ece_ack) + tp->delivered_ce += delivered; +} + /* This procedure tags the retransmission queue when SACKs arrive. * * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). @@ -1094,52 +1132,38 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, - u32 prior_snd_una) + u32 prior_snd_una, struct tcp_sacktag_state *state) { struct tcp_sock *tp = tcp_sk(sk); u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); - bool dup_sack = false; + u32 dup_segs; if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { - dup_sack = true; - tcp_dsack_seen(tp); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV); } else if (num_sacks > 1) { u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq); u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq); - if (!after(end_seq_0, end_seq_1) && - !before(start_seq_0, start_seq_1)) { - dup_sack = true; - tcp_dsack_seen(tp); - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPDSACKOFORECV); - } + if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1)) + return false; + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV); + } else { + return false; } + dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state); + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs); + /* D-SACK for already forgotten data... Do dumb counting. */ - if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 && + if (tp->undo_marker && tp->undo_retrans > 0 && !after(end_seq_0, prior_snd_una) && after(end_seq_0, tp->undo_marker)) - tp->undo_retrans--; + tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs); - return dup_sack; + return true; } -struct tcp_sacktag_state { - u32 reord; - /* Timestamps for earliest and latest never-retransmitted segment - * that was SACKed. RTO needs the earliest RTT to stay conservative, - * but congestion control should still get an accurate delay signal. - */ - u64 first_sackt; - u64 last_sackt; - struct rate_sample *rate; - int flag; - unsigned int mss_now; -}; - /* Check if skb is fully within the SACK block. In presence of GSO skbs, * the incoming SACK may not exactly match but we can find smaller MSS * aligned portion of it that matches. Therefore we might need to fragment @@ -1258,7 +1282,8 @@ static u8 tcp_sacktag_one(struct sock *sk, sacked |= TCPCB_SACKED_ACKED; state->flag |= FLAG_DATA_SACKED; tp->sacked_out += pcount; - tp->delivered += pcount; /* Out-of-order packets delivered */ + /* Out-of-order packets delivered */ + state->sack_delivered += pcount; /* Lost marker hint past SACKed? 
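The tcp_count_delivered() helper introduced above funnels all "newly delivered" accounting through one place so the CE-marked count can never drift from the plain count. A minimal model of the invariant it maintains (delivered_ce is always a subset of delivered):

struct delivery_counts {
	unsigned long delivered;
	unsigned long delivered_ce;
};

static void count_delivered(struct delivery_counts *d, unsigned int n,
			    bool ece_ack)
{
	d->delivered += n;
	if (ece_ack)
		d->delivered_ce += n;	/* only ECE-echo ACKs contribute */
}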
Tweak RFC3517 cnt */ if (tp->lost_skb_hint && @@ -1681,11 +1706,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, tcp_highest_sack_reset(sk); found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, - num_sacks, prior_snd_una); - if (found_dup_sack) { - state->flag |= FLAG_DSACKING_ACK; - tp->delivered++; /* A spurious retransmission is delivered */ - } + num_sacks, prior_snd_una, state); /* Eliminate too old ACKs, but take into * account more or less fresh ones, they can @@ -1893,7 +1914,7 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend) /* Emulate SACKs for SACKless connection: account for a new dupack. */ -static void tcp_add_reno_sack(struct sock *sk, int num_dupack) +static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack) { if (num_dupack) { struct tcp_sock *tp = tcp_sk(sk); @@ -1904,20 +1925,21 @@ static void tcp_add_reno_sack(struct sock *sk, int num_dupack) tcp_check_reno_reordering(sk, 0); delivered = tp->sacked_out - prior_sacked; if (delivered > 0) - tp->delivered += delivered; + tcp_count_delivered(tp, delivered, ece_ack); tcp_verify_left_out(tp); } } /* Account for ACK, ACKing some data in Reno Recovery phase. */ -static void tcp_remove_reno_sacks(struct sock *sk, int acked) +static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack) { struct tcp_sock *tp = tcp_sk(sk); if (acked > 0) { /* One ACK acked hole. The rest eat duplicate ACKs. */ - tp->delivered += max_t(int, acked - tp->sacked_out, 1); + tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1), + ece_ack); if (acked - 1 >= tp->sacked_out) tp->sacked_out = 0; else @@ -2697,7 +2719,7 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, * delivered. Lower inflight to clock out (re)tranmissions. */ if (after(tp->snd_nxt, tp->high_seq) && num_dupack) - tcp_add_reno_sack(sk, num_dupack); + tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE); else if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); } @@ -2779,6 +2801,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int fast_rexmit = 0, flag = *ack_flag; + bool ece_ack = flag & FLAG_ECE; bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) && tcp_force_fast_retransmit(sk)); @@ -2787,7 +2810,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, /* Now state machine starts. * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ - if (flag & FLAG_ECE) + if (ece_ack) tp->prior_ssthresh = 0; /* B. In all the states check for reneging SACKs. 
*/ @@ -2828,7 +2851,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, case TCP_CA_Recovery: if (!(flag & FLAG_SND_UNA_ADVANCED)) { if (tcp_is_reno(tp)) - tcp_add_reno_sack(sk, num_dupack); + tcp_add_reno_sack(sk, num_dupack, ece_ack); } else { if (tcp_try_undo_partial(sk, prior_snd_una)) return; @@ -2853,7 +2876,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); - tcp_add_reno_sack(sk, num_dupack); + tcp_add_reno_sack(sk, num_dupack, ece_ack); } if (icsk->icsk_ca_state <= TCP_CA_Disorder) @@ -2877,7 +2900,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, } /* Otherwise enter Recovery state */ - tcp_enter_recovery(sk, (flag & FLAG_ECE)); + tcp_enter_recovery(sk, ece_ack); fast_rexmit = 1; } @@ -2927,6 +2950,8 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag, u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { + if (!delta) + delta = 1; seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ); ca_rtt_us = seq_rtt_us; } @@ -3053,7 +3078,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, */ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, u32 prior_snd_una, - struct tcp_sacktag_state *sack) + struct tcp_sacktag_state *sack, bool ece_ack) { const struct inet_connection_sock *icsk = inet_csk(sk); u64 first_ackt, last_ackt; @@ -3078,8 +3103,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, u8 sacked = scb->sacked; u32 acked_pcount; - tcp_ack_tstamp(sk, skb, prior_snd_una); - /* Determine how many packets and what bytes were acked, tso and else */ if (after(scb->end_seq, tp->snd_una)) { if (tcp_skb_pcount(skb) == 1 || @@ -3114,7 +3137,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (sacked & TCPCB_SACKED_ACKED) { tp->sacked_out -= acked_pcount; } else if (tcp_is_sack(tp)) { - tp->delivered += acked_pcount; + tcp_count_delivered(tp, acked_pcount, ece_ack); if (!tcp_skb_spurious_retrans(tp, skb)) tcp_rack_advance(tp, sacked, scb->end_seq, tcp_skb_timestamp_us(skb)); @@ -3143,6 +3166,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (!fully_acked) break; + tcp_ack_tstamp(sk, skb, prior_snd_una); + next = skb_rb_next(skb); if (unlikely(skb == tp->retransmit_skb_hint)) tp->retransmit_skb_hint = NULL; @@ -3158,8 +3183,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una))) tp->snd_up = tp->snd_una; - if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) - flag |= FLAG_SACK_RENEGING; + if (skb) { + tcp_ack_tstamp(sk, skb, prior_snd_una); + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) + flag |= FLAG_SACK_RENEGING; + } if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) { seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); @@ -3191,7 +3219,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, } if (tcp_is_reno(tp)) { - tcp_remove_reno_sacks(sk, pkts_acked); + tcp_remove_reno_sacks(sk, pkts_acked, ece_ack); /* If any of the cumulatively ACKed segments was * retransmitted, non-SACK case cannot confirm that @@ -3559,10 +3587,9 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) delivered = tp->delivered - prior_delivered; NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); - if (flag & FLAG_ECE) { - tp->delivered_ce += 
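The small "if (!delta) delta = 1" guard added in tcp_ack_update_rtt() above covers RTTs shorter than one timestamp tick. Worked example, assuming TCP_TS_HZ = 1000 as in mainline: ticks are 1 ms, so a 0.3 ms RTT reads back as delta 0 and would otherwise be fed to congestion control as an impossible 0 us sample; clamping reports one tick (1000 us) instead.

static long rtt_us_from_tsecr_delta(unsigned long delta)
{
	/* USEC_PER_SEC / TCP_TS_HZ with TCP_TS_HZ = 1000 */
	const unsigned long usec_per_tick = 1000000UL / 1000;

	if (!delta)		/* sub-tick RTT: round up to one tick */
		delta = 1;
	return delta * usec_per_tick;
}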
delivered; + if (flag & FLAG_ECE) NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); - } + return delivered; } @@ -3586,6 +3613,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) sack_state.first_sackt = 0; sack_state.rate = &rs; + sack_state.sack_delivered = 0; /* We very likely will need to access rtx queue. */ prefetch(sk->tcp_rtx_queue.rb_node); @@ -3661,6 +3689,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) ack_ev_flags |= CA_ACK_ECE; } + if (sack_state.sack_delivered) + tcp_count_delivered(tp, sack_state.sack_delivered, + flag & FLAG_ECE); + if (flag & FLAG_WIN_UPDATE) ack_ev_flags |= CA_ACK_WIN_UPDATE; @@ -3686,7 +3718,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state); + flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state, + flag & FLAG_ECE); tcp_rack_update_reo_wnd(sk, &rs); @@ -4427,7 +4460,6 @@ static void tcp_sack_remove(struct tcp_sock *tp) /** * tcp_try_coalesce - try to merge skb to prior one * @sk: socket - * @dest: destination queue * @to: prior buffer * @from: buffer to add in queue * @fragstolen: pointer to boolean @@ -6489,7 +6521,6 @@ static void tcp_openreq_init(struct request_sock *req, struct inet_request_sock *ireq = inet_rsk(req); req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */ - req->cookie_ts = 0; tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tcp_rsk(req)->snt_synack = 0; @@ -6644,6 +6675,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (!req) goto drop; + req->syncookie = want_cookie; tcp_rsk(req)->af_specific = af_ops; tcp_rsk(req)->ts_off = 0; #if IS_ENABLED(CONFIG_MPTCP) @@ -6671,9 +6703,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, af_ops->init_req(req, sk, skb); - if (IS_ENABLED(CONFIG_MPTCP) && want_cookie) - tcp_rsk(req)->is_mptcp = 0; - if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; @@ -6709,7 +6738,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (want_cookie) { isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); - req->cookie_ts = tmp_opt.tstamp_ok; if (!tmp_opt.tstamp_ok) inet_rsk(req)->ecn_ok = 0; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 04bfcbbfee83..5084333b5ab6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -76,6 +76,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/inetdevice.h> +#include <linux/btf_ids.h> #include <crypto/hash.h> #include <linux/scatterlist.h> @@ -1194,7 +1195,7 @@ static void tcp_clear_md5_list(struct sock *sk) } static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, - char __user *optval, int optlen) + sockptr_t optval, int optlen) { struct tcp_md5sig cmd; struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; @@ -1205,7 +1206,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, if (optlen < sizeof(cmd)) return -EINVAL; - if (copy_from_user(&cmd, optval, sizeof(cmd))) + if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) return -EFAULT; if (sin->sin_family != AF_INET) @@ -2134,10 +2135,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = { .getsockopt = ip_getsockopt, .addr2sockaddr = inet_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in), -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_ip_setsockopt, - .compat_getsockopt = 
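The tcp_ack() hunk above also shows why sack_delivered is staged in the sacktag state rather than applied on the spot: the SACK scoreboard is processed before tcp_ack() has examined the ECE echo, so the deliveries found during SACK processing are folded in afterwards with the correct ece flag. A compact model of that ordering:

struct ack_model_state {
	unsigned int sack_delivered;	/* filled during SACK processing */
	unsigned long delivered;
	unsigned long delivered_ce;
};

/* Runs only after the ECE bit of the ACK is known, mirroring tcp_ack(). */
static void fold_sack_deliveries(struct ack_model_state *st, bool ece_ack)
{
	st->delivered += st->sack_delivered;
	if (ece_ack)
		st->delivered_ce += st->sack_delivered;
	st->sack_delivered = 0;
}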
compat_ip_getsockopt, -#endif .mtu_reduced = tcp_v4_mtu_reduced, }; EXPORT_SYMBOL(ipv4_specific); @@ -2223,13 +2220,18 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); */ static void *listening_get_next(struct seq_file *seq, void *cur) { - struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); + struct tcp_seq_afinfo *afinfo; struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); struct inet_listen_hashbucket *ilb; struct hlist_nulls_node *node; struct sock *sk = cur; + if (st->bpf_seq_afinfo) + afinfo = st->bpf_seq_afinfo; + else + afinfo = PDE_DATA(file_inode(seq->file)); + if (!sk) { get_head: ilb = &tcp_hashinfo.listening_hash[st->bucket]; @@ -2247,7 +2249,8 @@ get_sk: sk_nulls_for_each_from(sk, node) { if (!net_eq(sock_net(sk), net)) continue; - if (sk->sk_family == afinfo->family) + if (afinfo->family == AF_UNSPEC || + sk->sk_family == afinfo->family) return sk; } spin_unlock(&ilb->lock); @@ -2284,11 +2287,16 @@ static inline bool empty_bucket(const struct tcp_iter_state *st) */ static void *established_get_first(struct seq_file *seq) { - struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); + struct tcp_seq_afinfo *afinfo; struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); void *rc = NULL; + if (st->bpf_seq_afinfo) + afinfo = st->bpf_seq_afinfo; + else + afinfo = PDE_DATA(file_inode(seq->file)); + st->offset = 0; for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { struct sock *sk; @@ -2301,7 +2309,8 @@ static void *established_get_first(struct seq_file *seq) spin_lock_bh(lock); sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { - if (sk->sk_family != afinfo->family || + if ((afinfo->family != AF_UNSPEC && + sk->sk_family != afinfo->family) || !net_eq(sock_net(sk), net)) { continue; } @@ -2316,19 +2325,25 @@ out: static void *established_get_next(struct seq_file *seq, void *cur) { - struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); + struct tcp_seq_afinfo *afinfo; struct sock *sk = cur; struct hlist_nulls_node *node; struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); + if (st->bpf_seq_afinfo) + afinfo = st->bpf_seq_afinfo; + else + afinfo = PDE_DATA(file_inode(seq->file)); + ++st->num; ++st->offset; sk = sk_nulls_next(sk); sk_nulls_for_each_from(sk, node) { - if (sk->sk_family == afinfo->family && + if ((afinfo->family == AF_UNSPEC || + sk->sk_family == afinfo->family) && net_eq(sock_net(sk), net)) return sk; } @@ -2607,6 +2622,74 @@ out: return 0; } +#ifdef CONFIG_BPF_SYSCALL +struct bpf_iter__tcp { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct sock_common *, sk_common); + uid_t uid __aligned(8); +}; + +static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, + struct sock_common *sk_common, uid_t uid) +{ + struct bpf_iter__tcp ctx; + + meta->seq_num--; /* skip SEQ_START_TOKEN */ + ctx.meta = meta; + ctx.sk_common = sk_common; + ctx.uid = uid; + return bpf_iter_run_prog(prog, &ctx); +} + +static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + struct sock *sk = v; + uid_t uid; + + if (v == SEQ_START_TOKEN) + return 0; + + if (sk->sk_state == TCP_TIME_WAIT) { + uid = 0; + } else if (sk->sk_state == TCP_NEW_SYN_RECV) { + const struct request_sock *req = v; + + uid = from_kuid_munged(seq_user_ns(seq), + sock_i_uid(req->rsk_listener)); + } else { + uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); + } + + meta.seq = seq; + prog = 
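The proc walkers above now fetch their afinfo from st->bpf_seq_afinfo when driven by a BPF iterator, and that afinfo is created with family AF_UNSPEC (see bpf_iter_init_tcp() further down), which the match treats as a wildcard so one iterator covers both IPv4 and IPv6 sockets. The test reduces to:

static bool family_matches(unsigned short sk_family, unsigned short want_family)
{
	/* AF_UNSPEC, used by the BPF iterator, matches every socket. */
	return want_family == AF_UNSPEC || sk_family == want_family;
}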
bpf_iter_get_info(&meta, false); + return tcp_prog_seq_show(prog, &meta, v, uid); +} + +static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + if (!v) { + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog) + (void)tcp_prog_seq_show(prog, &meta, v, 0); + } + + tcp_seq_stop(seq, v); +} + +static const struct seq_operations bpf_iter_tcp_seq_ops = { + .show = bpf_iter_tcp_seq_show, + .start = tcp_seq_start, + .next = tcp_seq_next, + .stop = bpf_iter_tcp_seq_stop, +}; +#endif + static const struct seq_operations tcp4_seq_ops = { .show = tcp4_seq_show, .start = tcp_seq_start, @@ -2687,10 +2770,6 @@ struct proto tcp_prot = { .rsk_prot = &tcp_request_sock_ops, .h.hashinfo = &tcp_hashinfo, .no_autobind = true, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_tcp_setsockopt, - .compat_getsockopt = compat_tcp_getsockopt, -#endif .diag_destroy = tcp_abort, }; EXPORT_SYMBOL(tcp_prot); @@ -2838,8 +2917,68 @@ static struct pernet_operations __net_initdata tcp_sk_ops = { .exit_batch = tcp_sk_exit_batch, }; +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, + struct sock_common *sk_common, uid_t uid) + +static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) +{ + struct tcp_iter_state *st = priv_data; + struct tcp_seq_afinfo *afinfo; + int ret; + + afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN); + if (!afinfo) + return -ENOMEM; + + afinfo->family = AF_UNSPEC; + st->bpf_seq_afinfo = afinfo; + ret = bpf_iter_init_seq_net(priv_data, aux); + if (ret) + kfree(afinfo); + return ret; +} + +static void bpf_iter_fini_tcp(void *priv_data) +{ + struct tcp_iter_state *st = priv_data; + + kfree(st->bpf_seq_afinfo); + bpf_iter_fini_seq_net(priv_data); +} + +static const struct bpf_iter_seq_info tcp_seq_info = { + .seq_ops = &bpf_iter_tcp_seq_ops, + .init_seq_private = bpf_iter_init_tcp, + .fini_seq_private = bpf_iter_fini_tcp, + .seq_priv_size = sizeof(struct tcp_iter_state), +}; + +static struct bpf_iter_reg tcp_reg_info = { + .target = "tcp", + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__tcp, sk_common), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &tcp_seq_info, +}; + +static void __init bpf_iter_register(void) +{ + tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; + if (bpf_iter_reg_target(&tcp_reg_info)) + pr_warn("Warning: could not register bpf iterator tcp\n"); +} + +#endif + void __init tcp_v4_init(void) { if (register_pernet_subsys(&tcp_sk_ops)) panic("Failed to create the TCP control socket.\n"); + +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + bpf_iter_register(); +#endif } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0bc05d68cd74..85ff417bda7f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1066,6 +1066,10 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); } +INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); +INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); +INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)); + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. 
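The bpf_iter plumbing above is consumed by programs attached to the "iter/tcp" target, whose context is the bpf_iter__tcp struct. A hedged sketch of the BPF side, patterned after the kernel selftests (vmlinux.h supplies the kernel types; BPF_SEQ_PRINTF is the libbpf convenience wrapper around the bpf_seq_printf() helper):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("iter/tcp")
int dump_tcp(struct bpf_iter__tcp *ctx)
{
	struct sock_common *sk_common = ctx->sk_common;
	struct seq_file *seq = ctx->meta->seq;

	if (!sk_common)
		return 0;	/* end-of-iteration pass */

	BPF_SEQ_PRINTF(seq, "family=%d uid=%u\n",
		       sk_common->skc_family, ctx->uid);
	return 0;
}

Once loaded, such a program can be pinned (for example with bpftool iter pin) and its output read like any seq_file, e.g. with cat on the pinned path.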
@@ -1209,7 +1213,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, } #endif - icsk->icsk_af_ops->send_check(sk, skb); + INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check, + tcp_v6_send_check, tcp_v4_send_check, + sk, skb); if (likely(tcb->tcp_flags & TCPHDR_ACK)) tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt); @@ -1237,7 +1243,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tcp_add_tx_delay(skb, tp); - err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); + err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit, + inet6_csk_xmit, ip_queue_xmit, + sk, skb, &inet->cork.fl); if (unlikely(err > 0)) { tcp_enter_cwr(sk); @@ -3332,6 +3340,8 @@ int tcp_send_synack(struct sock *sk) * sk: listener socket * dst: dst entry attached to the SYNACK * req: request_sock pointer + * foc: cookie for tcp fast open + * synack_type: Type of synack to prepare * * Allocate one skb and build a SYNACK packet. * @dst is consumed : Caller should not use it again. @@ -3383,7 +3393,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, memset(&opts, 0, sizeof(opts)); now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES - if (unlikely(req->cookie_ts)) + if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok)) skb->skb_mstamp_ns = cookie_init_timestamp(req, now); else #endif diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index ada046f425d2..0c08c420fbc2 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -314,7 +314,7 @@ out: /** * tcp_delack_timer() - The TCP delayed ACK timeout handler - * @data: Pointer to the current socket. (gets casted to struct sock *) + * @t: Pointer to the timer. (gets casted to struct sock *) * * This function gets (indirectly) called when the kernel timer for a TCP packet * of this socket expires. Calls tcp_delack_timer_handler() to do the actual work. diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 50a9a6e2c4cd..cd50a61c9976 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -7,7 +7,7 @@ * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks." * IEEE Journal on Selected Areas in Communication, * Feb. 2003. - * See http://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf + * See https://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf */ #include <linux/mm.h> diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index c4b2ccbeba04..e44aaf41a138 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -110,6 +110,33 @@ drop: return 0; } +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) +static int tunnel4_rcv_cb(struct sk_buff *skb, u8 proto, int err) +{ + struct xfrm_tunnel __rcu *head; + struct xfrm_tunnel *handler; + int ret; + + head = (proto == IPPROTO_IPIP) ?
tunnel4_handlers : tunnel64_handlers; + + for_each_tunnel_rcu(head, handler) { + if (handler->cb_handler) { + ret = handler->cb_handler(skb, err); + if (ret <= 0) + return ret; + } + } + + return 0; +} + +static const struct xfrm_input_afinfo tunnel4_input_afinfo = { + .family = AF_INET, + .is_ipip = true, + .callback = tunnel4_rcv_cb, +}; +#endif + #if IS_ENABLED(CONFIG_IPV6) static int tunnel64_rcv(struct sk_buff *skb) { @@ -231,6 +258,18 @@ static int __init tunnel4_init(void) goto err; } #endif +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) + if (xfrm_input_register_afinfo(&tunnel4_input_afinfo)) { + inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); +#if IS_ENABLED(CONFIG_IPV6) + inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6); +#endif +#if IS_ENABLED(CONFIG_MPLS) + inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS); +#endif + goto err; + } +#endif return 0; err: @@ -240,6 +279,10 @@ err: static void __exit tunnel4_fini(void) { +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) + if (xfrm_input_unregister_afinfo(&tunnel4_input_afinfo)) + pr_err("tunnel4 close: can't remove input afinfo\n"); +#endif #if IS_ENABLED(CONFIG_MPLS) if (inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS)) pr_err("tunnelmpls4 close: can't remove protocol\n"); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4077d589b72e..e88efba07551 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -106,6 +106,7 @@ #include <net/xfrm.h> #include <trace/events/udp.h> #include <linux/static_key.h> +#include <linux/btf_ids.h> #include <trace/events/skb.h> #include <net/busy_poll.h> #include "udp_impl.h" @@ -408,6 +409,22 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr, udp_ehash_secret + net_hash_mix(net)); } +static struct sock *lookup_reuseport(struct net *net, struct sock *sk, + struct sk_buff *skb, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned short hnum) +{ + struct sock *reuse_sk = NULL; + u32 hash; + + if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) { + hash = udp_ehashfn(net, daddr, hnum, saddr, sport); + reuse_sk = reuseport_select_sock(sk, hash, skb, + sizeof(struct udphdr)); + } + return reuse_sk; +} + /* called with rcu_read_lock() */ static struct sock *udp4_lib_lookup2(struct net *net, __be32 saddr, __be16 sport, @@ -416,9 +433,8 @@ static struct sock *udp4_lib_lookup2(struct net *net, struct udp_hslot *hslot2, struct sk_buff *skb) { - struct sock *sk, *result, *reuseport_result; + struct sock *sk, *result; int score, badness; - u32 hash = 0; result = NULL; badness = 0; @@ -426,25 +442,42 @@ static struct sock *udp4_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { - reuseport_result = NULL; - - if (sk->sk_reuseport && - sk->sk_state != TCP_ESTABLISHED) { - hash = udp_ehashfn(net, daddr, hnum, - saddr, sport); - reuseport_result = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); - if (reuseport_result && !reuseport_has_conns(sk, false)) - return reuseport_result; - } + result = lookup_reuseport(net, sk, skb, + saddr, sport, daddr, hnum); + /* Fall back to scoring if group has connections */ + if (result && !reuseport_has_conns(sk, false)) + return result; - result = reuseport_result ? : sk; + result = result ? 
: sk; badness = score; } } return result; } +static struct sock *udp4_lookup_run_bpf(struct net *net, + struct udp_table *udptable, + struct sk_buff *skb, + __be32 saddr, __be16 sport, + __be32 daddr, u16 hnum) +{ + struct sock *sk, *reuse_sk; + bool no_reuseport; + + if (udptable != &udp_table) + return NULL; /* only UDP is supported */ + + no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_UDP, + saddr, sport, daddr, hnum, &sk); + if (no_reuseport || IS_ERR_OR_NULL(sk)) + return sk; + + reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); + if (reuse_sk) + sk = reuse_sk; + return sk; +} + /* UDP is nearly always wildcards out the wazoo, it makes no sense to try * harder than this. -DaveM */ @@ -452,27 +485,45 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif, int sdif, struct udp_table *udptable, struct sk_buff *skb) { - struct sock *result; unsigned short hnum = ntohs(dport); unsigned int hash2, slot2; struct udp_hslot *hslot2; + struct sock *result, *sk; hash2 = ipv4_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; + /* Lookup connected or non-wildcard socket */ result = udp4_lib_lookup2(net, saddr, sport, daddr, hnum, dif, sdif, hslot2, skb); - if (!result) { - hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; - - result = udp4_lib_lookup2(net, saddr, sport, - htonl(INADDR_ANY), hnum, dif, sdif, - hslot2, skb); + if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED) + goto done; + + /* Lookup redirect from BPF */ + if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { + sk = udp4_lookup_run_bpf(net, udptable, skb, + saddr, sport, daddr, hnum); + if (sk) { + result = sk; + goto done; + } } + + /* Got non-wildcard socket or error on first lookup */ + if (result) + goto done; + + /* Lookup wildcard sockets */ + hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); + slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; + + result = udp4_lib_lookup2(net, saddr, sport, + htonl(INADDR_ANY), hnum, dif, sdif, + hslot2, skb); +done: if (IS_ERR(result)) return NULL; return result; @@ -2535,7 +2586,7 @@ void udp_destroy_sock(struct sock *sk) * Socket option code for UDP */ int udp_lib_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen, + sockptr_t optval, unsigned int optlen, int (*push_pending_frames)(struct sock *)) { struct udp_sock *up = udp_sk(sk); @@ -2546,7 +2597,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; valbool = val ? 
1 : 0; @@ -2650,26 +2701,16 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, } EXPORT_SYMBOL(udp_lib_setsockopt); -int udp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) +int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen) { if (level == SOL_UDP || level == SOL_UDPLITE) - return udp_lib_setsockopt(sk, level, optname, optval, optlen, + return udp_lib_setsockopt(sk, level, optname, + optval, optlen, udp_push_pending_frames); return ip_setsockopt(sk, level, optname, optval, optlen); } -#ifdef CONFIG_COMPAT -int compat_udp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) -{ - if (level == SOL_UDP || level == SOL_UDPLITE) - return udp_lib_setsockopt(sk, level, optname, optval, optlen, - udp_push_pending_frames); - return compat_ip_setsockopt(sk, level, optname, optval, optlen); -} -#endif - int udp_lib_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -2735,20 +2776,11 @@ int udp_getsockopt(struct sock *sk, int level, int optname, return ip_getsockopt(sk, level, optname, optval, optlen); } -#ifdef CONFIG_COMPAT -int compat_udp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - if (level == SOL_UDP || level == SOL_UDPLITE) - return udp_lib_getsockopt(sk, level, optname, optval, optlen); - return compat_ip_getsockopt(sk, level, optname, optval, optlen); -} -#endif /** * udp_poll - wait for a UDP event. - * @file - file struct - * @sock - socket - * @wait - poll table + * @file: - file struct + * @sock: - socket + * @wait: - poll table * * This is same as datagram poll, except for the special case of * blocking sockets. 
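Stepping back to the __udp4_lib_lookup() rework a few hunks up: lookup now proceeds in three stages, and the BPF sk_lookup hook sits between the exact-match and wildcard passes. A condensed model of the control flow, with IS_ERR handling elided and helper names as illustrative stand-ins for the real scoring, BPF and wildcard lookups:

struct sock;

extern struct sock *lookup_exact(void);	/* connected or non-wildcard score */
extern struct sock *run_bpf_sk_lookup(void);	/* may redirect anywhere */
extern struct sock *lookup_wildcard(void);	/* INADDR_ANY sockets */
extern bool established(const struct sock *sk);
extern bool bpf_hook_enabled(void);	/* a static key in the kernel */

static struct sock *udp_lookup_model(void)
{
	struct sock *result = lookup_exact();

	if (result && established(result))
		return result;	/* a connected socket always wins */

	if (bpf_hook_enabled()) {
		struct sock *sk = run_bpf_sk_lookup();

		if (sk)
			return sk;	/* BPF verdict overrides wildcards */
	}

	if (result)
		return result;	/* unconnected, non-wildcard hit */

	return lookup_wildcard();
}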
If application is using a blocking fd @@ -2815,10 +2847,6 @@ struct proto udp_prot = { .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp_sock), .h.udp_table = &udp_table, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_udp_setsockopt, - .compat_getsockopt = compat_udp_getsockopt, -#endif .diag_destroy = udp_abort, }; EXPORT_SYMBOL(udp_prot); @@ -2829,10 +2857,15 @@ EXPORT_SYMBOL(udp_prot); static struct sock *udp_get_first(struct seq_file *seq, int start) { struct sock *sk; - struct udp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); + struct udp_seq_afinfo *afinfo; struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); + if (state->bpf_seq_afinfo) + afinfo = state->bpf_seq_afinfo; + else + afinfo = PDE_DATA(file_inode(seq->file)); + for (state->bucket = start; state->bucket <= afinfo->udp_table->mask; ++state->bucket) { struct udp_hslot *hslot = &afinfo->udp_table->hash[state->bucket]; @@ -2844,7 +2877,8 @@ static struct sock *udp_get_first(struct seq_file *seq, int start) sk_for_each(sk, &hslot->head) { if (!net_eq(sock_net(sk), net)) continue; - if (sk->sk_family == afinfo->family) + if (afinfo->family == AF_UNSPEC || + sk->sk_family == afinfo->family) goto found; } spin_unlock_bh(&hslot->lock); @@ -2856,13 +2890,20 @@ found: static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) { - struct udp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); + struct udp_seq_afinfo *afinfo; struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); + if (state->bpf_seq_afinfo) + afinfo = state->bpf_seq_afinfo; + else + afinfo = PDE_DATA(file_inode(seq->file)); + do { sk = sk_next(sk); - } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != afinfo->family)); + } while (sk && (!net_eq(sock_net(sk), net) || + (afinfo->family != AF_UNSPEC && + sk->sk_family != afinfo->family))); if (!sk) { if (state->bucket <= afinfo->udp_table->mask) @@ -2907,9 +2948,14 @@ EXPORT_SYMBOL(udp_seq_next); void udp_seq_stop(struct seq_file *seq, void *v) { - struct udp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); + struct udp_seq_afinfo *afinfo; struct udp_iter_state *state = seq->private; + if (state->bpf_seq_afinfo) + afinfo = state->bpf_seq_afinfo; + else + afinfo = PDE_DATA(file_inode(seq->file)); + if (state->bucket <= afinfo->udp_table->mask) spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock); } @@ -2953,6 +2999,67 @@ int udp4_seq_show(struct seq_file *seq, void *v) return 0; } +#ifdef CONFIG_BPF_SYSCALL +struct bpf_iter__udp { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct udp_sock *, udp_sk); + uid_t uid __aligned(8); + int bucket __aligned(8); +}; + +static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, + struct udp_sock *udp_sk, uid_t uid, int bucket) +{ + struct bpf_iter__udp ctx; + + meta->seq_num--; /* skip SEQ_START_TOKEN */ + ctx.meta = meta; + ctx.udp_sk = udp_sk; + ctx.uid = uid; + ctx.bucket = bucket; + return bpf_iter_run_prog(prog, &ctx); +} + +static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v) +{ + struct udp_iter_state *state = seq->private; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + struct sock *sk = v; + uid_t uid; + + if (v == SEQ_START_TOKEN) + return 0; + + uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); + meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + return udp_prog_seq_show(prog, &meta, v, uid, state->bucket); +} + +static void 
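As with the TCP iterator, the ->stop() callback defined just below invokes the program one final time with a NULL socket once iteration ends, which programs use to flush summaries. Hedged BPF-side sketch, using the same headers as the TCP sketch earlier:

SEC("iter/udp")
int dump_udp(struct bpf_iter__udp *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	struct udp_sock *udp_sk = ctx->udp_sk;

	if (!udp_sk) {
		/* end-of-iteration pass: emit a trailer line */
		BPF_SEQ_PRINTF(seq, "=== walk complete ===\n");
		return 0;
	}
	BPF_SEQ_PRINTF(seq, "bucket=%d uid=%u\n", ctx->bucket, ctx->uid);
	return 0;
}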
bpf_iter_udp_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + if (!v) { + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog) + (void)udp_prog_seq_show(prog, &meta, v, 0, 0); + } + + udp_seq_stop(seq, v); +} + +static const struct seq_operations bpf_iter_udp_seq_ops = { + .start = udp_seq_start, + .next = udp_seq_next, + .stop = bpf_iter_udp_seq_stop, + .show = bpf_iter_udp_seq_show, +}; +#endif + const struct seq_operations udp_seq_ops = { .start = udp_seq_start, .next = udp_seq_next, @@ -3070,6 +3177,62 @@ static struct pernet_operations __net_initdata udp_sysctl_ops = { .init = udp_sysctl_init, }; +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta, + struct udp_sock *udp_sk, uid_t uid, int bucket) + +static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux) +{ + struct udp_iter_state *st = priv_data; + struct udp_seq_afinfo *afinfo; + int ret; + + afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN); + if (!afinfo) + return -ENOMEM; + + afinfo->family = AF_UNSPEC; + afinfo->udp_table = &udp_table; + st->bpf_seq_afinfo = afinfo; + ret = bpf_iter_init_seq_net(priv_data, aux); + if (ret) + kfree(afinfo); + return ret; +} + +static void bpf_iter_fini_udp(void *priv_data) +{ + struct udp_iter_state *st = priv_data; + + kfree(st->bpf_seq_afinfo); + bpf_iter_fini_seq_net(priv_data); +} + +static const struct bpf_iter_seq_info udp_seq_info = { + .seq_ops = &bpf_iter_udp_seq_ops, + .init_seq_private = bpf_iter_init_udp, + .fini_seq_private = bpf_iter_fini_udp, + .seq_priv_size = sizeof(struct udp_iter_state), +}; + +static struct bpf_iter_reg udp_reg_info = { + .target = "udp", + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__udp, udp_sk), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &udp_seq_info, +}; + +static void __init bpf_iter_register(void) +{ + udp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UDP]; + if (bpf_iter_reg_target(&udp_reg_info)) + pr_warn("Warning: could not register bpf iterator udp\n"); +} +#endif + void __init udp_init(void) { unsigned long limit; @@ -3095,4 +3258,8 @@ void __init udp_init(void) if (register_pernet_subsys(&udp_sysctl_ops)) panic("UDP: failed to init sysctl parameters.\n"); + +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + bpf_iter_register(); +#endif } diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index 6b2fa77eeb1c..2878d8285caf 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -12,17 +12,11 @@ int __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); int udp_v4_get_port(struct sock *sk, unsigned short snum); void udp_v4_rehash(struct sock *sk); -int udp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); +int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen); int udp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); -#ifdef CONFIG_COMPAT -int compat_udp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); -int compat_udp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen); -#endif int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len); int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, diff --git a/net/ipv4/udp_tunnel.c 
b/net/ipv4/udp_tunnel_core.c index 3eecba0874aa..3eecba0874aa 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel_core.c diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c new file mode 100644 index 000000000000..69962165c0e8 --- /dev/null +++ b/net/ipv4/udp_tunnel_nic.c @@ -0,0 +1,897 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (c) 2020 Facebook Inc. + +#include <linux/ethtool_netlink.h> +#include <linux/netdevice.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <net/udp_tunnel.h> +#include <net/vxlan.h> + +enum udp_tunnel_nic_table_entry_flags { + UDP_TUNNEL_NIC_ENTRY_ADD = BIT(0), + UDP_TUNNEL_NIC_ENTRY_DEL = BIT(1), + UDP_TUNNEL_NIC_ENTRY_OP_FAIL = BIT(2), + UDP_TUNNEL_NIC_ENTRY_FROZEN = BIT(3), +}; + +struct udp_tunnel_nic_table_entry { + __be16 port; + u8 type; + u8 use_cnt; + u8 flags; + u8 hw_priv; +}; + +/** + * struct udp_tunnel_nic - UDP tunnel port offload state + * @work: async work for talking to hardware from process context + * @dev: netdev pointer + * @need_sync: at least one port start changed + * @need_replay: space was freed, we need a replay of all ports + * @work_pending: @work is currently scheduled + * @n_tables: number of tables under @entries + * @missed: bitmap of tables which overflown + * @entries: table of tables of ports currently offloaded + */ +struct udp_tunnel_nic { + struct work_struct work; + + struct net_device *dev; + + u8 need_sync:1; + u8 need_replay:1; + u8 work_pending:1; + + unsigned int n_tables; + unsigned long missed; + struct udp_tunnel_nic_table_entry **entries; +}; + +/* We ensure all work structs are done using driver state, but not the code. + * We need a workqueue we can flush before module gets removed. + */ +static struct workqueue_struct *udp_tunnel_nic_workqueue; + +static const char *udp_tunnel_nic_tunnel_type_name(unsigned int type) +{ + switch (type) { + case UDP_TUNNEL_TYPE_VXLAN: + return "vxlan"; + case UDP_TUNNEL_TYPE_GENEVE: + return "geneve"; + case UDP_TUNNEL_TYPE_VXLAN_GPE: + return "vxlan-gpe"; + default: + return "unknown"; + } +} + +static bool +udp_tunnel_nic_entry_is_free(struct udp_tunnel_nic_table_entry *entry) +{ + return entry->use_cnt == 0 && !entry->flags; +} + +static bool +udp_tunnel_nic_entry_is_present(struct udp_tunnel_nic_table_entry *entry) +{ + return entry->use_cnt && !(entry->flags & ~UDP_TUNNEL_NIC_ENTRY_FROZEN); +} + +static bool +udp_tunnel_nic_entry_is_frozen(struct udp_tunnel_nic_table_entry *entry) +{ + return entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN; +} + +static void +udp_tunnel_nic_entry_freeze_used(struct udp_tunnel_nic_table_entry *entry) +{ + if (!udp_tunnel_nic_entry_is_free(entry)) + entry->flags |= UDP_TUNNEL_NIC_ENTRY_FROZEN; +} + +static void +udp_tunnel_nic_entry_unfreeze(struct udp_tunnel_nic_table_entry *entry) +{ + entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_FROZEN; +} + +static bool +udp_tunnel_nic_entry_is_queued(struct udp_tunnel_nic_table_entry *entry) +{ + return entry->flags & (UDP_TUNNEL_NIC_ENTRY_ADD | + UDP_TUNNEL_NIC_ENTRY_DEL); +} + +static void +udp_tunnel_nic_entry_queue(struct udp_tunnel_nic *utn, + struct udp_tunnel_nic_table_entry *entry, + unsigned int flag) +{ + entry->flags |= flag; + utn->need_sync = 1; +} + +static void +udp_tunnel_nic_ti_from_entry(struct udp_tunnel_nic_table_entry *entry, + struct udp_tunnel_info *ti) +{ + memset(ti, 0, sizeof(*ti)); + ti->port = entry->port; + ti->type = entry->type; + ti->hw_priv = entry->hw_priv; +} + +static bool 
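For orientation in the helpers that follow: an entry is "free" when use_cnt is 0 and no flags are set, and "present" once it is in use with nothing pending (FROZEN excepted). ADD and DEL mark work queued toward the driver, OP_FAIL keeps a failed op eligible for retry, and FROZEN parks an entry while a replay re-walks the stack's tunnel list. A hedged restatement of the two key predicates over plain values:

enum { ENTRY_ADD = 1, ENTRY_DEL = 2, ENTRY_OP_FAIL = 4, ENTRY_FROZEN = 8 };

static bool entry_is_free(unsigned int use_cnt, unsigned int flags)
{
	return use_cnt == 0 && flags == 0;
}

static bool entry_is_present(unsigned int use_cnt, unsigned int flags)
{
	/* in use, and nothing pending besides a possible freeze */
	return use_cnt != 0 && (flags & ~ENTRY_FROZEN) == 0;
}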
+udp_tunnel_nic_is_empty(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + unsigned int i, j; + + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) + if (!udp_tunnel_nic_entry_is_free(&utn->entries[i][j])) + return false; + return true; +} + +static bool +udp_tunnel_nic_should_replay(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_table_info *table; + unsigned int i, j; + + if (!utn->missed) + return false; + + for (i = 0; i < utn->n_tables; i++) { + table = &dev->udp_tunnel_nic_info->tables[i]; + if (!test_bit(i, &utn->missed)) + continue; + + for (j = 0; j < table->n_entries; j++) + if (udp_tunnel_nic_entry_is_free(&utn->entries[i][j])) + return true; + } + + return false; +} + +static void +__udp_tunnel_nic_get_port(struct net_device *dev, unsigned int table, + unsigned int idx, struct udp_tunnel_info *ti) +{ + struct udp_tunnel_nic_table_entry *entry; + struct udp_tunnel_nic *utn; + + utn = dev->udp_tunnel_nic; + entry = &utn->entries[table][idx]; + + if (entry->use_cnt) + udp_tunnel_nic_ti_from_entry(entry, ti); +} + +static void +__udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table, + unsigned int idx, u8 priv) +{ + dev->udp_tunnel_nic->entries[table][idx].hw_priv = priv; +} + +static void +udp_tunnel_nic_entry_update_done(struct udp_tunnel_nic_table_entry *entry, + int err) +{ + bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL; + + WARN_ON_ONCE(entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD && + entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL); + + if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD && + (!err || (err == -EEXIST && dodgy))) + entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_ADD; + + if (entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL && + (!err || (err == -ENOENT && dodgy))) + entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_DEL; + + if (!err) + entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_OP_FAIL; + else + entry->flags |= UDP_TUNNEL_NIC_ENTRY_OP_FAIL; +} + +static void +udp_tunnel_nic_device_sync_one(struct net_device *dev, + struct udp_tunnel_nic *utn, + unsigned int table, unsigned int idx) +{ + struct udp_tunnel_nic_table_entry *entry; + struct udp_tunnel_info ti; + int err; + + entry = &utn->entries[table][idx]; + if (!udp_tunnel_nic_entry_is_queued(entry)) + return; + + udp_tunnel_nic_ti_from_entry(entry, &ti); + if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD) + err = dev->udp_tunnel_nic_info->set_port(dev, table, idx, &ti); + else + err = dev->udp_tunnel_nic_info->unset_port(dev, table, idx, + &ti); + udp_tunnel_nic_entry_update_done(entry, err); + + if (err) + netdev_warn(dev, + "UDP tunnel port sync failed port %d type %s: %d\n", + be16_to_cpu(entry->port), + udp_tunnel_nic_tunnel_type_name(entry->type), + err); +} + +static void +udp_tunnel_nic_device_sync_by_port(struct net_device *dev, + struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + unsigned int i, j; + + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) + udp_tunnel_nic_device_sync_one(dev, utn, i, j); +} + +static void +udp_tunnel_nic_device_sync_by_table(struct net_device *dev, + struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + unsigned int i, j; + int err; + + for (i = 0; i < utn->n_tables; i++) { + /* Find something that needs sync in this table */ + for (j = 0; j < info->tables[i].n_entries; j++) + if 
(udp_tunnel_nic_entry_is_queued(&utn->entries[i][j])) + break; + if (j == info->tables[i].n_entries) + continue; + + err = info->sync_table(dev, i); + if (err) + netdev_warn(dev, "UDP tunnel port sync failed for table %d: %d\n", + i, err); + + for (j = 0; j < info->tables[i].n_entries; j++) { + struct udp_tunnel_nic_table_entry *entry; + + entry = &utn->entries[i][j]; + if (udp_tunnel_nic_entry_is_queued(entry)) + udp_tunnel_nic_entry_update_done(entry, err); + } + } +} + +static void +__udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + if (!utn->need_sync) + return; + + if (dev->udp_tunnel_nic_info->sync_table) + udp_tunnel_nic_device_sync_by_table(dev, utn); + else + udp_tunnel_nic_device_sync_by_port(dev, utn); + + utn->need_sync = 0; + /* Can't replay directly here, in case we come from the tunnel driver's + * notification - trying to replay may deadlock inside tunnel driver. + */ + utn->need_replay = udp_tunnel_nic_should_replay(dev, utn); +} + +static void +udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + bool may_sleep; + + if (!utn->need_sync) + return; + + /* Drivers which sleep in the callback need to update from + * the workqueue, if we come from the tunnel driver's notification. + */ + may_sleep = info->flags & UDP_TUNNEL_NIC_INFO_MAY_SLEEP; + if (!may_sleep) + __udp_tunnel_nic_device_sync(dev, utn); + if (may_sleep || utn->need_replay) { + queue_work(udp_tunnel_nic_workqueue, &utn->work); + utn->work_pending = 1; + } +} + +static bool +udp_tunnel_nic_table_is_capable(const struct udp_tunnel_nic_table_info *table, + struct udp_tunnel_info *ti) +{ + return table->tunnel_types & ti->type; +} + +static bool +udp_tunnel_nic_is_capable(struct net_device *dev, struct udp_tunnel_nic *utn, + struct udp_tunnel_info *ti) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + unsigned int i; + + /* Special case IPv4-only NICs */ + if (info->flags & UDP_TUNNEL_NIC_INFO_IPV4_ONLY && + ti->sa_family != AF_INET) + return false; + + for (i = 0; i < utn->n_tables; i++) + if (udp_tunnel_nic_table_is_capable(&info->tables[i], ti)) + return true; + return false; +} + +static int +udp_tunnel_nic_has_collision(struct net_device *dev, struct udp_tunnel_nic *utn, + struct udp_tunnel_info *ti) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic_table_entry *entry; + unsigned int i, j; + + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) { + entry = &utn->entries[i][j]; + + if (!udp_tunnel_nic_entry_is_free(entry) && + entry->port == ti->port && + entry->type != ti->type) { + __set_bit(i, &utn->missed); + return true; + } + } + return false; +} + +static void +udp_tunnel_nic_entry_adj(struct udp_tunnel_nic *utn, + unsigned int table, unsigned int idx, int use_cnt_adj) +{ + struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx]; + bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL; + unsigned int from, to; + + /* If not going from used to unused or vice versa - all done. + * For dodgy entries make sure we try to sync again (queue the entry). + */ + entry->use_cnt += use_cnt_adj; + if (!dodgy && !entry->use_cnt == !(entry->use_cnt - use_cnt_adj)) + return; + + /* Cancel the op before it was sent to the device, if possible, + * otherwise we'd need to take special care to issue commands + * in the same order the ports arrived. 
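The sync machinery above supports two driver models, per-entry set_port()/unset_port() callbacks or a whole-table sync_table() callback, and drivers that sleep in those callbacks set UDP_TUNNEL_NIC_INFO_MAY_SLEEP so the sync is deferred to the workqueue instead of running in (possibly atomic) notification context. A hedged sketch of a driver declaration using the sleeping, table-sync model (the callback name is illustrative; a real driver points dev->udp_tunnel_nic_info at this before registering the netdev):

static int example_sync_table(struct net_device *dev, unsigned int table);

static const struct udp_tunnel_nic_info example_udp_tunnel_info = {
	.sync_table	= example_sync_table,
	.flags		= UDP_TUNNEL_NIC_INFO_MAY_SLEEP,
	.tables		= {
		{
			.n_entries	= 4,
			.tunnel_types	= UDP_TUNNEL_TYPE_VXLAN |
					  UDP_TUNNEL_TYPE_GENEVE,
		},
	},
};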
+ */ + if (use_cnt_adj < 0) { + from = UDP_TUNNEL_NIC_ENTRY_ADD; + to = UDP_TUNNEL_NIC_ENTRY_DEL; + } else { + from = UDP_TUNNEL_NIC_ENTRY_DEL; + to = UDP_TUNNEL_NIC_ENTRY_ADD; + } + + if (entry->flags & from) { + entry->flags &= ~from; + if (!dodgy) + return; + } + + udp_tunnel_nic_entry_queue(utn, entry, to); +} + +static bool +udp_tunnel_nic_entry_try_adj(struct udp_tunnel_nic *utn, + unsigned int table, unsigned int idx, + struct udp_tunnel_info *ti, int use_cnt_adj) +{ + struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx]; + + if (udp_tunnel_nic_entry_is_free(entry) || + entry->port != ti->port || + entry->type != ti->type) + return false; + + if (udp_tunnel_nic_entry_is_frozen(entry)) + return true; + + udp_tunnel_nic_entry_adj(utn, table, idx, use_cnt_adj); + return true; +} + +/* Try to find existing matching entry and adjust its use count, instead of + * adding a new one. Returns true if entry was found. In case of delete the + * entry may have gotten removed in the process, in which case it will be + * queued for removal. + */ +static bool +udp_tunnel_nic_try_existing(struct net_device *dev, struct udp_tunnel_nic *utn, + struct udp_tunnel_info *ti, int use_cnt_adj) +{ + const struct udp_tunnel_nic_table_info *table; + unsigned int i, j; + + for (i = 0; i < utn->n_tables; i++) { + table = &dev->udp_tunnel_nic_info->tables[i]; + if (!udp_tunnel_nic_table_is_capable(table, ti)) + continue; + + for (j = 0; j < table->n_entries; j++) + if (udp_tunnel_nic_entry_try_adj(utn, i, j, ti, + use_cnt_adj)) + return true; + } + + return false; +} + +static bool +udp_tunnel_nic_add_existing(struct net_device *dev, struct udp_tunnel_nic *utn, + struct udp_tunnel_info *ti) +{ + return udp_tunnel_nic_try_existing(dev, utn, ti, +1); +} + +static bool +udp_tunnel_nic_del_existing(struct net_device *dev, struct udp_tunnel_nic *utn, + struct udp_tunnel_info *ti) +{ + return udp_tunnel_nic_try_existing(dev, utn, ti, -1); +} + +static bool +udp_tunnel_nic_add_new(struct net_device *dev, struct udp_tunnel_nic *utn, + struct udp_tunnel_info *ti) +{ + const struct udp_tunnel_nic_table_info *table; + unsigned int i, j; + + for (i = 0; i < utn->n_tables; i++) { + table = &dev->udp_tunnel_nic_info->tables[i]; + if (!udp_tunnel_nic_table_is_capable(table, ti)) + continue; + + for (j = 0; j < table->n_entries; j++) { + struct udp_tunnel_nic_table_entry *entry; + + entry = &utn->entries[i][j]; + if (!udp_tunnel_nic_entry_is_free(entry)) + continue; + + entry->port = ti->port; + entry->type = ti->type; + entry->use_cnt = 1; + udp_tunnel_nic_entry_queue(utn, entry, + UDP_TUNNEL_NIC_ENTRY_ADD); + return true; + } + + /* The different table may still fit this port in, but there + * are no devices currently which have multiple tables accepting + * the same tunnel type, and false positives are okay. 
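One expression in udp_tunnel_nic_entry_adj() above deserves unpacking: "!entry->use_cnt == !(entry->use_cnt - use_cnt_adj)" compares the zero-ness of the new and old use counts, so it is true exactly when the entry did not cross between used and unused. Restated directly, with a worked case in the comment:

static bool crossed_zero(unsigned int old_cnt, int adj)
{
	unsigned int new_cnt = old_cnt + adj;

	/* e.g. old_cnt 1, adj -1: new_cnt 0, a crossing, so a DEL must be
	 * queued (or a still-pending ADD cancelled). old_cnt 2, adj -1:
	 * no crossing, nothing to tell the device. */
	return (old_cnt == 0) != (new_cnt == 0);
}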
+ */ + __set_bit(i, &utn->missed); + } + + return false; +} + +static void +__udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic *utn; + + utn = dev->udp_tunnel_nic; + if (!utn) + return; + if (!netif_running(dev) && info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY) + return; + if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN && + ti->port == htons(IANA_VXLAN_UDP_PORT)) { + if (ti->type != UDP_TUNNEL_TYPE_VXLAN) + netdev_warn(dev, "device assumes port 4789 will be used by vxlan tunnels\n"); + return; + } + + if (!udp_tunnel_nic_is_capable(dev, utn, ti)) + return; + + /* It may happen that a tunnel of one type is removed and different + * tunnel type tries to reuse its port before the device was informed. + * Rely on utn->missed to re-add this port later. + */ + if (udp_tunnel_nic_has_collision(dev, utn, ti)) + return; + + if (!udp_tunnel_nic_add_existing(dev, utn, ti)) + udp_tunnel_nic_add_new(dev, utn, ti); + + udp_tunnel_nic_device_sync(dev, utn); +} + +static void +__udp_tunnel_nic_del_port(struct net_device *dev, struct udp_tunnel_info *ti) +{ + struct udp_tunnel_nic *utn; + + utn = dev->udp_tunnel_nic; + if (!utn) + return; + + if (!udp_tunnel_nic_is_capable(dev, utn, ti)) + return; + + udp_tunnel_nic_del_existing(dev, utn, ti); + + udp_tunnel_nic_device_sync(dev, utn); +} + +static void __udp_tunnel_nic_reset_ntf(struct net_device *dev) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic *utn; + unsigned int i, j; + + ASSERT_RTNL(); + + utn = dev->udp_tunnel_nic; + if (!utn) + return; + + utn->need_sync = false; + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) { + struct udp_tunnel_nic_table_entry *entry; + + entry = &utn->entries[i][j]; + + entry->flags &= ~(UDP_TUNNEL_NIC_ENTRY_DEL | + UDP_TUNNEL_NIC_ENTRY_OP_FAIL); + /* We don't release rtnl across ops */ + WARN_ON(entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN); + if (!entry->use_cnt) + continue; + + udp_tunnel_nic_entry_queue(utn, entry, + UDP_TUNNEL_NIC_ENTRY_ADD); + } + + __udp_tunnel_nic_device_sync(dev, utn); +} + +static size_t +__udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic *utn; + unsigned int j; + size_t size; + + utn = dev->udp_tunnel_nic; + if (!utn) + return 0; + + size = 0; + for (j = 0; j < info->tables[table].n_entries; j++) { + if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) + continue; + + size += nla_total_size(0) + /* _TABLE_ENTRY */ + nla_total_size(sizeof(__be16)) + /* _ENTRY_PORT */ + nla_total_size(sizeof(u32)); /* _ENTRY_TYPE */ + } + + return size; +} + +static int +__udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table, + struct sk_buff *skb) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic *utn; + struct nlattr *nest; + unsigned int j; + + utn = dev->udp_tunnel_nic; + if (!utn) + return 0; + + for (j = 0; j < info->tables[table].n_entries; j++) { + if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) + continue; + + nest = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY); + + if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, + utn->entries[table][j].port) || + nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, + ilog2(utn->entries[table][j].type))) + goto err_cancel; + + nla_nest_end(skb, 
nest); + } + + return 0; + +err_cancel: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = { + .get_port = __udp_tunnel_nic_get_port, + .set_port_priv = __udp_tunnel_nic_set_port_priv, + .add_port = __udp_tunnel_nic_add_port, + .del_port = __udp_tunnel_nic_del_port, + .reset_ntf = __udp_tunnel_nic_reset_ntf, + .dump_size = __udp_tunnel_nic_dump_size, + .dump_write = __udp_tunnel_nic_dump_write, +}; + +static void +udp_tunnel_nic_flush(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + unsigned int i, j; + + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) { + int adj_cnt = -utn->entries[i][j].use_cnt; + + if (adj_cnt) + udp_tunnel_nic_entry_adj(utn, i, j, adj_cnt); + } + + __udp_tunnel_nic_device_sync(dev, utn); + + for (i = 0; i < utn->n_tables; i++) + memset(utn->entries[i], 0, array_size(info->tables[i].n_entries, + sizeof(**utn->entries))); + WARN_ON(utn->need_sync); + utn->need_replay = 0; +} + +static void +udp_tunnel_nic_replay(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + unsigned int i, j; + + /* Freeze all the ports we are already tracking so that the replay + * does not double up the refcount. + */ + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) + udp_tunnel_nic_entry_freeze_used(&utn->entries[i][j]); + utn->missed = 0; + utn->need_replay = 0; + + udp_tunnel_get_rx_info(dev); + + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) + udp_tunnel_nic_entry_unfreeze(&utn->entries[i][j]); +} + +static void udp_tunnel_nic_device_sync_work(struct work_struct *work) +{ + struct udp_tunnel_nic *utn = + container_of(work, struct udp_tunnel_nic, work); + + rtnl_lock(); + utn->work_pending = 0; + __udp_tunnel_nic_device_sync(utn->dev, utn); + + if (utn->need_replay) + udp_tunnel_nic_replay(utn->dev, utn); + rtnl_unlock(); +} + +static struct udp_tunnel_nic * +udp_tunnel_nic_alloc(const struct udp_tunnel_nic_info *info, + unsigned int n_tables) +{ + struct udp_tunnel_nic *utn; + unsigned int i; + + utn = kzalloc(sizeof(*utn), GFP_KERNEL); + if (!utn) + return NULL; + utn->n_tables = n_tables; + INIT_WORK(&utn->work, udp_tunnel_nic_device_sync_work); + + utn->entries = kmalloc_array(n_tables, sizeof(void *), GFP_KERNEL); + if (!utn->entries) + goto err_free_utn; + + for (i = 0; i < n_tables; i++) { + utn->entries[i] = kcalloc(info->tables[i].n_entries, + sizeof(*utn->entries[i]), GFP_KERNEL); + if (!utn->entries[i]) + goto err_free_prev_entries; + } + + return utn; + +err_free_prev_entries: + while (i--) + kfree(utn->entries[i]); + kfree(utn->entries); +err_free_utn: + kfree(utn); + return NULL; +} + +static int udp_tunnel_nic_register(struct net_device *dev) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic *utn; + unsigned int n_tables, i; + + BUILD_BUG_ON(sizeof(utn->missed) * BITS_PER_BYTE < + UDP_TUNNEL_NIC_MAX_TABLES); + + if (WARN_ON(!info->set_port != !info->unset_port) || + WARN_ON(!info->set_port == !info->sync_table) || + WARN_ON(!info->tables[0].n_entries)) + return -EINVAL; + + n_tables = 1; + for (i = 1; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { + if (!info->tables[i].n_entries) + continue; + + n_tables++; + if (WARN_ON(!info->tables[i - 1].n_entries)) + return -EINVAL; + } + + utn = udp_tunnel_nic_alloc(info, 
n_tables); + if (!utn) + return -ENOMEM; + + utn->dev = dev; + dev_hold(dev); + dev->udp_tunnel_nic = utn; + + if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) + udp_tunnel_get_rx_info(dev); + + return 0; +} + +static void +udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + unsigned int i; + + /* Flush before we check work, so we don't waste time adding entries + * from the work which we will boot immediately. + */ + udp_tunnel_nic_flush(dev, utn); + + /* Wait for the work to be done using the state, netdev core will + * retry unregister until we give up our reference on this device. + */ + if (utn->work_pending) + return; + + for (i = 0; i < utn->n_tables; i++) + kfree(utn->entries[i]); + kfree(utn->entries); + kfree(utn); + dev->udp_tunnel_nic = NULL; + dev_put(dev); +} + +static int +udp_tunnel_nic_netdevice_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + const struct udp_tunnel_nic_info *info; + struct udp_tunnel_nic *utn; + + info = dev->udp_tunnel_nic_info; + if (!info) + return NOTIFY_DONE; + + if (event == NETDEV_REGISTER) { + int err; + + err = udp_tunnel_nic_register(dev); + if (err) + netdev_WARN(dev, "failed to register for UDP tunnel offloads: %d", err); + return notifier_from_errno(err); + } + /* All other events will need the udp_tunnel_nic state */ + utn = dev->udp_tunnel_nic; + if (!utn) + return NOTIFY_DONE; + + if (event == NETDEV_UNREGISTER) { + udp_tunnel_nic_unregister(dev, utn); + return NOTIFY_OK; + } + + /* All other events only matter if NIC has to be programmed open */ + if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) + return NOTIFY_DONE; + + if (event == NETDEV_UP) { + WARN_ON(!udp_tunnel_nic_is_empty(dev, utn)); + udp_tunnel_get_rx_info(dev); + return NOTIFY_OK; + } + if (event == NETDEV_GOING_DOWN) { + udp_tunnel_nic_flush(dev, utn); + return NOTIFY_OK; + } + + return NOTIFY_DONE; +} + +static struct notifier_block udp_tunnel_nic_notifier_block __read_mostly = { + .notifier_call = udp_tunnel_nic_netdevice_event, +}; + +static int __init udp_tunnel_nic_init_module(void) +{ + int err; + + udp_tunnel_nic_workqueue = alloc_workqueue("udp_tunnel_nic", 0, 0); + if (!udp_tunnel_nic_workqueue) + return -ENOMEM; + + rtnl_lock(); + udp_tunnel_nic_ops = &__udp_tunnel_nic_ops; + rtnl_unlock(); + + err = register_netdevice_notifier(&udp_tunnel_nic_notifier_block); + if (err) + goto err_unset_ops; + + return 0; + +err_unset_ops: + rtnl_lock(); + udp_tunnel_nic_ops = NULL; + rtnl_unlock(); + destroy_workqueue(udp_tunnel_nic_workqueue); + return err; +} +late_initcall(udp_tunnel_nic_init_module); + +static void __exit udp_tunnel_nic_cleanup_module(void) +{ + unregister_netdevice_notifier(&udp_tunnel_nic_notifier_block); + + rtnl_lock(); + udp_tunnel_nic_ops = NULL; + rtnl_unlock(); + + destroy_workqueue(udp_tunnel_nic_workqueue); +} +module_exit(udp_tunnel_nic_cleanup_module); + +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/udp_tunnel_stub.c b/net/ipv4/udp_tunnel_stub.c new file mode 100644 index 000000000000..c4b2888f5fef --- /dev/null +++ b/net/ipv4/udp_tunnel_stub.c @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (c) 2020 Facebook Inc. 
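+// The stub only defines the ops pointer consumed by the core stack;
+// the full udp_tunnel_nic implementation assigns &__udp_tunnel_nic_ops
+// to it at init time (see udp_tunnel_nic_init_module() above).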
+ +#include <net/udp_tunnel.h> + +const struct udp_tunnel_nic_ops *udp_tunnel_nic_ops; +EXPORT_SYMBOL_GPL(udp_tunnel_nic_ops); diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 5936d66d1ce2..bd8773b49e72 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -56,10 +56,6 @@ struct proto udplite_prot = { .sysctl_mem = sysctl_udp_mem, .obj_size = sizeof(struct udp_sock), .h.udp_table = &udplite_table, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_udp_setsockopt, - .compat_getsockopt = compat_udp_getsockopt, -#endif }; EXPORT_SYMBOL(udplite_prot); diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index f4f19e89af5e..76bff79d6fed 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -14,7 +14,7 @@ menuconfig IPV6 <https://en.wikipedia.org/wiki/IPv6>. For specific information about IPv6 under Linux, see Documentation/networking/ipv6.rst and read the HOWTO at - <http://www.tldp.org/HOWTO/Linux+IPv6-HOWTO/> + <https://www.tldp.org/HOWTO/Linux+IPv6-HOWTO/> To compile this protocol support as a module, choose M here: the module will be called ipv6. diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 840bfdb3d7bd..8e761b8c47c6 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -163,7 +163,7 @@ static void addrconf_leave_anycast(struct inet6_ifaddr *ifp); static void addrconf_type_change(struct net_device *dev, unsigned long event); -static int addrconf_ifdown(struct net_device *dev, int how); +static int addrconf_ifdown(struct net_device *dev, bool unregister); static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, int plen, @@ -1983,6 +1983,45 @@ int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev) } EXPORT_SYMBOL(ipv6_chk_prefix); +/** + * ipv6_dev_find - find the first device with a given source address. + * @net: the net namespace + * @addr: the source address + * + * The caller should be protected by RCU, or RTNL. + */ +struct net_device *ipv6_dev_find(struct net *net, const struct in6_addr *addr) +{ + unsigned int hash = inet6_addr_hash(net, addr); + struct inet6_ifaddr *ifp, *result = NULL; + struct net_device *dev = NULL; + + rcu_read_lock(); + hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) { + if (net_eq(dev_net(ifp->idev->dev), net) && + ipv6_addr_equal(&ifp->addr, addr)) { + result = ifp; + break; + } + } + + if (!result) { + struct rt6_info *rt; + + rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); + if (rt) { + dev = rt->dst.dev; + ip6_rt_put(rt); + } + } else { + dev = result->idev->dev; + } + rcu_read_unlock(); + + return dev; +} +EXPORT_SYMBOL(ipv6_dev_find); + struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *addr, struct net_device *dev, int strict) { @@ -3630,7 +3669,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, * an L3 master device (e.g., VRF) */ if (info->upper_dev && netif_is_l3_master(info->upper_dev)) - addrconf_ifdown(dev, 0); + addrconf_ifdown(dev, false); } return NOTIFY_OK; @@ -3663,9 +3702,9 @@ static bool addr_is_local(const struct in6_addr *addr) (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } -static int addrconf_ifdown(struct net_device *dev, int how) +static int addrconf_ifdown(struct net_device *dev, bool unregister) { - unsigned long event = how ? NETDEV_UNREGISTER : NETDEV_DOWN; + unsigned long event = unregister ? 
NETDEV_UNREGISTER : NETDEV_DOWN; struct net *net = dev_net(dev); struct inet6_dev *idev; struct inet6_ifaddr *ifa, *tmp; @@ -3684,7 +3723,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) * Step 1: remove reference to ipv6 device from parent device. * Do not dev_put! */ - if (how) { + if (unregister) { idev->dead = 1; /* protected by rtnl_lock */ @@ -3698,7 +3737,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) /* combine the user config with event to determine if permanent * addresses are to be removed from address hash table */ - if (!how && !idev->cnf.disable_ipv6) { + if (!unregister && !idev->cnf.disable_ipv6) { /* aggregate the system setting and interface setting */ int _keep_addr = net->ipv6.devconf_all->keep_addr_on_down; @@ -3736,7 +3775,7 @@ restart: addrconf_del_rs_timer(idev); /* Step 2: clear flags for stateless addrconf */ - if (!how) + if (!unregister) idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY); /* Step 3: clear tempaddr list */ @@ -3806,7 +3845,7 @@ restart: write_unlock_bh(&idev->lock); /* Step 5: Discard anycast and multicast list */ - if (how) { + if (unregister) { ipv6_ac_destroy_dev(idev); ipv6_mc_destroy_dev(idev); } else { @@ -3816,7 +3855,7 @@ restart: idev->tstamp = jiffies; /* Last: Shot the device (if unregistered) */ - if (how) { + if (unregister) { addrconf_sysctl_unregister(idev); neigh_parms_release(&nd_tbl, idev->nd_parms); neigh_ifdown(&nd_tbl, dev); @@ -4038,7 +4077,7 @@ static void addrconf_dad_work(struct work_struct *w) in6_ifa_hold(ifp); addrconf_dad_stop(ifp, 1); if (disable_ipv6) - addrconf_ifdown(idev->dev, 0); + addrconf_ifdown(idev->dev, false); goto out; } @@ -7187,9 +7226,9 @@ void addrconf_cleanup(void) for_each_netdev(&init_net, dev) { if (__in6_dev_get(dev) == NULL) continue; - addrconf_ifdown(dev, 1); + addrconf_ifdown(dev, true); } - addrconf_ifdown(init_net.loopback_dev, 2); + addrconf_ifdown(init_net.loopback_dev, true); /* * Check hash table. 
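The ipv6_dev_find() helper added above is exported for use by other modules. A minimal, purely illustrative caller (my_addr_is_assigned() is not part of the patch), honouring the kernel-doc rule that the result is only stable while the caller holds RCU or RTNL:

	static bool my_addr_is_assigned(struct net *net, const struct in6_addr *addr)
	{
		struct net_device *dev;
		bool assigned;

		rcu_read_lock();
		/* Hash-table lookup first; rt6_lookup() fallback inside the helper. */
		dev = ipv6_dev_find(net, addr);
		assigned = dev != NULL;
		rcu_read_unlock();

		return assigned;
	}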
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b304b882e031..0306509ab063 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -688,8 +688,6 @@ const struct proto_ops inet6_stream_ops = { .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, #endif .set_rcvlowat = tcp_set_rcvlowat, }; @@ -717,8 +715,6 @@ const struct proto_ops inet6_dgram_ops = { .set_peek_off = sk_set_peek_off, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, #endif }; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 390bedde21a5..cc8ad7ddecda 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -19,6 +19,7 @@ #include <linux/route.h> #include <linux/slab.h> #include <linux/export.h> +#include <linux/icmp.h> #include <net/ipv6.h> #include <net/ndisc.h> @@ -284,6 +285,17 @@ int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *uaddr, } EXPORT_SYMBOL_GPL(ip6_datagram_connect_v6_only); +static void ipv6_icmp_error_rfc4884(const struct sk_buff *skb, + struct sock_ee_data_rfc4884 *out) +{ + switch (icmp6_hdr(skb)->icmp6_type) { + case ICMPV6_TIME_EXCEED: + case ICMPV6_DEST_UNREACH: + ip_icmp_error_rfc4884(skb, out, sizeof(struct icmp6hdr), + icmp6_hdr(skb)->icmp6_datagram_len * 8); + } +} + void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) { @@ -313,6 +325,10 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, serr->port = port; __skb_pull(skb, payload - skb->data); + + if (inet6_sk(sk)->recverr_rfc4884) + ipv6_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884); + skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb)) diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 5a8bbcdcaf2b..374105e4394f 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -580,7 +580,7 @@ looped_back: hdr->segments_left--; i = n - hdr->segments_left; - buf = kzalloc(ipv6_rpl_srh_alloc_size(n + 1) * 2, GFP_ATOMIC); + buf = kcalloc(struct_size(hdr, segments.addr, n + 2), 2, GFP_ATOMIC); if (unlikely(!buf)) { kfree_skb(skb); return -1; @@ -1232,7 +1232,6 @@ static void ipv6_renew_option(int renewtype, * @opt: original options * @newtype: option type to replace in @opt * @newopt: new option of type @newtype to replace (user-mem) - * @newoptlen: length of @newopt * * Returns a new set of options which is a copy of @opt with the * option type @newtype replaced with @newopt. 
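The exthdrs.c hunk above replaces open-coded size arithmetic with kcalloc() and struct_size(). A self-contained sketch of the idiom (the struct and function names are illustrative, not from the patch; struct_size() comes from linux/overflow.h):

	struct seg_list {
		unsigned char n;
		struct in6_addr addr[];	/* flexible array member */
	};

	static struct seg_list *seg_list_alloc(unsigned int n, gfp_t gfp)
	{
		struct seg_list *sl;

		/* struct_size() evaluates to sizeof(*sl) + n * sizeof(sl->addr[0])
		 * and saturates to SIZE_MAX on overflow, so the allocation fails
		 * cleanly instead of wrapping to a too-small size. */
		sl = kzalloc(struct_size(sl, addr, n), gfp);
		if (sl)
			sl->n = n;
		return sl;
	}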
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index fafe556d21e0..8f9a83314de7 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -13,6 +13,7 @@ #include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/export.h> +#include <linux/indirect_call_wrapper.h> #include <net/fib_rules.h> #include <net/ipv6.h> @@ -111,11 +112,13 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, } else { struct rt6_info *rt; - rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, skb, flags); + rt = pol_lookup_func(lookup, + net, net->ipv6.fib6_local_tbl, fl6, skb, flags); if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN) return &rt->dst; ip6_rt_put_flags(rt, flags); - rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); + rt = pol_lookup_func(lookup, + net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error != -EAGAIN) return &rt->dst; ip6_rt_put_flags(rt, flags); @@ -226,7 +229,8 @@ static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp, goto out; } - rt = lookup(net, table, flp6, arg->lookup_data, flags); + rt = pol_lookup_func(lookup, + net, table, flp6, arg->lookup_data, flags); if (rt != net->ipv6.ip6_null_entry) { err = fib6_rule_saddr(net, rule, flags, flp6, ip6_dst_idev(&rt->dst)->dev); @@ -252,8 +256,9 @@ out: return err; } -static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, - int flags, struct fib_lookup_arg *arg) +INDIRECT_CALLABLE_SCOPE int fib6_rule_action(struct fib_rule *rule, + struct flowi *flp, int flags, + struct fib_lookup_arg *arg) { if (arg->lookup_ptr == fib6_table_lookup) return fib6_rule_action_alt(rule, flp, flags, arg); @@ -261,7 +266,8 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, return __fib6_rule_action(rule, flp, flags, arg); } -static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) +INDIRECT_CALLABLE_SCOPE bool fib6_rule_suppress(struct fib_rule *rule, + struct fib_lookup_arg *arg) { struct fib6_result *res = arg->result; struct rt6_info *rt = res->rt6; @@ -293,7 +299,8 @@ suppress_route: return true; } -static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) +INDIRECT_CALLABLE_SCOPE int fib6_rule_match(struct fib_rule *rule, + struct flowi *fl, int flags) { struct fib6_rule *r = (struct fib6_rule *) rule; struct flowi6 *fl6 = &fl->u.ip6; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 9df8737ae0d3..a4e4912ad607 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -439,8 +439,8 @@ static int icmp6_iif(const struct sk_buff *skb) /* * Send an ICMP message in response to a packet in error */ -static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, - const struct in6_addr *force_saddr) +void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, + const struct in6_addr *force_saddr) { struct inet6_dev *idev = NULL; struct ipv6hdr *hdr = ipv6_hdr(skb); @@ -625,6 +625,7 @@ out: out_bh_enable: local_bh_enable(); } +EXPORT_SYMBOL(icmp6_send); /* Slightly more convenient version of icmp6_send. 
*/ diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index fbe9d4295eac..2d3add9e6116 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -21,6 +21,8 @@ #include <net/ip.h> #include <net/sock_reuseport.h> +extern struct inet_hashinfo tcp_hashinfo; + u32 inet6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, const struct in6_addr *faddr, const __be16 fport) @@ -111,6 +113,23 @@ static inline int compute_score(struct sock *sk, struct net *net, return score; } +static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + __be16 sport, + const struct in6_addr *daddr, + unsigned short hnum) +{ + struct sock *reuse_sk = NULL; + u32 phash; + + if (sk->sk_reuseport) { + phash = inet6_ehashfn(net, daddr, hnum, saddr, sport); + reuse_sk = reuseport_select_sock(sk, phash, skb, doff); + } + return reuse_sk; +} + /* called with rcu_read_lock() */ static struct sock *inet6_lhash2_lookup(struct net *net, struct inet_listen_hashbucket *ilb2, @@ -123,21 +142,17 @@ static struct sock *inet6_lhash2_lookup(struct net *net, struct inet_connection_sock *icsk; struct sock *sk, *result = NULL; int score, hiscore = 0; - u32 phash = 0; inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { sk = (struct sock *)icsk; score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif); if (score > hiscore) { - if (sk->sk_reuseport) { - phash = inet6_ehashfn(net, daddr, hnum, - saddr, sport); - result = reuseport_select_sock(sk, phash, - skb, doff); - if (result) - return result; - } + result = lookup_reuseport(net, sk, skb, doff, + saddr, sport, daddr, hnum); + if (result) + return result; + result = sk; hiscore = score; } @@ -146,6 +161,31 @@ static struct sock *inet6_lhash2_lookup(struct net *net, return result; } +static inline struct sock *inet6_lookup_run_bpf(struct net *net, + struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + const __be16 sport, + const struct in6_addr *daddr, + const u16 hnum) +{ + struct sock *sk, *reuse_sk; + bool no_reuseport; + + if (hashinfo != &tcp_hashinfo) + return NULL; /* only TCP is supported */ + + no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_TCP, + saddr, sport, daddr, hnum, &sk); + if (no_reuseport || IS_ERR_OR_NULL(sk)) + return sk; + + reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); + if (reuse_sk) + sk = reuse_sk; + return sk; +} + struct sock *inet6_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -157,6 +197,14 @@ struct sock *inet6_lookup_listener(struct net *net, struct sock *result = NULL; unsigned int hash2; + /* Lookup redirect from BPF */ + if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { + result = inet6_lookup_run_bpf(net, hashinfo, skb, doff, + saddr, sport, daddr, hnum); + if (result) + goto done; + } + hash2 = ipv6_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 49ee89bbcba0..25a90f3f705c 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -314,7 +314,8 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, { struct rt6_info *rt; - rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); + rt = pol_lookup_func(lookup, + net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error == -EAGAIN) { ip6_rt_put_flags(rt, flags); rt = net->ipv6.ip6_null_entry; diff 
--git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 73bb047e6037..aa673a6a7e43 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -371,7 +371,7 @@ static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger, unsigned lo static struct ip6_flowlabel * fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, - char __user *optval, int optlen, int *err_p) + sockptr_t optval, int optlen, int *err_p) { struct ip6_flowlabel *fl = NULL; int olen; @@ -401,7 +401,8 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, memset(fl->opt, 0, sizeof(*fl->opt)); fl->opt->tot_len = sizeof(*fl->opt) + olen; err = -EFAULT; - if (copy_from_user(fl->opt+1, optval+CMSG_ALIGN(sizeof(*freq)), olen)) + if (copy_from_sockptr_offset(fl->opt + 1, optval, + CMSG_ALIGN(sizeof(*freq)), olen)) goto done; msg.msg_controllen = olen; @@ -533,187 +534,212 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, return -ENOENT; } -int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) +#define socklist_dereference(__sflp) \ + rcu_dereference_protected(__sflp, lockdep_is_held(&ip6_sk_fl_lock)) + +static int ipv6_flowlabel_put(struct sock *sk, struct in6_flowlabel_req *freq) { - int err; - struct net *net = sock_net(sk); struct ipv6_pinfo *np = inet6_sk(sk); - struct in6_flowlabel_req freq; - struct ipv6_fl_socklist *sfl1 = NULL; - struct ipv6_fl_socklist *sfl; struct ipv6_fl_socklist __rcu **sflp; - struct ip6_flowlabel *fl, *fl1 = NULL; + struct ipv6_fl_socklist *sfl; + if (freq->flr_flags & IPV6_FL_F_REFLECT) { + if (sk->sk_protocol != IPPROTO_TCP) + return -ENOPROTOOPT; + if (!np->repflow) + return -ESRCH; + np->flow_label = 0; + np->repflow = 0; + return 0; + } - if (optlen < sizeof(freq)) - return -EINVAL; + spin_lock_bh(&ip6_sk_fl_lock); + for (sflp = &np->ipv6_fl_list; + (sfl = socklist_dereference(*sflp)) != NULL; + sflp = &sfl->next) { + if (sfl->fl->label == freq->flr_label) + goto found; + } + spin_unlock_bh(&ip6_sk_fl_lock); + return -ESRCH; +found: + if (freq->flr_label == (np->flow_label & IPV6_FLOWLABEL_MASK)) + np->flow_label &= ~IPV6_FLOWLABEL_MASK; + *sflp = sfl->next; + spin_unlock_bh(&ip6_sk_fl_lock); + fl_release(sfl->fl); + kfree_rcu(sfl, rcu); + return 0; +} - if (copy_from_user(&freq, optval, sizeof(freq))) - return -EFAULT; +static int ipv6_flowlabel_renew(struct sock *sk, struct in6_flowlabel_req *freq) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); + struct ipv6_fl_socklist *sfl; + int err; - switch (freq.flr_action) { - case IPV6_FL_A_PUT: - if (freq.flr_flags & IPV6_FL_F_REFLECT) { - if (sk->sk_protocol != IPPROTO_TCP) - return -ENOPROTOOPT; - if (!np->repflow) - return -ESRCH; - np->flow_label = 0; - np->repflow = 0; - return 0; - } - spin_lock_bh(&ip6_sk_fl_lock); - for (sflp = &np->ipv6_fl_list; - (sfl = rcu_dereference_protected(*sflp, - lockdep_is_held(&ip6_sk_fl_lock))) != NULL; - sflp = &sfl->next) { - if (sfl->fl->label == freq.flr_label) { - if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK)) - np->flow_label &= ~IPV6_FLOWLABEL_MASK; - *sflp = sfl->next; - spin_unlock_bh(&ip6_sk_fl_lock); - fl_release(sfl->fl); - kfree_rcu(sfl, rcu); - return 0; - } + rcu_read_lock_bh(); + for_each_sk_fl_rcu(np, sfl) { + if (sfl->fl->label == freq->flr_label) { + err = fl6_renew(sfl->fl, freq->flr_linger, + freq->flr_expires); + rcu_read_unlock_bh(); + return err; } - spin_unlock_bh(&ip6_sk_fl_lock); - return -ESRCH; + } + rcu_read_unlock_bh(); 
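+ /* Label not on this socket's list: fall back to the global table,
+ * a path restricted to CAP_NET_ADMIN below.
+ */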
- case IPV6_FL_A_RENEW: - rcu_read_lock_bh(); - for_each_sk_fl_rcu(np, sfl) { - if (sfl->fl->label == freq.flr_label) { - err = fl6_renew(sfl->fl, freq.flr_linger, freq.flr_expires); - rcu_read_unlock_bh(); - return err; - } - } - rcu_read_unlock_bh(); + if (freq->flr_share == IPV6_FL_S_NONE && + ns_capable(net->user_ns, CAP_NET_ADMIN)) { + struct ip6_flowlabel *fl = fl_lookup(net, freq->flr_label); - if (freq.flr_share == IPV6_FL_S_NONE && - ns_capable(net->user_ns, CAP_NET_ADMIN)) { - fl = fl_lookup(net, freq.flr_label); - if (fl) { - err = fl6_renew(fl, freq.flr_linger, freq.flr_expires); - fl_release(fl); - return err; - } + if (fl) { + err = fl6_renew(fl, freq->flr_linger, + freq->flr_expires); + fl_release(fl); + return err; } - return -ESRCH; - - case IPV6_FL_A_GET: - if (freq.flr_flags & IPV6_FL_F_REFLECT) { - struct net *net = sock_net(sk); - if (net->ipv6.sysctl.flowlabel_consistency) { - net_info_ratelimited("Can not set IPV6_FL_F_REFLECT if flowlabel_consistency sysctl is enable\n"); - return -EPERM; - } + } + return -ESRCH; +} - if (sk->sk_protocol != IPPROTO_TCP) - return -ENOPROTOOPT; +static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq, + sockptr_t optval, int optlen) +{ + struct ipv6_fl_socklist *sfl, *sfl1 = NULL; + struct ip6_flowlabel *fl, *fl1 = NULL; + struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); + int err; - np->repflow = 1; - return 0; + if (freq->flr_flags & IPV6_FL_F_REFLECT) { + if (net->ipv6.sysctl.flowlabel_consistency) { + net_info_ratelimited("Can not set IPV6_FL_F_REFLECT if flowlabel_consistency sysctl is enable\n"); + return -EPERM; } - if (freq.flr_label & ~IPV6_FLOWLABEL_MASK) - return -EINVAL; + if (sk->sk_protocol != IPPROTO_TCP) + return -ENOPROTOOPT; + np->repflow = 1; + return 0; + } - if (net->ipv6.sysctl.flowlabel_state_ranges && - (freq.flr_label & IPV6_FLOWLABEL_STATELESS_FLAG)) - return -ERANGE; + if (freq->flr_label & ~IPV6_FLOWLABEL_MASK) + return -EINVAL; + if (net->ipv6.sysctl.flowlabel_state_ranges && + (freq->flr_label & IPV6_FLOWLABEL_STATELESS_FLAG)) + return -ERANGE; - fl = fl_create(net, sk, &freq, optval, optlen, &err); - if (!fl) - return err; - sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL); + fl = fl_create(net, sk, freq, optval, optlen, &err); + if (!fl) + return err; - if (freq.flr_label) { - err = -EEXIST; - rcu_read_lock_bh(); - for_each_sk_fl_rcu(np, sfl) { - if (sfl->fl->label == freq.flr_label) { - if (freq.flr_flags&IPV6_FL_F_EXCL) { - rcu_read_unlock_bh(); - goto done; - } - fl1 = sfl->fl; - if (!atomic_inc_not_zero(&fl1->users)) - fl1 = NULL; - break; + sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL); + + if (freq->flr_label) { + err = -EEXIST; + rcu_read_lock_bh(); + for_each_sk_fl_rcu(np, sfl) { + if (sfl->fl->label == freq->flr_label) { + if (freq->flr_flags & IPV6_FL_F_EXCL) { + rcu_read_unlock_bh(); + goto done; } + fl1 = sfl->fl; + if (!atomic_inc_not_zero(&fl1->users)) + fl1 = NULL; + break; } - rcu_read_unlock_bh(); + } + rcu_read_unlock_bh(); - if (!fl1) - fl1 = fl_lookup(net, freq.flr_label); - if (fl1) { + if (!fl1) + fl1 = fl_lookup(net, freq->flr_label); + if (fl1) { recheck: - err = -EEXIST; - if (freq.flr_flags&IPV6_FL_F_EXCL) - goto release; - err = -EPERM; - if (fl1->share == IPV6_FL_S_EXCL || - fl1->share != fl->share || - ((fl1->share == IPV6_FL_S_PROCESS) && - (fl1->owner.pid != fl->owner.pid)) || - ((fl1->share == IPV6_FL_S_USER) && - !uid_eq(fl1->owner.uid, fl->owner.uid))) - goto release; - - err = -ENOMEM; - if (!sfl1) - goto release; - if (fl->linger > 
fl1->linger) - fl1->linger = fl->linger; - if ((long)(fl->expires - fl1->expires) > 0) - fl1->expires = fl->expires; - fl_link(np, sfl1, fl1); - fl_free(fl); - return 0; + err = -EEXIST; + if (freq->flr_flags&IPV6_FL_F_EXCL) + goto release; + err = -EPERM; + if (fl1->share == IPV6_FL_S_EXCL || + fl1->share != fl->share || + ((fl1->share == IPV6_FL_S_PROCESS) && + (fl1->owner.pid != fl->owner.pid)) || + ((fl1->share == IPV6_FL_S_USER) && + !uid_eq(fl1->owner.uid, fl->owner.uid))) + goto release; + + err = -ENOMEM; + if (!sfl1) + goto release; + if (fl->linger > fl1->linger) + fl1->linger = fl->linger; + if ((long)(fl->expires - fl1->expires) > 0) + fl1->expires = fl->expires; + fl_link(np, sfl1, fl1); + fl_free(fl); + return 0; release: - fl_release(fl1); - goto done; - } - } - err = -ENOENT; - if (!(freq.flr_flags&IPV6_FL_F_CREATE)) - goto done; - - err = -ENOMEM; - if (!sfl1) + fl_release(fl1); goto done; + } + } + err = -ENOENT; + if (!(freq->flr_flags & IPV6_FL_F_CREATE)) + goto done; - err = mem_check(sk); - if (err != 0) - goto done; + err = -ENOMEM; + if (!sfl1) + goto done; - fl1 = fl_intern(net, fl, freq.flr_label); - if (fl1) - goto recheck; + err = mem_check(sk); + if (err != 0) + goto done; - if (!freq.flr_label) { - if (copy_to_user(&((struct in6_flowlabel_req __user *) optval)->flr_label, - &fl->label, sizeof(fl->label))) { - /* Intentionally ignore fault. */ - } - } + fl1 = fl_intern(net, fl, freq->flr_label); + if (fl1) + goto recheck; - fl_link(np, sfl1, fl); - return 0; + if (!freq->flr_label) { + size_t offset = offsetof(struct in6_flowlabel_req, flr_label); - default: - return -EINVAL; + if (copy_to_sockptr_offset(optval, offset, &fl->label, + sizeof(fl->label))) { + /* Intentionally ignore fault. */ + } } + fl_link(np, sfl1, fl); + return 0; done: fl_free(fl); kfree(sfl1); return err; } +int ipv6_flowlabel_opt(struct sock *sk, sockptr_t optval, int optlen) +{ + struct in6_flowlabel_req freq; + + if (optlen < sizeof(freq)) + return -EINVAL; + if (copy_from_sockptr(&freq, optval, sizeof(freq))) + return -EFAULT; + + switch (freq.flr_action) { + case IPV6_FL_A_PUT: + return ipv6_flowlabel_put(sk, &freq); + case IPV6_FL_A_RENEW: + return ipv6_flowlabel_renew(sk, &freq); + case IPV6_FL_A_GET: + return ipv6_flowlabel_get(sk, &freq, optval, optlen); + default: + return -EINVAL; + } +} + #ifdef CONFIG_PROC_FS struct ip6fl_iter_state { diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c index e0086758b6ee..70c8c2f36c98 100644 --- a/net/ipv6/ip6_icmp.c +++ b/net/ipv6/ip6_icmp.c @@ -9,6 +9,8 @@ #if IS_ENABLED(CONFIG_IPV6) +#if !IS_BUILTIN(CONFIG_IPV6) + static ip6_icmp_send_t __rcu *ip6_icmp_send; int inet6_register_icmp_sender(ip6_icmp_send_t *fn) @@ -37,14 +39,12 @@ void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) rcu_read_lock(); send = rcu_dereference(ip6_icmp_send); - - if (!send) - goto out; - send(skb, type, code, info, NULL); -out: + if (send) + send(skb, type, code, info, NULL); rcu_read_unlock(); } EXPORT_SYMBOL(icmpv6_send); +#endif #if IS_ENABLED(CONFIG_NF_NAT) #include <net/netfilter/nf_conntrack.h> diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 7fbb44736a34..a80f90bf3ae7 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -13,6 +13,8 @@ #include <net/protocol.h> #include <net/ipv6.h> #include <net/inet_common.h> +#include <net/tcp.h> +#include <net/udp.h> #include "ip6_offload.h" @@ -177,10 +179,6 @@ static int ipv6_exthdrs_len(struct ipv6hdr *iph, return len; } -INDIRECT_CALLABLE_DECLARE(struct sk_buff 
*tcp6_gro_receive(struct list_head *, - struct sk_buff *)); -INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp6_gro_receive(struct list_head *, - struct sk_buff *)); INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, struct sk_buff *skb) { @@ -319,8 +317,6 @@ static struct sk_buff *ip4ip6_gro_receive(struct list_head *head, return inet_gro_receive(head, skb); } -INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *, int)); -INDIRECT_CALLABLE_DECLARE(int udp6_gro_complete(struct sk_buff *, int)); INDIRECT_CALLABLE_SCOPE int ipv6_gro_complete(struct sk_buff *skb, int nhoff) { const struct net_offload *ops; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 8a8c2d0cfcc8..c78e67d7747f 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1118,6 +1118,7 @@ out_err_release: /** * ip6_dst_lookup - perform route lookup on flow + * @net: Network namespace to perform lookup in * @sk: socket which provides route info * @dst: pointer to dst_entry * for result * @fl6: flow to lookup @@ -1136,6 +1137,7 @@ EXPORT_SYMBOL_GPL(ip6_dst_lookup); /** * ip6_dst_lookup_flow - perform route lookup on flow with ipsec + * @net: Network namespace to perform lookup in * @sk: socket which provides route info * @fl6: flow to lookup * @final_dst: final destination address for ipsec lookup @@ -1202,11 +1204,11 @@ EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow); * @skb: Packet for which lookup is done * @dev: Tunnel device * @net: Network namespace of tunnel device - * @sk: Socket which provides route info + * @sock: Socket which provides route info * @saddr: Memory to store the src ip address * @info: Tunnel information * @protocol: IP protocol - * @use_cahce: Flag to enable cache usage + * @use_cache: Flag to enable cache usage * This function performs a route lookup on a tunnel * * It returns a valid dst pointer and stores src address to be used in diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index a18c378ca5f4..f635914f42ec 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -124,8 +124,12 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) return &dev->stats; } +#define for_each_ip6_tunnel_rcu(start) \ + for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) + /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses + * @net: network namespace * @link: ifindex of underlying interface * @remote: the address of the tunnel exit-point * @local: the address of the tunnel entry-point @@ -136,9 +140,6 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) * else %NULL **/ -#define for_each_ip6_tunnel_rcu(start) \ - for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) - static struct ip6_tnl * ip6_tnl_lookup(struct net *net, int link, const struct in6_addr *remote, const struct in6_addr *local) @@ -302,8 +303,8 @@ out: /** * ip6_tnl_create - create a new tunnel + * @net: network namespace * @p: tunnel parameters - * @pt: pointer to new tunnel * * Description: * Create tunnel matching given parameters. 
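The for_each_ip6_tunnel_rcu() macro is now defined ahead of its kernel-doc'd users. Note its hidden contract: it expects a variable named t in scope and must run under rcu_read_lock(). A schematic caller, condensed from the lookup pattern ('head' stands in for one of the per-netns hash chains, not a real identifier):

	struct ip6_tnl *t;

	rcu_read_lock();
	for_each_ip6_tunnel_rcu(head) {
		if (ipv6_addr_equal(remote, &t->parms.raddr) &&
		    ipv6_addr_equal(local, &t->parms.laddr) &&
		    (t->dev->flags & IFF_UP))
			break;	/* t is the match; NULL if the loop ran out */
	}
	rcu_read_unlock();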
@@ -351,6 +352,7 @@ failed: /** * ip6_tnl_locate - find or create tunnel matching given parameters + * @net: network namespace * @p: tunnel parameters * @create: != 0 if allowed to create new tunnel if no match found * diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 0d964160a9dd..fac01b80a104 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -491,13 +491,16 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) } dst_hold(dst); - dst = xfrm_lookup(t->net, dst, fl, NULL, 0); + dst = xfrm_lookup_route(t->net, dst, fl, NULL, 0); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; goto tx_err_link_failure; } + if (dst->flags & DST_XFRM_QUEUE) + goto queued; + x = dst->xfrm; if (!vti6_state_check(x, &t->parms.raddr, &t->parms.laddr)) goto tx_err_link_failure; @@ -533,6 +536,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) goto tx_err_dst_release; } +queued: skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); skb_dst_set(skb, dst); skb->dev = skb_dst(skb)->dev; @@ -1219,6 +1223,33 @@ static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = { .priority = 100, }; +#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) +static int vti6_rcv_tunnel(struct sk_buff *skb) +{ + const xfrm_address_t *saddr; + __be32 spi; + + saddr = (const xfrm_address_t *)&ipv6_hdr(skb)->saddr; + spi = xfrm6_tunnel_spi_lookup(dev_net(skb->dev), saddr); + + return vti6_input_proto(skb, IPPROTO_IPV6, spi, 0); +} + +static struct xfrm6_tunnel vti_ipv6_handler __read_mostly = { + .handler = vti6_rcv_tunnel, + .cb_handler = vti6_rcv_cb, + .err_handler = vti6_err, + .priority = 0, +}; + +static struct xfrm6_tunnel vti_ip6ip_handler __read_mostly = { + .handler = vti6_rcv_tunnel, + .cb_handler = vti6_rcv_cb, + .err_handler = vti6_err, + .priority = 0, +}; +#endif + /** * vti6_tunnel_init - register protocol and reserve needed resources * @@ -1244,6 +1275,15 @@ static int __init vti6_tunnel_init(void) err = xfrm6_protocol_register(&vti_ipcomp6_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; +#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) + msg = "ipv6 tunnel"; + err = xfrm6_tunnel_register(&vti_ipv6_handler, AF_INET6); + if (err < 0) + goto vti_tunnel_ipv6_failed; + err = xfrm6_tunnel_register(&vti_ip6ip_handler, AF_INET); + if (err < 0) + goto vti_tunnel_ip6ip_failed; +#endif msg = "netlink interface"; err = rtnl_link_register(&vti6_link_ops); @@ -1253,6 +1293,12 @@ static int __init vti6_tunnel_init(void) return 0; rtnl_link_failed: +#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) + err = xfrm6_tunnel_deregister(&vti_ip6ip_handler, AF_INET); +vti_tunnel_ip6ip_failed: + err = xfrm6_tunnel_deregister(&vti_ipv6_handler, AF_INET6); +vti_tunnel_ipv6_failed: +#endif xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP); xfrm_proto_comp_failed: xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); @@ -1271,6 +1317,10 @@ pernet_dev_failed: static void __exit vti6_tunnel_cleanup(void) { rtnl_link_unregister(&vti6_link_ops); +#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) + xfrm6_tunnel_deregister(&vti_ip6ip_handler, AF_INET); + xfrm6_tunnel_deregister(&vti_ipv6_handler, AF_INET6); +#endif xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP); xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 1f4d20e97c07..06b0d2c329b9 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1629,7 +1629,8 @@ 
EXPORT_SYMBOL(mroute6_is_socket); * MOSPF/PIM router set up we can clean this up. */ -int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen) +int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, + unsigned int optlen) { int ret, parent = 0; struct mif6ctl vif; @@ -1665,7 +1666,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns case MRT6_ADD_MIF: if (optlen < sizeof(vif)) return -EINVAL; - if (copy_from_user(&vif, optval, sizeof(vif))) + if (copy_from_sockptr(&vif, optval, sizeof(vif))) return -EFAULT; if (vif.mif6c_mifi >= MAXMIFS) return -ENFILE; @@ -1678,7 +1679,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns case MRT6_DEL_MIF: if (optlen < sizeof(mifi_t)) return -EINVAL; - if (copy_from_user(&mifi, optval, sizeof(mifi_t))) + if (copy_from_sockptr(&mifi, optval, sizeof(mifi_t))) return -EFAULT; rtnl_lock(); ret = mif6_delete(mrt, mifi, 0, NULL); @@ -1697,7 +1698,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns case MRT6_DEL_MFC_PROXY: if (optlen < sizeof(mfc)) return -EINVAL; - if (copy_from_user(&mfc, optval, sizeof(mfc))) + if (copy_from_sockptr(&mfc, optval, sizeof(mfc))) return -EFAULT; if (parent == 0) parent = mfc.mf6cc_parent; @@ -1718,7 +1719,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns if (optlen != sizeof(flags)) return -EINVAL; - if (get_user(flags, (int __user *)optval)) + if (copy_from_sockptr(&flags, optval, sizeof(flags))) return -EFAULT; rtnl_lock(); mroute_clean_tables(mrt, flags); @@ -1735,7 +1736,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns if (optlen != sizeof(v)) return -EINVAL; - if (get_user(v, (int __user *)optval)) + if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; mrt->mroute_do_assert = v; return 0; @@ -1748,7 +1749,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns if (optlen != sizeof(v)) return -EINVAL; - if (get_user(v, (int __user *)optval)) + if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; v = !!v; rtnl_lock(); @@ -1769,7 +1770,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns if (optlen != sizeof(u32)) return -EINVAL; - if (get_user(v, (u32 __user *)optval)) + if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; /* "pim6reg%u" should not exceed 16 bytes (IFNAMSIZ) */ if (v != RT_TABLE_DEFAULT && v >= 100000000) diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 99668bfebd85..daef890460b7 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -91,6 +91,7 @@ static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) t->props.mode = x->props.mode; memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr)); memcpy(&t->mark, &x->mark, sizeof(t->mark)); + t->if_id = x->if_id; if (xfrm_init_state(t)) goto error; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 76f9e41859a2..43a894bf9a1b 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -136,13 +136,42 @@ static bool setsockopt_needs_rtnl(int optname) return false; } +static int copy_group_source_from_sockptr(struct group_source_req *greqs, + sockptr_t optval, int optlen) +{ + if (in_compat_syscall()) { + struct compat_group_source_req gr32; + + if (optlen < sizeof(gr32)) + return -EINVAL; + if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) + return -EFAULT; + 
greqs->gsr_interface = gr32.gsr_interface; + greqs->gsr_group = gr32.gsr_group; + greqs->gsr_source = gr32.gsr_source; + } else { + if (optlen < sizeof(*greqs)) + return -EINVAL; + if (copy_from_sockptr(greqs, optval, sizeof(*greqs))) + return -EFAULT; + } + + return 0; +} + static int do_ipv6_mcast_group_source(struct sock *sk, int optname, - struct group_source_req *greqs) + sockptr_t optval, int optlen) { + struct group_source_req greqs; int omode, add; + int ret; + + ret = copy_group_source_from_sockptr(&greqs, optval, optlen); + if (ret) + return ret; - if (greqs->gsr_group.ss_family != AF_INET6 || - greqs->gsr_source.ss_family != AF_INET6) + if (greqs.gsr_group.ss_family != AF_INET6 || + greqs.gsr_source.ss_family != AF_INET6) return -EADDRNOTAVAIL; if (optname == MCAST_BLOCK_SOURCE) { @@ -155,8 +184,8 @@ static int do_ipv6_mcast_group_source(struct sock *sk, int optname, struct sockaddr_in6 *psin6; int retv; - psin6 = (struct sockaddr_in6 *)&greqs->gsr_group; - retv = ipv6_sock_mc_join_ssm(sk, greqs->gsr_interface, + psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; + retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface, &psin6->sin6_addr, MCAST_INCLUDE); /* prior join w/ different source is ok */ @@ -168,11 +197,200 @@ static int do_ipv6_mcast_group_source(struct sock *sk, int optname, omode = MCAST_INCLUDE; add = 0; } - return ip6_mc_source(add, omode, sk, greqs); + return ip6_mc_source(add, omode, sk, &greqs); +} + +static int ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, + int optlen) +{ + struct group_filter *gsf; + int ret; + + if (optlen < GROUP_FILTER_SIZE(0)) + return -EINVAL; + if (optlen > sysctl_optmem_max) + return -ENOBUFS; + + gsf = memdup_sockptr(optval, optlen); + if (IS_ERR(gsf)) + return PTR_ERR(gsf); + + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + ret = -ENOBUFS; + if (gsf->gf_numsrc >= 0x1ffffffU || + gsf->gf_numsrc > sysctl_mld_max_msf) + goto out_free_gsf; + + ret = -EINVAL; + if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) + goto out_free_gsf; + + ret = ip6_mc_msfilter(sk, gsf, gsf->gf_slist); +out_free_gsf: + kfree(gsf); + return ret; +} + +static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, + int optlen) +{ + const int size0 = offsetof(struct compat_group_filter, gf_slist); + struct compat_group_filter *gf32; + void *p; + int ret; + int n; + + if (optlen < size0) + return -EINVAL; + if (optlen > sysctl_optmem_max - 4) + return -ENOBUFS; + + p = kmalloc(optlen + 4, GFP_KERNEL); + if (!p) + return -ENOMEM; + + gf32 = p + 4; /* we want ->gf_group and ->gf_slist aligned */ + ret = -EFAULT; + if (copy_from_sockptr(gf32, optval, optlen)) + goto out_free_p; + + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + ret = -ENOBUFS; + n = gf32->gf_numsrc; + if (n >= 0x1ffffffU || n > sysctl_mld_max_msf) + goto out_free_p; + + ret = -EINVAL; + if (offsetof(struct compat_group_filter, gf_slist[n]) > optlen) + goto out_free_p; + + ret = ip6_mc_msfilter(sk, &(struct group_filter){ + .gf_interface = gf32->gf_interface, + .gf_group = gf32->gf_group, + .gf_fmode = gf32->gf_fmode, + .gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist); + +out_free_p: + kfree(p); + return ret; +} + +static int ipv6_mcast_join_leave(struct sock *sk, int optname, + sockptr_t optval, int optlen) +{ + struct sockaddr_in6 *psin6; + struct group_req greq; + + if (optlen < sizeof(greq)) + return -EINVAL; + if (copy_from_sockptr(&greq, optval, sizeof(greq))) + return -EFAULT; + + if (greq.gr_group.ss_family != AF_INET6) + return -EADDRNOTAVAIL; + psin6 = (struct 
sockaddr_in6 *)&greq.gr_group; + if (optname == MCAST_JOIN_GROUP) + return ipv6_sock_mc_join(sk, greq.gr_interface, + &psin6->sin6_addr); + return ipv6_sock_mc_drop(sk, greq.gr_interface, &psin6->sin6_addr); +} + +static int compat_ipv6_mcast_join_leave(struct sock *sk, int optname, + sockptr_t optval, int optlen) +{ + struct compat_group_req gr32; + struct sockaddr_in6 *psin6; + + if (optlen < sizeof(gr32)) + return -EINVAL; + if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) + return -EFAULT; + + if (gr32.gr_group.ss_family != AF_INET6) + return -EADDRNOTAVAIL; + psin6 = (struct sockaddr_in6 *)&gr32.gr_group; + if (optname == MCAST_JOIN_GROUP) + return ipv6_sock_mc_join(sk, gr32.gr_interface, + &psin6->sin6_addr); + return ipv6_sock_mc_drop(sk, gr32.gr_interface, &psin6->sin6_addr); +} + +static int ipv6_set_opt_hdr(struct sock *sk, int optname, sockptr_t optval, + int optlen) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_opt_hdr *new = NULL; + struct net *net = sock_net(sk); + struct ipv6_txoptions *opt; + int err; + + /* hop-by-hop / destination options are privileged option */ + if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW)) + return -EPERM; + + /* remove any sticky options header with a zero option + * length, per RFC3542. + */ + if (optlen > 0) { + if (sockptr_is_null(optval)) + return -EINVAL; + if (optlen < sizeof(struct ipv6_opt_hdr) || + optlen & 0x7 || + optlen > 8 * 255) + return -EINVAL; + + new = memdup_sockptr(optval, optlen); + if (IS_ERR(new)) + return PTR_ERR(new); + if (unlikely(ipv6_optlen(new) > optlen)) { + kfree(new); + return -EINVAL; + } + } + + opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); + opt = ipv6_renew_options(sk, opt, optname, new); + kfree(new); + if (IS_ERR(opt)) + return PTR_ERR(opt); + + /* routing header option needs extra check */ + err = -EINVAL; + if (optname == IPV6_RTHDR && opt && opt->srcrt) { + struct ipv6_rt_hdr *rthdr = opt->srcrt; + switch (rthdr->type) { +#if IS_ENABLED(CONFIG_IPV6_MIP6) + case IPV6_SRCRT_TYPE_2: + if (rthdr->hdrlen != 2 || rthdr->segments_left != 1) + goto sticky_done; + break; +#endif + case IPV6_SRCRT_TYPE_4: + { + struct ipv6_sr_hdr *srh = + (struct ipv6_sr_hdr *)opt->srcrt; + + if (!seg6_validate_srh(srh, optlen, false)) + goto sticky_done; + break; + } + default: + goto sticky_done; + } + } + + err = 0; + opt = ipv6_update_options(sk, opt); +sticky_done: + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } + return err; } static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); @@ -180,11 +398,11 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, int retv = -ENOPROTOOPT; bool needs_rtnl = setsockopt_needs_rtnl(optname); - if (!optval) + if (sockptr_is_null(optval)) val = 0; else { if (optlen >= sizeof(int)) { - if (get_user(val, (int __user *) optval)) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; } else val = 0; @@ -436,82 +654,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: - { - struct ipv6_txoptions *opt; - struct ipv6_opt_hdr *new = NULL; - - /* hop-by-hop / destination options are privileged option */ - retv = -EPERM; - if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW)) - break; - - /* remove any sticky options 
header with a zero option - * length, per RFC3542. - */ - if (optlen == 0) - optval = NULL; - else if (!optval) - goto e_inval; - else if (optlen < sizeof(struct ipv6_opt_hdr) || - optlen & 0x7 || optlen > 8 * 255) - goto e_inval; - else { - new = memdup_user(optval, optlen); - if (IS_ERR(new)) { - retv = PTR_ERR(new); - break; - } - if (unlikely(ipv6_optlen(new) > optlen)) { - kfree(new); - goto e_inval; - } - } - - opt = rcu_dereference_protected(np->opt, - lockdep_sock_is_held(sk)); - opt = ipv6_renew_options(sk, opt, optname, new); - kfree(new); - if (IS_ERR(opt)) { - retv = PTR_ERR(opt); - break; - } - - /* routing header option needs extra check */ - retv = -EINVAL; - if (optname == IPV6_RTHDR && opt && opt->srcrt) { - struct ipv6_rt_hdr *rthdr = opt->srcrt; - switch (rthdr->type) { -#if IS_ENABLED(CONFIG_IPV6_MIP6) - case IPV6_SRCRT_TYPE_2: - if (rthdr->hdrlen != 2 || - rthdr->segments_left != 1) - goto sticky_done; - - break; -#endif - case IPV6_SRCRT_TYPE_4: - { - struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *) - opt->srcrt; - - if (!seg6_validate_srh(srh, optlen, false)) - goto sticky_done; - break; - } - default: - goto sticky_done; - } - } - - retv = 0; - opt = ipv6_update_options(sk, opt); -sticky_done: - if (opt) { - atomic_sub(opt->tot_len, &sk->sk_omem_alloc); - txopt_put(opt); - } + retv = ipv6_set_opt_hdr(sk, optname, optval, optlen); break; - } case IPV6_PKTINFO: { @@ -519,12 +663,13 @@ sticky_done: if (optlen == 0) goto e_inval; - else if (optlen < sizeof(struct in6_pktinfo) || !optval) + else if (optlen < sizeof(struct in6_pktinfo) || + sockptr_is_null(optval)) goto e_inval; - if (copy_from_user(&pkt, optval, sizeof(struct in6_pktinfo))) { - retv = -EFAULT; - break; + if (copy_from_sockptr(&pkt, optval, sizeof(pkt))) { + retv = -EFAULT; + break; } if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex)) goto e_inval; @@ -565,7 +710,7 @@ sticky_done: refcount_set(&opt->refcnt, 1); opt->tot_len = sizeof(*opt) + optlen; retv = -EFAULT; - if (copy_from_user(opt+1, optval, optlen)) + if (copy_from_sockptr(opt + 1, optval, optlen)) goto done; msg.msg_controllen = optlen; @@ -687,7 +832,7 @@ done: break; retv = -EFAULT; - if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq))) + if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq))) break; if (optname == IPV6_ADD_MEMBERSHIP) @@ -705,7 +850,7 @@ done: goto e_inval; retv = -EFAULT; - if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq))) + if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq))) break; if (optname == IPV6_JOIN_ANYCAST) @@ -723,77 +868,26 @@ done: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: - { - struct group_req greq; - struct sockaddr_in6 *psin6; - - if (optlen < sizeof(struct group_req)) - goto e_inval; - - retv = -EFAULT; - if (copy_from_user(&greq, optval, sizeof(struct group_req))) - break; - if (greq.gr_group.ss_family != AF_INET6) { - retv = -EADDRNOTAVAIL; - break; - } - psin6 = (struct sockaddr_in6 *)&greq.gr_group; - if (optname == MCAST_JOIN_GROUP) - retv = ipv6_sock_mc_join(sk, greq.gr_interface, - &psin6->sin6_addr); + if (in_compat_syscall()) + retv = compat_ipv6_mcast_join_leave(sk, optname, optval, + optlen); else - retv = ipv6_sock_mc_drop(sk, greq.gr_interface, - &psin6->sin6_addr); + retv = ipv6_mcast_join_leave(sk, optname, optval, + optlen); break; - } case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: - { - struct group_source_req greqs; - - if (optlen < sizeof(struct group_source_req)) - goto e_inval; - 
if (copy_from_user(&greqs, optval, sizeof(greqs))) { - retv = -EFAULT; - break; - } - retv = do_ipv6_mcast_group_source(sk, optname, &greqs); + retv = do_ipv6_mcast_group_source(sk, optname, optval, optlen); break; - } case MCAST_MSFILTER: - { - struct group_filter *gsf; - - if (optlen < GROUP_FILTER_SIZE(0)) - goto e_inval; - if (optlen > sysctl_optmem_max) { - retv = -ENOBUFS; - break; - } - gsf = memdup_user(optval, optlen); - if (IS_ERR(gsf)) { - retv = PTR_ERR(gsf); - break; - } - /* numsrc >= (4G-140)/128 overflow in 32 bits */ - if (gsf->gf_numsrc >= 0x1ffffffU || - gsf->gf_numsrc > sysctl_mld_max_msf) { - kfree(gsf); - retv = -ENOBUFS; - break; - } - if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) { - kfree(gsf); - retv = -EINVAL; - break; - } - retv = ip6_mc_msfilter(sk, gsf, gsf->gf_slist); - kfree(gsf); - + if (in_compat_syscall()) + retv = compat_ipv6_set_mcast_msfilter(sk, optval, + optlen); + else + retv = ipv6_set_mcast_msfilter(sk, optval, optlen); break; - } case IPV6_ROUTER_ALERT: if (optlen < sizeof(int)) goto e_inval; @@ -872,6 +966,14 @@ done: np->rxopt.bits.recvfragsize = valbool; retv = 0; break; + case IPV6_RECVERR_RFC4884: + if (optlen < sizeof(int)) + goto e_inval; + if (val < 0 || val > 1) + goto e_inval; + np->recverr_rfc4884 = valbool; + retv = 0; + break; } release_sock(sk); @@ -887,8 +989,8 @@ e_inval: return -EINVAL; } -int ipv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) +int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen) { int err; @@ -909,140 +1011,6 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, } EXPORT_SYMBOL(ipv6_setsockopt); -#ifdef CONFIG_COMPAT -int compat_ipv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) -{ - int err; - - if (level == SOL_IP && sk->sk_type != SOCK_RAW) { - if (udp_prot.compat_setsockopt != NULL) - return udp_prot.compat_setsockopt(sk, level, optname, - optval, optlen); - return udp_prot.setsockopt(sk, level, optname, optval, optlen); - } - - if (level != SOL_IPV6) - return -ENOPROTOOPT; - - switch (optname) { - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - { - struct compat_group_req __user *gr32 = (void __user *)optval; - struct group_req greq; - struct sockaddr_in6 *psin6 = (struct sockaddr_in6 *)&greq.gr_group; - - if (optlen < sizeof(struct compat_group_req)) - return -EINVAL; - - if (get_user(greq.gr_interface, &gr32->gr_interface) || - copy_from_user(&greq.gr_group, &gr32->gr_group, - sizeof(greq.gr_group))) - return -EFAULT; - - if (greq.gr_group.ss_family != AF_INET6) - return -EADDRNOTAVAIL; - - rtnl_lock(); - lock_sock(sk); - if (optname == MCAST_JOIN_GROUP) - err = ipv6_sock_mc_join(sk, greq.gr_interface, - &psin6->sin6_addr); - else - err = ipv6_sock_mc_drop(sk, greq.gr_interface, - &psin6->sin6_addr); - release_sock(sk); - rtnl_unlock(); - return err; - } - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - { - struct compat_group_source_req __user *gsr32 = (void __user *)optval; - struct group_source_req greqs; - - if (optlen < sizeof(struct compat_group_source_req)) - return -EINVAL; - - if (get_user(greqs.gsr_interface, &gsr32->gsr_interface) || - copy_from_user(&greqs.gsr_group, &gsr32->gsr_group, - sizeof(greqs.gsr_group)) || - copy_from_user(&greqs.gsr_source, &gsr32->gsr_source, - sizeof(greqs.gsr_source))) - return -EFAULT; - - rtnl_lock(); - lock_sock(sk); - err = 
do_ipv6_mcast_group_source(sk, optname, &greqs); - release_sock(sk); - rtnl_unlock(); - return err; - } - case MCAST_MSFILTER: - { - const int size0 = offsetof(struct compat_group_filter, gf_slist); - struct compat_group_filter *gf32; - void *p; - int n; - - if (optlen < size0) - return -EINVAL; - if (optlen > sysctl_optmem_max - 4) - return -ENOBUFS; - - p = kmalloc(optlen + 4, GFP_KERNEL); - if (!p) - return -ENOMEM; - - gf32 = p + 4; /* we want ->gf_group and ->gf_slist aligned */ - if (copy_from_user(gf32, optval, optlen)) { - err = -EFAULT; - goto mc_msf_out; - } - - n = gf32->gf_numsrc; - /* numsrc >= (4G-140)/128 overflow in 32 bits */ - if (n >= 0x1ffffffU || - n > sysctl_mld_max_msf) { - err = -ENOBUFS; - goto mc_msf_out; - } - if (offsetof(struct compat_group_filter, gf_slist[n]) > optlen) { - err = -EINVAL; - goto mc_msf_out; - } - - rtnl_lock(); - lock_sock(sk); - err = ip6_mc_msfilter(sk, &(struct group_filter){ - .gf_interface = gf32->gf_interface, - .gf_group = gf32->gf_group, - .gf_fmode = gf32->gf_fmode, - .gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist); - release_sock(sk); - rtnl_unlock(); -mc_msf_out: - kfree(p); - return err; - } - } - - err = do_ipv6_setsockopt(sk, level, optname, optval, optlen); -#ifdef CONFIG_NETFILTER - /* we need to exclude all possible ENOPROTOOPTs except default case */ - if (err == -ENOPROTOOPT && optname != IPV6_IPSEC_POLICY && - optname != IPV6_XFRM_POLICY) - err = compat_nf_setsockopt(sk, PF_INET6, optname, optval, - optlen); -#endif - return err; -} -EXPORT_SYMBOL(compat_ipv6_setsockopt); -#endif - static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, int optname, char __user *optval, int len) { @@ -1077,6 +1045,75 @@ static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, return len; } +static int ipv6_get_msfilter(struct sock *sk, void __user *optval, + int __user *optlen, int len) +{ + const int size0 = offsetof(struct group_filter, gf_slist); + struct group_filter __user *p = optval; + struct group_filter gsf; + int num; + int err; + + if (len < size0) + return -EINVAL; + if (copy_from_user(&gsf, p, size0)) + return -EFAULT; + if (gsf.gf_group.ss_family != AF_INET6) + return -EADDRNOTAVAIL; + num = gsf.gf_numsrc; + lock_sock(sk); + err = ip6_mc_msfget(sk, &gsf, p->gf_slist); + if (!err) { + if (num > gsf.gf_numsrc) + num = gsf.gf_numsrc; + if (put_user(GROUP_FILTER_SIZE(num), optlen) || + copy_to_user(p, &gsf, size0)) + err = -EFAULT; + } + release_sock(sk); + return err; +} + +static int compat_ipv6_get_msfilter(struct sock *sk, void __user *optval, + int __user *optlen) +{ + const int size0 = offsetof(struct compat_group_filter, gf_slist); + struct compat_group_filter __user *p = optval; + struct compat_group_filter gf32; + struct group_filter gf; + int len, err; + int num; + + if (get_user(len, optlen)) + return -EFAULT; + if (len < size0) + return -EINVAL; + + if (copy_from_user(&gf32, p, size0)) + return -EFAULT; + gf.gf_interface = gf32.gf_interface; + gf.gf_fmode = gf32.gf_fmode; + num = gf.gf_numsrc = gf32.gf_numsrc; + gf.gf_group = gf32.gf_group; + + if (gf.gf_group.ss_family != AF_INET6) + return -EADDRNOTAVAIL; + + lock_sock(sk); + err = ip6_mc_msfget(sk, &gf, p->gf_slist); + release_sock(sk); + if (err) + return err; + if (num > gf.gf_numsrc) + num = gf.gf_numsrc; + len = GROUP_FILTER_SIZE(num) - (sizeof(gf)-sizeof(gf32)); + if (put_user(len, optlen) || + put_user(gf.gf_fmode, &p->gf_fmode) || + put_user(gf.gf_numsrc, &p->gf_numsrc)) + return -EFAULT; + return 0; +} + static int 
do_ipv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen, unsigned int flags) { @@ -1100,33 +1137,9 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = sk->sk_family; break; case MCAST_MSFILTER: - { - struct group_filter __user *p = (void __user *)optval; - struct group_filter gsf; - const int size0 = offsetof(struct group_filter, gf_slist); - int num; - int err; - - if (len < size0) - return -EINVAL; - if (copy_from_user(&gsf, p, size0)) - return -EFAULT; - if (gsf.gf_group.ss_family != AF_INET6) - return -EADDRNOTAVAIL; - num = gsf.gf_numsrc; - lock_sock(sk); - err = ip6_mc_msfget(sk, &gsf, p->gf_slist); - if (!err) { - if (num > gsf.gf_numsrc) - num = gsf.gf_numsrc; - if (put_user(GROUP_FILTER_SIZE(num), optlen) || - copy_to_user(p, &gsf, size0)) - err = -EFAULT; - } - release_sock(sk); - return err; - } - + if (in_compat_syscall()) + return compat_ipv6_get_msfilter(sk, optval, optlen); + return ipv6_get_msfilter(sk, optval, optlen, len); case IPV6_2292PKTOPTIONS: { struct msghdr msg; @@ -1435,6 +1448,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = np->rtalert_isolate; break; + case IPV6_RECVERR_RFC4884: + val = np->recverr_rfc4884; + break; + default: return -ENOPROTOOPT; } @@ -1474,78 +1491,3 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname, return err; } EXPORT_SYMBOL(ipv6_getsockopt); - -#ifdef CONFIG_COMPAT -int compat_ipv6_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - int err; - - if (level == SOL_IP && sk->sk_type != SOCK_RAW) { - if (udp_prot.compat_getsockopt != NULL) - return udp_prot.compat_getsockopt(sk, level, optname, - optval, optlen); - return udp_prot.getsockopt(sk, level, optname, optval, optlen); - } - - if (level != SOL_IPV6) - return -ENOPROTOOPT; - - if (optname == MCAST_MSFILTER) { - const int size0 = offsetof(struct compat_group_filter, gf_slist); - struct compat_group_filter __user *p = (void __user *)optval; - struct compat_group_filter gf32; - struct group_filter gf; - int ulen, err; - int num; - - if (get_user(ulen, optlen)) - return -EFAULT; - - if (ulen < size0) - return -EINVAL; - - if (copy_from_user(&gf32, p, size0)) - return -EFAULT; - - gf.gf_interface = gf32.gf_interface; - gf.gf_fmode = gf32.gf_fmode; - num = gf.gf_numsrc = gf32.gf_numsrc; - gf.gf_group = gf32.gf_group; - - if (gf.gf_group.ss_family != AF_INET6) - return -EADDRNOTAVAIL; - lock_sock(sk); - err = ip6_mc_msfget(sk, &gf, p->gf_slist); - release_sock(sk); - if (err) - return err; - if (num > gf.gf_numsrc) - num = gf.gf_numsrc; - ulen = GROUP_FILTER_SIZE(num) - (sizeof(gf)-sizeof(gf32)); - if (put_user(ulen, optlen) || - put_user(gf.gf_fmode, &p->gf_fmode) || - put_user(gf.gf_numsrc, &p->gf_numsrc)) - return -EFAULT; - return 0; - } - - err = do_ipv6_getsockopt(sk, level, optname, optval, optlen, - MSG_CMSG_COMPAT); -#ifdef CONFIG_NETFILTER - /* we need to exclude all possible ENOPROTOOPTs except default case */ - if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) { - int len; - - if (get_user(len, optlen)) - return -EFAULT; - - err = compat_nf_getsockopt(sk, PF_INET6, optname, optval, &len); - if (err >= 0) - err = put_user(len, optlen); - } -#endif - return err; -} -EXPORT_SYMBOL(compat_ipv6_getsockopt); -#endif diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index e96a431549bc..2e2119bfcf13 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ 
-960,8 +960,7 @@ static int compat_table_info(const struct xt_table_info *info, } #endif -static int get_info(struct net *net, void __user *user, - const int *len, int compat) +static int get_info(struct net *net, void __user *user, const int *len) { char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; @@ -975,7 +974,7 @@ static int get_info(struct net *net, void __user *user, name[XT_TABLE_MAXNAMELEN-1] = '\0'; #ifdef CONFIG_COMPAT - if (compat) + if (in_compat_syscall()) xt_compat_lock(AF_INET6); #endif t = xt_request_find_table_lock(net, AF_INET6, name); @@ -985,7 +984,7 @@ static int get_info(struct net *net, void __user *user, #ifdef CONFIG_COMPAT struct xt_table_info tmp; - if (compat) { + if (in_compat_syscall()) { ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(AF_INET6); private = &tmp; @@ -1011,7 +1010,7 @@ static int get_info(struct net *net, void __user *user, } else ret = PTR_ERR(t); #ifdef CONFIG_COMPAT - if (compat) + if (in_compat_syscall()) xt_compat_unlock(AF_INET6); #endif return ret; @@ -1120,7 +1119,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, } static int -do_replace(struct net *net, const void __user *user, unsigned int len) +do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct ip6t_replace tmp; @@ -1128,7 +1127,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len) void *loc_cpu_entry; struct ip6t_entry *iter; - if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ @@ -1144,8 +1143,8 @@ do_replace(struct net *net, const void __user *user, unsigned int len) return -ENOMEM; loc_cpu_entry = newinfo->entries; - if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), - tmp.size) != 0) { + if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), + tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } @@ -1169,8 +1168,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len) } static int -do_add_counters(struct net *net, const void __user *user, unsigned int len, - int compat) +do_add_counters(struct net *net, sockptr_t arg, unsigned int len) { unsigned int i; struct xt_counters_info tmp; @@ -1181,7 +1179,7 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len, struct ip6t_entry *iter; unsigned int addend; - paddc = xt_copy_counters_from_user(user, len, &tmp, compat); + paddc = xt_copy_counters(arg, len, &tmp); if (IS_ERR(paddc)) return PTR_ERR(paddc); t = xt_find_table_lock(net, AF_INET6, tmp.name); @@ -1495,7 +1493,7 @@ out_unlock: } static int -compat_do_replace(struct net *net, void __user *user, unsigned int len) +compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct compat_ip6t_replace tmp; @@ -1503,7 +1501,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) void *loc_cpu_entry; struct ip6t_entry *iter; - if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ @@ -1519,8 +1517,8 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return -ENOMEM; loc_cpu_entry = newinfo->entries; - if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), - tmp.size) != 0) { + if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), + tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } @@ -1543,31 +1541,6 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return ret; } 
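The do_replace() and do_add_counters() conversions above rely on the sockptr_t type that this series threads through every setsockopt path. As a rough sketch (simplified from include/linux/sockptr.h, not the verbatim header), the abstraction is a tagged pointer that lets one handler accept either a kernel or a user buffer:

typedef struct {
	union {
		void		*kernel;
		void __user	*user;
	};
	bool		is_kernel : 1;
} sockptr_t;

static inline bool sockptr_is_kernel(sockptr_t sockptr)
{
	return sockptr.is_kernel;
}

static inline sockptr_t USER_SOCKPTR(void __user *p)
{
	return (sockptr_t) { .user = p };
}

static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size)
{
	/* One helper serves both callers: syscall paths pass a
	 * USER_SOCKPTR(), in-kernel users a kernel-tagged pointer.
	 */
	if (!sockptr_is_kernel(src))
		return copy_from_user(dst, src.user, size) ? -EFAULT : 0;
	memcpy(dst, src.kernel, size);
	return 0;
}

Because the same handler now receives either pointer flavour, the dedicated compat_* sockopt entry points removed below become unnecessary; where the 32-bit structure layout genuinely differs, the native handler simply branches on in_compat_syscall(), as the converted do_ip6t_set_ctl() further down shows.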
-static int -compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, - unsigned int len) -{ - int ret; - - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - switch (cmd) { - case IP6T_SO_SET_REPLACE: - ret = compat_do_replace(sock_net(sk), user, len); - break; - - case IP6T_SO_SET_ADD_COUNTERS: - ret = do_add_counters(sock_net(sk), user, len, 1); - break; - - default: - ret = -EINVAL; - } - - return ret; -} - struct compat_ip6t_get_entries { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; @@ -1643,33 +1616,10 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, xt_compat_unlock(AF_INET6); return ret; } - -static int do_ip6t_get_ctl(struct sock *, int, void __user *, int *); - -static int -compat_do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) -{ - int ret; - - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - switch (cmd) { - case IP6T_SO_GET_INFO: - ret = get_info(sock_net(sk), user, len, 1); - break; - case IP6T_SO_GET_ENTRIES: - ret = compat_get_entries(sock_net(sk), user, len); - break; - default: - ret = do_ip6t_get_ctl(sk, cmd, user, len); - } - return ret; -} #endif static int -do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +do_ip6t_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len) { int ret; @@ -1678,11 +1628,16 @@ do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) switch (cmd) { case IP6T_SO_SET_REPLACE: - ret = do_replace(sock_net(sk), user, len); +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = compat_do_replace(sock_net(sk), arg, len); + else +#endif + ret = do_replace(sock_net(sk), arg, len); break; case IP6T_SO_SET_ADD_COUNTERS: - ret = do_add_counters(sock_net(sk), user, len, 0); + ret = do_add_counters(sock_net(sk), arg, len); break; default: @@ -1702,11 +1657,16 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) switch (cmd) { case IP6T_SO_GET_INFO: - ret = get_info(sock_net(sk), user, len, 0); + ret = get_info(sock_net(sk), user, len); break; case IP6T_SO_GET_ENTRIES: - ret = get_entries(sock_net(sk), user, len); +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = compat_get_entries(sock_net(sk), user, len); + else +#endif + ret = get_entries(sock_net(sk), user, len); break; case IP6T_SO_GET_REVISION_MATCH: @@ -1897,15 +1857,9 @@ static struct nf_sockopt_ops ip6t_sockopts = { .set_optmin = IP6T_BASE_CTL, .set_optmax = IP6T_SO_SET_MAX+1, .set = do_ip6t_set_ctl, -#ifdef CONFIG_COMPAT - .compat_set = compat_do_ip6t_set_ctl, -#endif .get_optmin = IP6T_BASE_CTL, .get_optmax = IP6T_SO_GET_MAX+1, .get = do_ip6t_get_ctl, -#ifdef CONFIG_COMPAT - .compat_get = compat_do_ip6t_get_ctl, -#endif .owner = THIS_MODULE, }; diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c index 4e15a14435e4..70da2f2ce064 100644 --- a/net/ipv6/netfilter/ip6t_ah.c +++ b/net/ipv6/netfilter/ip6t_ah.c @@ -74,8 +74,7 @@ static bool ah_mt6(const struct sk_buff *skb, struct xt_action_param *par) ahinfo->hdrres, ah->reserved, !(ahinfo->hdrres && ah->reserved)); - return (ah != NULL) && - spi_match(ahinfo->spis[0], ahinfo->spis[1], + return spi_match(ahinfo->spis[0], ahinfo->spis[1], ntohl(ah->spi), !!(ahinfo->invflags & IP6T_AH_INV_SPI)) && (!ahinfo->hdrlen || diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c index fb91eeee4a1e..3aad6439386b 100644 --- a/net/ipv6/netfilter/ip6t_frag.c +++ b/net/ipv6/netfilter/ip6t_frag.c @@ -85,8 
+85,7 @@ frag_mt6(const struct sk_buff *skb, struct xt_action_param *par) !((fraginfo->flags & IP6T_FRAG_NMF) && (ntohs(fh->frag_off) & IP6_MF))); - return (fh != NULL) && - id_match(fraginfo->ids[0], fraginfo->ids[1], + return id_match(fraginfo->ids[0], fraginfo->ids[1], ntohl(fh->identification), !!(fraginfo->invflags & IP6T_FRAG_INV_IDS)) && !((fraginfo->flags & IP6T_FRAG_RES) && diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c index 467b2a86031b..e7a3fb9355ee 100644 --- a/net/ipv6/netfilter/ip6t_hbh.c +++ b/net/ipv6/netfilter/ip6t_hbh.c @@ -86,8 +86,7 @@ hbh_mt6(const struct sk_buff *skb, struct xt_action_param *par) ((optinfo->hdrlen == hdrlen) ^ !!(optinfo->invflags & IP6T_OPTS_INV_LEN)))); - ret = (oh != NULL) && - (!(optinfo->flags & IP6T_OPTS_LEN) || + ret = (!(optinfo->flags & IP6T_OPTS_LEN) || ((optinfo->hdrlen == hdrlen) ^ !!(optinfo->invflags & IP6T_OPTS_INV_LEN))); diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c index f633dc84ca3f..733c83d38b30 100644 --- a/net/ipv6/netfilter/ip6t_rt.c +++ b/net/ipv6/netfilter/ip6t_rt.c @@ -89,8 +89,7 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) !((rtinfo->flags & IP6T_RT_RES) && (((const struct rt0_hdr *)rh)->reserved))); - ret = (rh != NULL) && - (segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1], + ret = (segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1], rh->segments_left, !!(rtinfo->invflags & IP6T_RT_INV_SGS))) && (!(rtinfo->flags & IP6T_RT_LEN) || diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 5fae66f66671..4aef6baaa55e 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -126,6 +126,21 @@ void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, } EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_put); +static int nf_reject6_fill_skb_dst(struct sk_buff *skb_in) +{ + struct dst_entry *dst = NULL; + struct flowi fl; + + memset(&fl, 0, sizeof(struct flowi)); + fl.u.ip6.daddr = ipv6_hdr(skb_in)->saddr; + nf_ip6_route(dev_net(skb_in->dev), &dst, &fl, false); + if (!dst) + return -1; + + skb_dst_set(skb_in, dst); + return 0; +} + void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) { struct net_device *br_indev __maybe_unused; @@ -154,6 +169,14 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) fl6.daddr = oip6h->saddr; fl6.fl6_sport = otcph->dest; fl6.fl6_dport = otcph->source; + + if (hook == NF_INET_PRE_ROUTING) { + nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false); + if (!dst) + return; + skb_dst_set(oldskb, dst); + } + fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev); fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark); security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6)); @@ -245,6 +268,9 @@ void nf_send_unreach6(struct net *net, struct sk_buff *skb_in, if (hooknum == NF_INET_LOCAL_OUT && skb_in->dev == NULL) skb_in->dev = net->loopback_dev; + if (hooknum == NF_INET_PRE_ROUTING && nf_reject6_fill_skb_dst(skb_in)) + return; + icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0); } EXPORT_SYMBOL_GPL(nf_send_unreach6); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 98ac32b49d8c..6caa062f68e7 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -114,6 +114,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); ipcm6_init_sk(&ipc6, np); + ipc6.sockc.mark = sk->sk_mark; fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); 
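	/* The sockc.mark assignment added above seeds the cmsg state with
	 * sk->sk_mark, so ICMPv6 ping sockets now honour the socket mark
	 * in the flow and route lookup by default, matching the existing
	 * UDP sendmsg behaviour.
	 */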
dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr, false); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 8ef5a7b30524..874f01cd7aec 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -972,13 +972,13 @@ do_confirm: } static int rawv6_seticmpfilter(struct sock *sk, int level, int optname, - char __user *optval, int optlen) + sockptr_t optval, int optlen) { switch (optname) { case ICMPV6_FILTER: if (optlen > sizeof(struct icmp6_filter)) optlen = sizeof(struct icmp6_filter); - if (copy_from_user(&raw6_sk(sk)->filter, optval, optlen)) + if (copy_from_sockptr(&raw6_sk(sk)->filter, optval, optlen)) return -EFAULT; return 0; default: @@ -1015,12 +1015,12 @@ static int rawv6_geticmpfilter(struct sock *sk, int level, int optname, static int do_rawv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct raw6_sock *rp = raw6_sk(sk); int val; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (optname) { @@ -1062,7 +1062,7 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname, } static int rawv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { switch (level) { case SOL_RAW: @@ -1084,30 +1084,6 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname, return do_rawv6_setsockopt(sk, level, optname, optval, optlen); } -#ifdef CONFIG_COMPAT -static int compat_rawv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) -{ - switch (level) { - case SOL_RAW: - break; - case SOL_ICMPV6: - if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) - return -EOPNOTSUPP; - return rawv6_seticmpfilter(sk, level, optname, optval, optlen); - case SOL_IPV6: - if (optname == IPV6_CHECKSUM || - optname == IPV6_HDRINCL) - break; - fallthrough; - default: - return compat_ipv6_setsockopt(sk, level, optname, - optval, optlen); - } - return do_rawv6_setsockopt(sk, level, optname, optval, optlen); -} -#endif - static int do_rawv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -1169,30 +1145,6 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname, return do_rawv6_getsockopt(sk, level, optname, optval, optlen); } -#ifdef CONFIG_COMPAT -static int compat_rawv6_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - switch (level) { - case SOL_RAW: - break; - case SOL_ICMPV6: - if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) - return -EOPNOTSUPP; - return rawv6_geticmpfilter(sk, level, optname, optval, optlen); - case SOL_IPV6: - if (optname == IPV6_CHECKSUM || - optname == IPV6_HDRINCL) - break; - fallthrough; - default: - return compat_ipv6_getsockopt(sk, level, optname, - optval, optlen); - } - return do_rawv6_getsockopt(sk, level, optname, optval, optlen); -} -#endif - static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg) { switch (cmd) { @@ -1297,8 +1249,6 @@ struct proto rawv6_prot = { .usersize = sizeof_field(struct raw6_sock, filter), .h.raw_hash = &raw_v6_hashinfo, #ifdef CONFIG_COMPAT - .compat_setsockopt = compat_rawv6_setsockopt, - .compat_getsockopt = compat_rawv6_getsockopt, .compat_ioctl = compat_rawv6_ioctl, #endif .diag_destroy = raw_abort, @@ -1378,8 +1328,6 @@ const struct proto_ops inet6_sockraw_ops = { .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = 
inet6_compat_ioctl, - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, #endif }; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 4c36bd0c7930..5e7e25e2523a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -61,6 +61,7 @@ #include <net/l3mdev.h> #include <net/ip.h> #include <linux/uaccess.h> +#include <linux/btf_ids.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> @@ -1210,7 +1211,7 @@ fallback: return nrt; } -static struct rt6_info *ip6_pol_route_lookup(struct net *net, +INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, @@ -2277,7 +2278,7 @@ out: } EXPORT_SYMBOL_GPL(ip6_pol_route); -static struct rt6_info *ip6_pol_route_input(struct net *net, +INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, @@ -2468,7 +2469,7 @@ void ip6_route_input(struct sk_buff *skb) &fl6, skb, flags)); } -static struct rt6_info *ip6_pol_route_output(struct net *net, +INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, @@ -2915,7 +2916,7 @@ struct ip6rd_flowi { struct in6_addr gateway; }; -static struct rt6_info *__ip6_route_redirect(struct net *net, +INDIRECT_CALLABLE_SCOPE struct rt6_info *__ip6_route_redirect(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, @@ -6423,21 +6424,29 @@ void __init ip6_route_init_special_entries(void) #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt) -static const struct bpf_iter_reg ipv6_route_reg_info = { - .target = "ipv6_route", +BTF_ID_LIST(btf_fib6_info_id) +BTF_ID(struct, fib6_info) + +static const struct bpf_iter_seq_info ipv6_route_seq_info = { .seq_ops = &ipv6_route_seq_ops, .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct ipv6_route_iter), +}; + +static struct bpf_iter_reg ipv6_route_reg_info = { + .target = "ipv6_route", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__ipv6_route, rt), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &ipv6_route_seq_info, }; static int __init bpf_iter_register(void) { + ipv6_route_reg_info.ctx_arg_info[0].btf_id = *btf_fib6_info_id; return bpf_iter_reg_target(&ipv6_route_reg_info); } diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index c3ececd7cfc1..5fdf3ebb953f 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -136,8 +136,7 @@ static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, oldhdr = ipv6_hdr(skb); - buf = kzalloc(ipv6_rpl_srh_alloc_size(srh->segments_left - 1) * 2, - GFP_ATOMIC); + buf = kcalloc(struct_size(srh, segments.addr, srh->segments_left), 2, GFP_ATOMIC); if (!buf) return -ENOMEM; diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index e0e9f48ab14f..897fa59c47de 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -27,6 +27,23 @@ #include <net/seg6_hmac.h> #endif +static size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo) +{ + int head = 0; + + switch (tuninfo->mode) { + case SEG6_IPTUN_MODE_INLINE: + break; + case SEG6_IPTUN_MODE_ENCAP: + head = sizeof(struct ipv6hdr); + break; + case SEG6_IPTUN_MODE_L2ENCAP: + return 0; + } + + return 
((tuninfo->srh->hdrlen + 1) << 3) + head; +} + struct seg6_lwt { struct dst_cache cache; struct seg6_iptunnel_encap tuninfo[]; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 13235a012388..e796a64be308 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -170,7 +170,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk, false); + req = cookie_tcp_reqsk_alloc(&tcp6_request_sock_ops, sk, skb); if (!req) goto out; @@ -178,9 +178,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) treq = tcp_rsk(req); treq->tfo_listener = false; - if (IS_ENABLED(CONFIG_MPTCP)) - treq->is_mptcp = 0; - if (security_inet_conn_request(sk, skb, req)) goto out_free; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f67d45ff00b4..305870a72352 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -567,7 +567,7 @@ static struct tcp_md5sig_key *tcp_v6_md5_lookup(const struct sock *sk, } static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, - char __user *optval, int optlen) + sockptr_t optval, int optlen) { struct tcp_md5sig cmd; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr; @@ -577,7 +577,7 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, if (optlen < sizeof(cmd)) return -EINVAL; - if (copy_from_user(&cmd, optval, sizeof(cmd))) + if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) return -EFAULT; if (sin6->sin6_family != AF_INET6) @@ -1811,6 +1811,13 @@ static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_destructor = tcp_twsk_destructor, }; +INDIRECT_CALLABLE_SCOPE void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + + __tcp_v6_send_check(skb, &np->saddr, &sk->sk_v6_daddr); +} + const struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, @@ -1824,10 +1831,6 @@ const struct inet_connection_sock_af_ops ipv6_specific = { .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_ipv6_setsockopt, - .compat_getsockopt = compat_ipv6_getsockopt, -#endif .mtu_reduced = tcp_v6_mtu_reduced, }; @@ -1854,10 +1857,6 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = { .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_ipv6_setsockopt, - .compat_getsockopt = compat_ipv6_getsockopt, -#endif .mtu_reduced = tcp_v4_mtu_reduced, }; @@ -2115,10 +2114,6 @@ struct proto tcpv6_prot = { .rsk_prot = &tcp6_request_sock_ops, .h.hashinfo = &tcp_hashinfo, .no_autobind = true, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_tcp_setsockopt, - .compat_getsockopt = compat_tcp_getsockopt, -#endif .diag_destroy = tcp_abort, }; EXPORT_SYMBOL_GPL(tcpv6_prot); diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index 06c02ebe6b9b..00e8d8b1c9a7 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -155,6 +155,33 @@ drop: return 0; } +#if IS_ENABLED(CONFIG_INET6_XFRM_TUNNEL) +static int tunnel6_rcv_cb(struct sk_buff *skb, u8 proto, int err) +{ + struct xfrm6_tunnel __rcu *head; + struct xfrm6_tunnel *handler; + int ret; + + head = (proto == IPPROTO_IPV6) ? 
tunnel6_handlers : tunnel46_handlers; + + for_each_tunnel_rcu(head, handler) { + if (handler->cb_handler) { + ret = handler->cb_handler(skb, err); + if (ret <= 0) + return ret; + } + } + + return 0; +} + +static const struct xfrm_input_afinfo tunnel6_input_afinfo = { + .family = AF_INET6, + .is_ipip = true, + .callback = tunnel6_rcv_cb, +}; +#endif + static int tunnel46_rcv(struct sk_buff *skb) { struct xfrm6_tunnel *handler; @@ -245,11 +272,25 @@ static int __init tunnel6_init(void) inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP); return -EAGAIN; } +#if IS_ENABLED(CONFIG_INET6_XFRM_TUNNEL) + if (xfrm_input_register_afinfo(&tunnel6_input_afinfo)) { + pr_err("%s: can't add input afinfo\n", __func__); + inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6); + inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP); + if (xfrm6_tunnel_mpls_supported()) + inet6_del_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS); + return -EAGAIN; + } +#endif return 0; } static void __exit tunnel6_fini(void) { +#if IS_ENABLED(CONFIG_INET6_XFRM_TUNNEL) + if (xfrm_input_unregister_afinfo(&tunnel6_input_afinfo)) + pr_err("%s: can't remove input afinfo\n", __func__); +#endif if (inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP)) pr_err("%s: can't remove protocol\n", __func__); if (inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6)) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index a8d74f44056a..29d9691359b9 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -141,6 +141,24 @@ static int compute_score(struct sock *sk, struct net *net, return score; } +static struct sock *lookup_reuseport(struct net *net, struct sock *sk, + struct sk_buff *skb, + const struct in6_addr *saddr, + __be16 sport, + const struct in6_addr *daddr, + unsigned int hnum) +{ + struct sock *reuse_sk = NULL; + u32 hash; + + if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) { + hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); + reuse_sk = reuseport_select_sock(sk, hash, skb, + sizeof(struct udphdr)); + } + return reuse_sk; +} + /* called with rcu_read_lock() */ static struct sock *udp6_lib_lookup2(struct net *net, const struct in6_addr *saddr, __be16 sport, @@ -148,9 +166,8 @@ static struct sock *udp6_lib_lookup2(struct net *net, int dif, int sdif, struct udp_hslot *hslot2, struct sk_buff *skb) { - struct sock *sk, *result, *reuseport_result; + struct sock *sk, *result; int score, badness; - u32 hash = 0; result = NULL; badness = -1; @@ -158,26 +175,44 @@ static struct sock *udp6_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { - reuseport_result = NULL; + result = lookup_reuseport(net, sk, skb, + saddr, sport, daddr, hnum); + /* Fall back to scoring if group has connections */ + if (result && !reuseport_has_conns(sk, false)) + return result; - if (sk->sk_reuseport && - sk->sk_state != TCP_ESTABLISHED) { - hash = udp6_ehashfn(net, daddr, hnum, - saddr, sport); - - reuseport_result = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); - if (reuseport_result && !reuseport_has_conns(sk, false)) - return reuseport_result; - } - - result = reuseport_result ? : sk; + result = result ? 
: sk; badness = score; } } return result; } +static inline struct sock *udp6_lookup_run_bpf(struct net *net, + struct udp_table *udptable, + struct sk_buff *skb, + const struct in6_addr *saddr, + __be16 sport, + const struct in6_addr *daddr, + u16 hnum) +{ + struct sock *sk, *reuse_sk; + bool no_reuseport; + + if (udptable != &udp_table) + return NULL; /* only UDP is supported */ + + no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_UDP, + saddr, sport, daddr, hnum, &sk); + if (no_reuseport || IS_ERR_OR_NULL(sk)) + return sk; + + reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); + if (reuse_sk) + sk = reuse_sk; + return sk; +} + /* rcu_read_lock() must be held */ struct sock *__udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, @@ -188,25 +223,42 @@ struct sock *__udp6_lib_lookup(struct net *net, unsigned short hnum = ntohs(dport); unsigned int hash2, slot2; struct udp_hslot *hslot2; - struct sock *result; + struct sock *result, *sk; hash2 = ipv6_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; + /* Lookup connected or non-wildcard sockets */ result = udp6_lib_lookup2(net, saddr, sport, daddr, hnum, dif, sdif, hslot2, skb); - if (!result) { - hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); - slot2 = hash2 & udptable->mask; + if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED) + goto done; + + /* Lookup redirect from BPF */ + if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { + sk = udp6_lookup_run_bpf(net, udptable, skb, + saddr, sport, daddr, hnum); + if (sk) { + result = sk; + goto done; + } + } - hslot2 = &udptable->hash2[slot2]; + /* Got non-wildcard socket or error on first lookup */ + if (result) + goto done; - result = udp6_lib_lookup2(net, saddr, sport, - &in6addr_any, hnum, dif, sdif, - hslot2, skb); - } + /* Lookup wildcard sockets */ + hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); + slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; + + result = udp6_lib_lookup2(net, saddr, sport, + &in6addr_any, hnum, dif, sdif, + hslot2, skb); +done: if (IS_ERR(result)) return NULL; return result; @@ -1062,6 +1114,9 @@ static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr, * @sk: socket we are sending on * @skb: sk_buff containing the filled-in UDP header * (checksum field must be zeroed out) + * @saddr: source address + * @daddr: destination address + * @len: length of packet */ static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, const struct in6_addr *saddr, @@ -1561,26 +1616,16 @@ void udpv6_destroy_sock(struct sock *sk) /* * Socket option code for UDP */ -int udpv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) +int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen) { if (level == SOL_UDP || level == SOL_UDPLITE) - return udp_lib_setsockopt(sk, level, optname, optval, optlen, + return udp_lib_setsockopt(sk, level, optname, + optval, optlen, udp_v6_push_pending_frames); return ipv6_setsockopt(sk, level, optname, optval, optlen); } -#ifdef CONFIG_COMPAT -int compat_udpv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) -{ - if (level == SOL_UDP || level == SOL_UDPLITE) - return udp_lib_setsockopt(sk, level, optname, optval, optlen, - udp_v6_push_pending_frames); - return compat_ipv6_setsockopt(sk, level, optname, optval, optlen); -} -#endif - int udpv6_getsockopt(struct sock 
*sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -1589,16 +1634,6 @@ int udpv6_getsockopt(struct sock *sk, int level, int optname, return ipv6_getsockopt(sk, level, optname, optval, optlen); } -#ifdef CONFIG_COMPAT -int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - if (level == SOL_UDP || level == SOL_UDPLITE) - return udp_lib_getsockopt(sk, level, optname, optval, optlen); - return compat_ipv6_getsockopt(sk, level, optname, optval, optlen); -} -#endif - /* thinking of making this const? Don't. * early_demux can change based on sysctl. */ @@ -1681,10 +1716,6 @@ struct proto udpv6_prot = { .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp6_sock), .h.udp_table = &udp_table, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_udpv6_setsockopt, - .compat_getsockopt = compat_udpv6_getsockopt, -#endif .diag_destroy = udp_abort, }; diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index 20e324b6f358..b2fcc46c1630 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -17,14 +17,8 @@ void udp_v6_rehash(struct sock *sk); int udpv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); -int udpv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); -#ifdef CONFIG_COMPAT -int compat_udpv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); -int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen); -#endif +int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen); int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len); diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index bf7a7acd39b1..fbb700d3f437 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -52,10 +52,6 @@ struct proto udplitev6_prot = { .sysctl_mem = sysctl_udp_mem, .obj_size = sizeof(struct udp6_sock), .h.udp_table = &udplite_table, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_udpv6_setsockopt, - .compat_getsockopt = compat_udpv6_getsockopt, -#endif }; static struct inet_protosw udplite6_protosw = { diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index ee0add15497d..6ee9851ac7c6 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1494,7 +1494,7 @@ static int iucv_sock_release(struct socket *sock) /* getsockopt and setsockopt */ static int iucv_sock_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct iucv_sock *iucv = iucv_sk(sk); @@ -1507,7 +1507,7 @@ static int iucv_sock_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *) optval)) + if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; rc = 0; diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 56fac24a627a..56dad9565bc9 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1265,7 +1265,7 @@ static void kcm_recv_enable(struct kcm_sock *kcm) } static int kcm_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct kcm_sock *kcm = kcm_sk(sock->sk); 
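	/* Note that the kcm_setsockopt() conversion below is also a small
	 * bug fix: a failed copy of the option value now returns -EFAULT,
	 * where the old get_user() path wrongly returned -EINVAL.
	 */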
int val, valbool; @@ -1277,8 +1277,8 @@ static int kcm_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) - return -EINVAL; + if (copy_from_sockptr(&val, optval, sizeof(int))) + return -EFAULT; valbool = val ? 1 : 0; diff --git a/net/key/af_key.c b/net/key/af_key.c index a915bc86620a..c12dbc51ef5f 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3741,8 +3741,6 @@ static const struct proto_ops pfkey_ops = { .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 6434d17e6e8e..701fc72ad9f4 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* - * L2TP core. +/* L2TP core. * * Copyright (c) 2008,2009,2010 Katalix Systems Ltd * @@ -94,7 +93,7 @@ struct l2tp_skb_cb { unsigned long expires; }; -#define L2TP_SKB_CB(skb) ((struct l2tp_skb_cb *) &skb->cb[sizeof(struct inet_skb_parm)]) +#define L2TP_SKB_CB(skb) ((struct l2tp_skb_cb *)&(skb)->cb[sizeof(struct inet_skb_parm)]) static struct workqueue_struct *l2tp_wq; @@ -102,8 +101,10 @@ static struct workqueue_struct *l2tp_wq; static unsigned int l2tp_net_id; struct l2tp_net { struct list_head l2tp_tunnel_list; + /* Lock for write access to l2tp_tunnel_list */ spinlock_t l2tp_tunnel_list_lock; struct hlist_head l2tp_session_hlist[L2TP_HASH_SIZE_2]; + /* Lock for write access to l2tp_session_hlist */ spinlock_t l2tp_session_hlist_lock; }; @@ -134,7 +135,6 @@ static inline struct hlist_head * l2tp_session_id_hash_2(struct l2tp_net *pn, u32 session_id) { return &pn->l2tp_session_hlist[hash_32(session_id, L2TP_HASH_BITS_2)]; - } /* Session hash list. @@ -149,12 +149,51 @@ l2tp_session_id_hash(struct l2tp_tunnel *tunnel, u32 session_id) return &tunnel->session_hlist[hash_32(session_id, L2TP_HASH_BITS)]; } -void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) +static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) { sock_put(tunnel->sock); /* the tunnel is freed in the socket destructor */ } -EXPORT_SYMBOL(l2tp_tunnel_free); + +static void l2tp_session_free(struct l2tp_session *session) +{ + struct l2tp_tunnel *tunnel = session->tunnel; + + if (tunnel) { + if (WARN_ON(tunnel->magic != L2TP_TUNNEL_MAGIC)) + goto out; + l2tp_tunnel_dec_refcount(tunnel); + } + +out: + kfree(session); +} + +void l2tp_tunnel_inc_refcount(struct l2tp_tunnel *tunnel) +{ + refcount_inc(&tunnel->ref_count); +} +EXPORT_SYMBOL_GPL(l2tp_tunnel_inc_refcount); + +void l2tp_tunnel_dec_refcount(struct l2tp_tunnel *tunnel) +{ + if (refcount_dec_and_test(&tunnel->ref_count)) + l2tp_tunnel_free(tunnel); +} +EXPORT_SYMBOL_GPL(l2tp_tunnel_dec_refcount); + +void l2tp_session_inc_refcount(struct l2tp_session *session) +{ + refcount_inc(&session->ref_count); +} +EXPORT_SYMBOL_GPL(l2tp_session_inc_refcount); + +void l2tp_session_dec_refcount(struct l2tp_session *session) +{ + if (refcount_dec_and_test(&session->ref_count)) + l2tp_session_free(session); +} +EXPORT_SYMBOL_GPL(l2tp_session_dec_refcount); /* Lookup a tunnel. A new reference is held on the returned tunnel. 
*/ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id) @@ -412,7 +451,7 @@ static void l2tp_recv_dequeue_skb(struct l2tp_session *session, struct sk_buff * } /* call private receive handler */ - if (session->recv_skb != NULL) + if (session->recv_skb) (*session->recv_skb)(session, skb, L2TP_SKB_CB(skb)->length); else kfree_skb(skb); @@ -621,8 +660,8 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, int length) { struct l2tp_tunnel *tunnel = session->tunnel; + u32 ns = 0, nr = 0; int offset; - u32 ns, nr; /* Parse and check optional cookie */ if (session->peer_cookie_len > 0) { @@ -644,13 +683,12 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, * the control of the LNS. If no sequence numbers present but * we were expecting them, discard frame. */ - ns = nr = 0; L2TP_SKB_CB(skb)->has_seq = 0; if (tunnel->version == L2TP_HDR_VER_2) { if (hdrflags & L2TP_HDRFLAG_S) { - ns = ntohs(*(__be16 *) ptr); + ns = ntohs(*(__be16 *)ptr); ptr += 2; - nr = ntohs(*(__be16 *) ptr); + nr = ntohs(*(__be16 *)ptr); ptr += 2; /* Store L2TP info in the skb */ @@ -662,7 +700,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, session->name, ns, nr, session->nr); } } else if (session->l2specific_type == L2TP_L2SPECTYPE_DEFAULT) { - u32 l2h = ntohl(*(__be32 *) ptr); + u32 l2h = ntohl(*(__be32 *)ptr); if (l2h & 0x40000000) { ns = l2h & 0x00ffffff; @@ -679,11 +717,11 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, } if (L2TP_SKB_CB(skb)->has_seq) { - /* Received a packet with sequence numbers. If we're the LNS, + /* Received a packet with sequence numbers. If we're the LAC, * check if we sre sending sequence numbers and if not, * configure it so. */ - if ((!session->lns_mode) && (!session->send_seq)) { + if (!session->lns_mode && !session->send_seq) { l2tp_info(session, L2TP_MSG_SEQ, "%s: requested to enable seq numbers by LNS\n", session->name); @@ -707,7 +745,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, * If we're the LNS and we're sending sequence numbers, the * LAC is broken. Discard the frame. */ - if ((!session->lns_mode) && (session->send_seq)) { + if (!session->lns_mode && session->send_seq) { l2tp_info(session, L2TP_MSG_SEQ, "%s: requested to disable seq numbers by LNS\n", session->name); @@ -770,20 +808,21 @@ discard: atomic_long_inc(&session->stats.rx_errors); kfree_skb(skb); } -EXPORT_SYMBOL(l2tp_recv_common); +EXPORT_SYMBOL_GPL(l2tp_recv_common); /* Drop skbs from the session's reorder_q */ -static int l2tp_session_queue_purge(struct l2tp_session *session) +static void l2tp_session_queue_purge(struct l2tp_session *session) { struct sk_buff *skb = NULL; - BUG_ON(!session); - BUG_ON(session->magic != L2TP_SESSION_MAGIC); + + if (WARN_ON(session->magic != L2TP_SESSION_MAGIC)) + return; + while ((skb = skb_dequeue(&session->reorder_q))) { atomic_long_inc(&session->stats.rx_errors); kfree_skb(skb); } - return 0; } /* Internal UDP receive frame. 
Do the real work of receiving an L2TP data frame @@ -825,10 +864,11 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) } /* Point to L2TP header */ - optr = ptr = skb->data; + optr = skb->data; + ptr = skb->data; /* Get L2TP header flags */ - hdrflags = ntohs(*(__be16 *) ptr); + hdrflags = ntohs(*(__be16 *)ptr); /* Check protocol version */ version = hdrflags & L2TP_HDR_VER_MASK; @@ -859,14 +899,14 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) ptr += 2; /* Extract tunnel and session ID */ - tunnel_id = ntohs(*(__be16 *) ptr); + tunnel_id = ntohs(*(__be16 *)ptr); ptr += 2; - session_id = ntohs(*(__be16 *) ptr); + session_id = ntohs(*(__be16 *)ptr); ptr += 2; } else { ptr += 2; /* skip reserved bits */ tunnel_id = tunnel->tunnel_id; - session_id = ntohl(*(__be32 *) ptr); + session_id = ntohl(*(__be32 *)ptr); ptr += 4; } @@ -910,7 +950,7 @@ int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) struct l2tp_tunnel *tunnel; tunnel = rcu_dereference_sk_user_data(sk); - if (tunnel == NULL) + if (!tunnel) goto pass_up; l2tp_dbg(tunnel, L2TP_MSG_DATA, "%s: received %d bytes\n", @@ -971,13 +1011,13 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf) */ if (tunnel->encap == L2TP_ENCAPTYPE_UDP) { u16 flags = L2TP_HDR_VER_3; - *((__be16 *) bufp) = htons(flags); + *((__be16 *)bufp) = htons(flags); bufp += 2; - *((__be16 *) bufp) = 0; + *((__be16 *)bufp) = 0; bufp += 2; } - *((__be32 *) bufp) = htonl(session->peer_session_id); + *((__be32 *)bufp) = htonl(session->peer_session_id); bufp += 4; if (session->cookie_len) { memcpy(bufp, &session->cookie[0], session->cookie_len); @@ -1076,7 +1116,10 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len } /* Setup L2TP header */ - session->build_header(session, __skb_push(skb, hdr_len)); + if (tunnel->version == L2TP_HDR_VER_2) + l2tp_build_l2tpv2_header(session, __skb_push(skb, hdr_len)); + else + l2tp_build_l2tpv3_header(session, __skb_push(skb, hdr_len)); /* Reset skb netfilter state */ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -1121,8 +1164,8 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len &sk->sk_v6_daddr, udp_len); else #endif - udp_set_csum(sk->sk_no_check_tx, skb, inet->inet_saddr, - inet->inet_daddr, udp_len); + udp_set_csum(sk->sk_no_check_tx, skb, inet->inet_saddr, + inet->inet_daddr, udp_len); break; case L2TP_ENCAPTYPE_IP: @@ -1149,7 +1192,7 @@ static void l2tp_tunnel_destruct(struct sock *sk) { struct l2tp_tunnel *tunnel = l2tp_tunnel(sk); - if (tunnel == NULL) + if (!tunnel) goto end; l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: closing...\n", tunnel->name); @@ -1179,6 +1222,30 @@ end: return; } +/* Remove an l2tp session from l2tp_core's hash lists. 
*/ +static void l2tp_session_unhash(struct l2tp_session *session) +{ + struct l2tp_tunnel *tunnel = session->tunnel; + + /* Remove the session from core hashes */ + if (tunnel) { + /* Remove from the per-tunnel hash */ + write_lock_bh(&tunnel->hlist_lock); + hlist_del_init(&session->hlist); + write_unlock_bh(&tunnel->hlist_lock); + + /* For L2TPv3 we have a per-net hash: remove from there, too */ + if (tunnel->version != L2TP_HDR_VER_2) { + struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net); + + spin_lock_bh(&pn->l2tp_session_hlist_lock); + hlist_del_init_rcu(&session->global_hlist); + spin_unlock_bh(&pn->l2tp_session_hlist_lock); + synchronize_rcu(); + } + } +} + /* When the tunnel is closed, all the attached sessions need to go too. */ static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) @@ -1188,8 +1255,6 @@ static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) struct hlist_node *tmp; struct l2tp_session *session; - BUG_ON(tunnel == NULL); - l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: closing all sessions...\n", tunnel->name); @@ -1210,10 +1275,10 @@ again: write_unlock_bh(&tunnel->hlist_lock); - __l2tp_session_unhash(session); + l2tp_session_unhash(session); l2tp_session_queue_purge(session); - if (session->session_close != NULL) + if (session->session_close) (*session->session_close)(session); l2tp_session_dec_refcount(session); @@ -1284,10 +1349,10 @@ static void l2tp_tunnel_del_work(struct work_struct *work) * exit hook. */ static int l2tp_tunnel_sock_create(struct net *net, - u32 tunnel_id, - u32 peer_tunnel_id, - struct l2tp_tunnel_cfg *cfg, - struct socket **sockp) + u32 tunnel_id, + u32 peer_tunnel_id, + struct l2tp_tunnel_cfg *cfg, + struct socket **sockp) { int err = -EINVAL; struct socket *sock = NULL; @@ -1305,9 +1370,9 @@ static int l2tp_tunnel_sock_create(struct net *net, memcpy(&udp_conf.peer_ip6, cfg->peer_ip6, sizeof(udp_conf.peer_ip6)); udp_conf.use_udp6_tx_checksums = - ! cfg->udp6_zero_tx_checksums; + !cfg->udp6_zero_tx_checksums; udp_conf.use_udp6_rx_checksums = - ! 
cfg->udp6_zero_rx_checksums; + !cfg->udp6_zero_rx_checksums; } else #endif { @@ -1332,7 +1397,7 @@ static int l2tp_tunnel_sock_create(struct net *net, struct sockaddr_l2tpip6 ip6_addr = {0}; err = sock_create_kern(net, AF_INET6, SOCK_DGRAM, - IPPROTO_L2TP, &sock); + IPPROTO_L2TP, &sock); if (err < 0) goto out; @@ -1340,7 +1405,7 @@ static int l2tp_tunnel_sock_create(struct net *net, memcpy(&ip6_addr.l2tp_addr, cfg->local_ip6, sizeof(ip6_addr.l2tp_addr)); ip6_addr.l2tp_conn_id = tunnel_id; - err = kernel_bind(sock, (struct sockaddr *) &ip6_addr, + err = kernel_bind(sock, (struct sockaddr *)&ip6_addr, sizeof(ip6_addr)); if (err < 0) goto out; @@ -1350,7 +1415,7 @@ static int l2tp_tunnel_sock_create(struct net *net, sizeof(ip6_addr.l2tp_addr)); ip6_addr.l2tp_conn_id = peer_tunnel_id; err = kernel_connect(sock, - (struct sockaddr *) &ip6_addr, + (struct sockaddr *)&ip6_addr, sizeof(ip6_addr), 0); if (err < 0) goto out; @@ -1360,14 +1425,14 @@ static int l2tp_tunnel_sock_create(struct net *net, struct sockaddr_l2tpip ip_addr = {0}; err = sock_create_kern(net, AF_INET, SOCK_DGRAM, - IPPROTO_L2TP, &sock); + IPPROTO_L2TP, &sock); if (err < 0) goto out; ip_addr.l2tp_family = AF_INET; ip_addr.l2tp_addr = cfg->local_ip; ip_addr.l2tp_conn_id = tunnel_id; - err = kernel_bind(sock, (struct sockaddr *) &ip_addr, + err = kernel_bind(sock, (struct sockaddr *)&ip_addr, sizeof(ip_addr)); if (err < 0) goto out; @@ -1375,7 +1440,7 @@ static int l2tp_tunnel_sock_create(struct net *net, ip_addr.l2tp_family = AF_INET; ip_addr.l2tp_addr = cfg->peer_ip; ip_addr.l2tp_conn_id = peer_tunnel_id; - err = kernel_connect(sock, (struct sockaddr *) &ip_addr, + err = kernel_connect(sock, (struct sockaddr *)&ip_addr, sizeof(ip_addr), 0); if (err < 0) goto out; @@ -1388,7 +1453,7 @@ static int l2tp_tunnel_sock_create(struct net *net, out: *sockp = sock; - if ((err < 0) && sock) { + if (err < 0 && sock) { kernel_sock_shutdown(sock, SHUT_RDWR); sock_release(sock); *sockp = NULL; @@ -1399,17 +1464,18 @@ out: static struct lock_class_key l2tp_socket_class; -int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp) +int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, + struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp) { struct l2tp_tunnel *tunnel = NULL; int err; enum l2tp_encap_type encap = L2TP_ENCAPTYPE_UDP; - if (cfg != NULL) + if (cfg) encap = cfg->encap; - tunnel = kzalloc(sizeof(struct l2tp_tunnel), GFP_KERNEL); - if (tunnel == NULL) { + tunnel = kzalloc(sizeof(*tunnel), GFP_KERNEL); + if (!tunnel) { err = -ENOMEM; goto err; } @@ -1424,7 +1490,7 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 rwlock_init(&tunnel->hlist_lock); tunnel->acpt_newsess = true; - if (cfg != NULL) + if (cfg) tunnel->debug = cfg->debug; tunnel->encap = encap; @@ -1557,67 +1623,17 @@ void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) } EXPORT_SYMBOL_GPL(l2tp_tunnel_delete); -/* Really kill the session. - */ -void l2tp_session_free(struct l2tp_session *session) -{ - struct l2tp_tunnel *tunnel = session->tunnel; - - BUG_ON(refcount_read(&session->ref_count) != 0); - - if (tunnel) { - BUG_ON(tunnel->magic != L2TP_TUNNEL_MAGIC); - l2tp_tunnel_dec_refcount(tunnel); - } - - kfree(session); -} -EXPORT_SYMBOL_GPL(l2tp_session_free); - -/* Remove an l2tp session from l2tp_core's hash lists. 
- * Provides a tidyup interface for pseudowire code which can't just route all - * shutdown via. l2tp_session_delete and a pseudowire-specific session_close - * callback. - */ -void __l2tp_session_unhash(struct l2tp_session *session) -{ - struct l2tp_tunnel *tunnel = session->tunnel; - - /* Remove the session from core hashes */ - if (tunnel) { - /* Remove from the per-tunnel hash */ - write_lock_bh(&tunnel->hlist_lock); - hlist_del_init(&session->hlist); - write_unlock_bh(&tunnel->hlist_lock); - - /* For L2TPv3 we have a per-net hash: remove from there, too */ - if (tunnel->version != L2TP_HDR_VER_2) { - struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net); - spin_lock_bh(&pn->l2tp_session_hlist_lock); - hlist_del_init_rcu(&session->global_hlist); - spin_unlock_bh(&pn->l2tp_session_hlist_lock); - synchronize_rcu(); - } - } -} -EXPORT_SYMBOL_GPL(__l2tp_session_unhash); - -/* This function is used by the netlink SESSION_DELETE command and by - pseudowire modules. - */ -int l2tp_session_delete(struct l2tp_session *session) +void l2tp_session_delete(struct l2tp_session *session) { if (test_and_set_bit(0, &session->dead)) - return 0; + return; - __l2tp_session_unhash(session); + l2tp_session_unhash(session); l2tp_session_queue_purge(session); - if (session->session_close != NULL) + if (session->session_close) (*session->session_close)(session); l2tp_session_dec_refcount(session); - - return 0; } EXPORT_SYMBOL_GPL(l2tp_session_delete); @@ -1636,16 +1652,16 @@ void l2tp_session_set_header_len(struct l2tp_session *session, int version) if (session->tunnel->encap == L2TP_ENCAPTYPE_UDP) session->hdr_len += 4; } - } EXPORT_SYMBOL_GPL(l2tp_session_set_header_len); -struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg) +struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, + u32 peer_session_id, struct l2tp_session_cfg *cfg) { struct l2tp_session *session; - session = kzalloc(sizeof(struct l2tp_session) + priv_size, GFP_KERNEL); - if (session != NULL) { + session = kzalloc(sizeof(*session) + priv_size, GFP_KERNEL); + if (session) { session->magic = L2TP_SESSION_MAGIC; session->tunnel = tunnel; @@ -1687,11 +1703,6 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn memcpy(&session->peer_cookie[0], &cfg->peer_cookie[0], cfg->peer_cookie_len); } - if (tunnel->version == L2TP_HDR_VER_2) - session->build_header = l2tp_build_l2tpv2_header; - else - session->build_header = l2tp_build_l2tpv3_header; - l2tp_session_set_header_len(session, tunnel->version); refcount_set(&session->ref_count, 1); diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 10cf7c3dcbb3..3468d6b177a0 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* - * L2TP internal definitions. +/* L2TP internal definitions. 
* * Copyright (c) 2008,2009 Katalix Systems Ltd */ @@ -16,17 +15,17 @@ #include <net/xfrm.h> #endif -/* Just some random numbers */ +/* Random numbers used for internal consistency checks of tunnel and session structures */ #define L2TP_TUNNEL_MAGIC 0x42114DDA #define L2TP_SESSION_MAGIC 0x0C04EB7D -/* Per tunnel, session hash table size */ +/* Per tunnel session hash table size */ #define L2TP_HASH_BITS 4 -#define L2TP_HASH_SIZE (1 << L2TP_HASH_BITS) +#define L2TP_HASH_SIZE BIT(L2TP_HASH_BITS) -/* System-wide, session hash table size */ +/* System-wide session hash table size */ #define L2TP_HASH_BITS_2 8 -#define L2TP_HASH_SIZE_2 (1 << L2TP_HASH_BITS_2) +#define L2TP_HASH_SIZE_2 BIT(L2TP_HASH_BITS_2) struct sk_buff; @@ -44,37 +43,34 @@ struct l2tp_stats { struct l2tp_tunnel; -/* Describes a session. Contains information to determine incoming - * packets and transmit outgoing ones. - */ +/* L2TP session configuration */ struct l2tp_session_cfg { enum l2tp_pwtype pw_type; - unsigned int recv_seq:1; /* expect receive packets with - * sequence numbers? */ - unsigned int send_seq:1; /* send packets with sequence - * numbers? */ - unsigned int lns_mode:1; /* behave as LNS? LAC enables - * sequence numbers under - * control of LNS. */ - int debug; /* bitmask of debug message - * categories */ + unsigned int recv_seq:1; /* expect receive packets with sequence numbers? */ + unsigned int send_seq:1; /* send packets with sequence numbers? */ + unsigned int lns_mode:1; /* behave as LNS? + * LAC enables sequence numbers under LNS control. + */ + int debug; /* bitmask of debug message categories */ u16 l2specific_type; /* Layer 2 specific type */ u8 cookie[8]; /* optional cookie */ int cookie_len; /* 0, 4 or 8 bytes */ u8 peer_cookie[8]; /* peer's cookie */ int peer_cookie_len; /* 0, 4 or 8 bytes */ - int reorder_timeout; /* configured reorder timeout - * (in jiffies) */ + int reorder_timeout; /* configured reorder timeout (in jiffies) */ char *ifname; }; +/* Represents a session (pseudowire) instance. + * Tracks runtime state including cookies, dataplane packet sequencing, and IO statistics. + * Is linked into a per-tunnel session hashlist; and in the case of an L2TPv3 session into + * an additional per-net ("global") hashlist. + */ struct l2tp_session { - int magic; /* should be - * L2TP_SESSION_MAGIC */ + int magic; /* should be L2TP_SESSION_MAGIC */ long dead; - struct l2tp_tunnel *tunnel; /* back pointer to tunnel - * context */ + struct l2tp_tunnel *tunnel; /* back pointer to tunnel context */ u32 session_id; u32 peer_session_id; u8 cookie[8]; @@ -89,42 +85,53 @@ struct l2tp_session { u32 nr_max; /* max NR. Depends on tunnel */ u32 nr_window_size; /* NR window size */ u32 nr_oos; /* NR of last OOS packet */ - int nr_oos_count; /* For OOS recovery */ + int nr_oos_count; /* for OOS recovery */ int nr_oos_count_max; - struct hlist_node hlist; /* Hash list node */ + struct hlist_node hlist; /* hash list node */ refcount_t ref_count; char name[32]; /* for logging */ char ifname[IFNAMSIZ]; - unsigned int recv_seq:1; /* expect receive packets with - * sequence numbers? */ - unsigned int send_seq:1; /* send packets with sequence - * numbers? */ - unsigned int lns_mode:1; /* behave as LNS? LAC enables - * sequence numbers under - * control of LNS. */ - int debug; /* bitmask of debug message - * categories */ - int reorder_timeout; /* configured reorder timeout - * (in jiffies) */ + unsigned int recv_seq:1; /* expect receive packets with sequence numbers? 
*/ + unsigned int send_seq:1; /* send packets with sequence numbers? */ + unsigned int lns_mode:1; /* behave as LNS? + * LAC enables sequence numbers under LNS control. + */ + int debug; /* bitmask of debug message categories */ + int reorder_timeout; /* configured reorder timeout (in jiffies) */ int reorder_skip; /* set if skip to next nr */ enum l2tp_pwtype pwtype; struct l2tp_stats stats; - struct hlist_node global_hlist; /* Global hash list node */ + struct hlist_node global_hlist; /* global hash list node */ - int (*build_header)(struct l2tp_session *session, void *buf); + /* Session receive handler for data packets. + * Each pseudowire implementation should implement this callback in order to + * handle incoming packets. Packets are passed to the pseudowire handler after + * reordering, if data sequence numbers are enabled for the session. + */ void (*recv_skb)(struct l2tp_session *session, struct sk_buff *skb, int data_len); + + /* Session close handler. + * Each pseudowire implementation may implement this callback in order to carry + * out pseudowire-specific shutdown actions. + * The callback is called by core after unhashing the session and purging its + * reorder queue. + */ void (*session_close)(struct l2tp_session *session); + + /* Session show handler. + * Pseudowire-specific implementation of debugfs session rendering. + * The callback is called by l2tp_debugfs.c after rendering core session + * information. + */ void (*show)(struct seq_file *m, void *priv); - u8 priv[]; /* private data */ + + u8 priv[]; /* private data */ }; -/* Describes the tunnel. It contains info to track all the associated - * sessions so incoming packets can be sorted out - */ +/* L2TP tunnel configuration */ struct l2tp_tunnel_cfg { - int debug; /* bitmask of debug message - * categories */ + int debug; /* bitmask of debug message categories */ enum l2tp_encap_type encap; /* Used only for kernel-created sockets */ @@ -141,6 +148,12 @@ struct l2tp_tunnel_cfg { udp6_zero_rx_checksums:1; }; +/* Represents a tunnel instance. + * Tracks runtime state including IO statistics. + * Holds the tunnel socket (either passed from userspace or directly created by the kernel). + * Maintains a hashlist of sessions belonging to the tunnel instance. + * Is linked into a per-net list of tunnels. + */ struct l2tp_tunnel { int magic; /* Should be L2TP_TUNNEL_MAGIC */ @@ -148,40 +161,51 @@ struct l2tp_tunnel { struct rcu_head rcu; rwlock_t hlist_lock; /* protect session_hlist */ - bool acpt_newsess; /* Indicates whether this - * tunnel accepts new sessions. - * Protected by hlist_lock. + bool acpt_newsess; /* indicates whether this tunnel accepts + * new sessions. Protected by hlist_lock. 
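 * Cleared when the tunnel starts closing, at which
 * point l2tp_session_register() refuses new sessions.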
*/ struct hlist_head session_hlist[L2TP_HASH_SIZE]; - /* hashed list of sessions, - * hashed by id */ + /* hashed list of sessions, hashed by id */ u32 tunnel_id; u32 peer_tunnel_id; int version; /* 2=>L2TPv2, 3=>L2TPv3 */ char name[20]; /* for logging */ - int debug; /* bitmask of debug message - * categories */ + int debug; /* bitmask of debug message categories */ enum l2tp_encap_type encap; struct l2tp_stats stats; - struct list_head list; /* Keep a list of all tunnels */ + struct list_head list; /* list node on per-namespace list of tunnels */ struct net *l2tp_net; /* the net we belong to */ refcount_t ref_count; - void (*old_sk_destruct)(struct sock *); - struct sock *sock; /* Parent socket */ - int fd; /* Parent fd, if tunnel socket - * was created by userspace */ + void (*old_sk_destruct)(struct sock *sk); + struct sock *sock; /* parent socket */ + int fd; /* parent fd, if tunnel socket was created + * by userspace + */ struct work_struct del_work; }; +/* Pseudowire ops callbacks for use with the l2tp genetlink interface */ struct l2tp_nl_cmd_ops { + /* The pseudowire session create callback is responsible for creating a session + * instance for a specific pseudowire type. + * It must call l2tp_session_create and l2tp_session_register to register the + * session instance, as well as carry out any pseudowire-specific initialisation. + * It must return >= 0 on success, or an appropriate negative errno value on failure. + */ int (*session_create)(struct net *net, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg); - int (*session_delete)(struct l2tp_session *session); + + /* The pseudowire session delete callback is responsible for initiating the deletion + * of a session instance. + * It must call l2tp_session_delete, as well as carry out any pseudowire-specific + * teardown actions. + */ + void (*session_delete)(struct l2tp_session *session); }; static inline void *l2tp_session_priv(struct l2tp_session *session) @@ -189,73 +213,68 @@ static inline void *l2tp_session_priv(struct l2tp_session *session) return &session->priv[0]; } +/* Tunnel and session refcounts */ +void l2tp_tunnel_inc_refcount(struct l2tp_tunnel *tunnel); +void l2tp_tunnel_dec_refcount(struct l2tp_tunnel *tunnel); +void l2tp_session_inc_refcount(struct l2tp_session *session); +void l2tp_session_dec_refcount(struct l2tp_session *session); + +/* Tunnel and session lookup. + * These functions take a reference on the instances they return, so + * the caller must ensure that the reference is dropped appropriately. + */ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id); struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth); struct l2tp_session *l2tp_tunnel_get_session(struct l2tp_tunnel *tunnel, u32 session_id); -void l2tp_tunnel_free(struct l2tp_tunnel *tunnel); - struct l2tp_session *l2tp_session_get(const struct net *net, u32 session_id); struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth); struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, const char *ifname); +/* Tunnel and session lifetime management. + * Creation of a new instance is a two-step process: create, then register. + * Destruction is triggered using the *_delete functions, and completes asynchronously. 
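 */

/* A hypothetical minimal user of the two-step API documented above, for
 * orientation only (not part of this patch; IDs invented, and the kfree()
 * on registration failure mirrors what the netlink handler does):
 */
static int example_tunnel_setup(struct net *net, int fd,
				struct l2tp_tunnel_cfg *cfg,
				struct l2tp_tunnel **tunnelp)
{
	int err;

	/* step 1: allocate and initialise the instance */
	err = l2tp_tunnel_create(net, fd, 3, 42, 43, cfg, tunnelp);
	if (err < 0)
		return err;

	/* step 2: make it visible to lookups */
	err = l2tp_tunnel_register(*tunnelp, net, cfg);
	if (err < 0) {
		kfree(*tunnelp);
		return err;
	}

	/* ... later: triggers teardown, which completes asynchronously ... */
	l2tp_tunnel_delete(*tunnelp);
	return 0;
}

/*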
+ */ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp); int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, struct l2tp_tunnel_cfg *cfg); - void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); + struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg); int l2tp_session_register(struct l2tp_session *session, struct l2tp_tunnel *tunnel); +void l2tp_session_delete(struct l2tp_session *session); -void __l2tp_session_unhash(struct l2tp_session *session); -int l2tp_session_delete(struct l2tp_session *session); -void l2tp_session_free(struct l2tp_session *session); +/* Receive path helpers. If data sequencing is enabled for the session these + * functions handle queuing and reordering prior to passing packets to the + * pseudowire code to be passed to userspace. + */ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, int length); int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb); -void l2tp_session_set_header_len(struct l2tp_session *session, int version); +/* Transmit path helpers for sending packets over the tunnel socket. */ +void l2tp_session_set_header_len(struct l2tp_session *session, int version); int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len); -int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, - const struct l2tp_nl_cmd_ops *ops); -void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type); -int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg); - -static inline void l2tp_tunnel_inc_refcount(struct l2tp_tunnel *tunnel) -{ - refcount_inc(&tunnel->ref_count); -} - -static inline void l2tp_tunnel_dec_refcount(struct l2tp_tunnel *tunnel) -{ - if (refcount_dec_and_test(&tunnel->ref_count)) - l2tp_tunnel_free(tunnel); -} - -/* Session reference counts. Incremented when code obtains a reference - * to a session. +/* Pseudowire management. + * Pseudowires should register with l2tp core on module init, and unregister + * on module exit. */ -static inline void l2tp_session_inc_refcount(struct l2tp_session *session) -{ - refcount_inc(&session->ref_count); -} +int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops); +void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type); -static inline void l2tp_session_dec_refcount(struct l2tp_session *session) -{ - if (refcount_dec_and_test(&session->ref_count)) - l2tp_session_free(session); -} +/* IOCTL helper for IP encap modules. 
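 * (the declaration itself follows the example below)
 */

/* Separately, the lookup helpers declared further up return their result
 * with a reference held. A hypothetical caller (not part of this patch)
 * drops it when done:
 */
static void example_log_session(struct net *net, u32 session_id)
{
	struct l2tp_session *session;

	session = l2tp_session_get(net, session_id);
	if (!session)
		return;

	pr_info("found session %s\n", session->name);
	l2tp_session_dec_refcount(session);	/* drop the lookup reference */
}

/*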
*/ +int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg); static inline int l2tp_get_l2specific_len(struct l2tp_session *session) { diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index 35bb4f3bdbe0..96cb9601c21b 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* - * L2TP subsystem debugfs +/* L2TP subsystem debugfs * * Copyright (c) 2010 Katalix Systems Ltd */ @@ -59,11 +58,10 @@ static void l2tp_dfs_next_session(struct l2tp_dfs_seq_data *pd) pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx); pd->session_idx++; - if (pd->session == NULL) { + if (!pd->session) { pd->session_idx = 0; l2tp_dfs_next_tunnel(pd); } - } static void *l2tp_dfs_seq_start(struct seq_file *m, loff_t *offs) @@ -74,23 +72,25 @@ static void *l2tp_dfs_seq_start(struct seq_file *m, loff_t *offs) if (!pos) goto out; - BUG_ON(m->private == NULL); + if (WARN_ON(!m->private)) { + pd = NULL; + goto out; + } pd = m->private; - if (pd->tunnel == NULL) + if (!pd->tunnel) l2tp_dfs_next_tunnel(pd); else l2tp_dfs_next_session(pd); /* NULL tunnel and session indicates end of list */ - if ((pd->tunnel == NULL) && (pd->session == NULL)) + if (!pd->tunnel && !pd->session) pd = NULL; out: return pd; } - static void *l2tp_dfs_seq_next(struct seq_file *m, void *v, loff_t *pos) { (*pos)++; @@ -148,11 +148,13 @@ static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v) const struct ipv6_pinfo *np = inet6_sk(tunnel->sock); seq_printf(m, " from %pI6c to %pI6c\n", - &np->saddr, &tunnel->sock->sk_v6_daddr); - } else + &np->saddr, &tunnel->sock->sk_v6_daddr); + } #endif - seq_printf(m, " from %pI4 to %pI4\n", - &inet->inet_saddr, &inet->inet_daddr); + if (tunnel->sock->sk_family == AF_INET) + seq_printf(m, " from %pI4 to %pI4\n", + &inet->inet_saddr, &inet->inet_daddr); + if (tunnel->encap == L2TP_ENCAPTYPE_UDP) seq_printf(m, " source port %hu, dest port %hu\n", ntohs(inet->inet_sport), ntohs(inet->inet_dport)); @@ -202,7 +204,7 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v) seq_printf(m, "%02x%02x%02x%02x", session->cookie[4], session->cookie[5], session->cookie[6], session->cookie[7]); - seq_printf(m, "\n"); + seq_puts(m, "\n"); } if (session->peer_cookie_len) { seq_printf(m, " peer cookie %02x%02x%02x%02x", @@ -212,7 +214,7 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v) seq_printf(m, "%02x%02x%02x%02x", session->peer_cookie[4], session->peer_cookie[5], session->peer_cookie[6], session->peer_cookie[7]); - seq_printf(m, "\n"); + seq_puts(m, "\n"); } seq_printf(m, " %hu/%hu tx %ld/%ld/%ld rx %ld/%ld/%ld\n", @@ -224,7 +226,7 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v) atomic_long_read(&session->stats.rx_bytes), atomic_long_read(&session->stats.rx_errors)); - if (session->show != NULL) + if (session->show) session->show(m, session); } @@ -271,7 +273,7 @@ static int l2tp_dfs_seq_open(struct inode *inode, struct file *file) int rc = -ENOMEM; pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (pd == NULL) + if (!pd) goto out; /* Derive the network namespace from the pid opening the diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index fd5ac2788e45..7ed2b4eced94 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* - * L2TPv3 ethernet pseudowire driver +/* L2TPv3 ethernet pseudowire driver * * Copyright (c) 2008,2009,2010 Katalix Systems Ltd */ @@ -51,7 +50,6 @@ struct l2tp_eth_sess { 
struct net_device __rcu *dev; }; - static int l2tp_eth_dev_init(struct net_device *dev) { eth_hw_addr_random(dev); @@ -73,7 +71,7 @@ static void l2tp_eth_dev_uninit(struct net_device *dev) */ } -static int l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev) { struct l2tp_eth *priv = netdev_priv(dev); struct l2tp_session *session = priv->session; @@ -94,13 +92,12 @@ static void l2tp_eth_get_stats64(struct net_device *dev, { struct l2tp_eth *priv = netdev_priv(dev); - stats->tx_bytes = (unsigned long) atomic_long_read(&priv->tx_bytes); - stats->tx_packets = (unsigned long) atomic_long_read(&priv->tx_packets); - stats->tx_dropped = (unsigned long) atomic_long_read(&priv->tx_dropped); - stats->rx_bytes = (unsigned long) atomic_long_read(&priv->rx_bytes); - stats->rx_packets = (unsigned long) atomic_long_read(&priv->rx_packets); - stats->rx_errors = (unsigned long) atomic_long_read(&priv->rx_errors); - + stats->tx_bytes = (unsigned long)atomic_long_read(&priv->tx_bytes); + stats->tx_packets = (unsigned long)atomic_long_read(&priv->tx_packets); + stats->tx_dropped = (unsigned long)atomic_long_read(&priv->tx_dropped); + stats->rx_bytes = (unsigned long)atomic_long_read(&priv->rx_bytes); + stats->rx_packets = (unsigned long)atomic_long_read(&priv->rx_packets); + stats->rx_errors = (unsigned long)atomic_long_read(&priv->rx_errors); } static const struct net_device_ops l2tp_eth_netdev_ops = { @@ -348,13 +345,11 @@ err: return rc; } - static const struct l2tp_nl_cmd_ops l2tp_eth_nl_cmd_ops = { .session_create = l2tp_eth_create, .session_delete = l2tp_session_delete, }; - static int __init l2tp_eth_init(void) { int err = 0; diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 955662a6dee7..df2a35b5714a 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* - * L2TPv3 IP encapsulation support +/* L2TPv3 IP encapsulation support * * Copyright (c) 2008,2009,2010 Katalix Systems Ltd */ @@ -125,8 +124,9 @@ static int l2tp_ip_recv(struct sk_buff *skb) goto discard; /* Point to L2TP header */ - optr = ptr = skb->data; - session_id = ntohl(*((__be32 *) ptr)); + optr = skb->data; + ptr = skb->data; + session_id = ntohl(*((__be32 *)ptr)); ptr += 4; /* RFC3931: L2TP/IP packets have the first 4 bytes containing @@ -154,7 +154,8 @@ static int l2tp_ip_recv(struct sk_buff *skb) goto discard_sess; /* Point to L2TP header */ - optr = ptr = skb->data; + optr = skb->data; + ptr = skb->data; ptr += 4; pr_debug("%s: ip recv\n", tunnel->name); print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length); @@ -176,7 +177,7 @@ pass_up: if ((skb->data[0] & 0xc0) != 0xc0) goto discard; - tunnel_id = ntohl(*(__be32 *) &skb->data[4]); + tunnel_id = ntohl(*(__be32 *)&skb->data[4]); iph = (struct iphdr *)skb_network_header(skb); read_lock_bh(&l2tp_ip_lock); @@ -260,7 +261,7 @@ static void l2tp_ip_destroy_sock(struct sock *sk) static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); - struct sockaddr_l2tpip *addr = (struct sockaddr_l2tpip *) uaddr; + struct sockaddr_l2tpip *addr = (struct sockaddr_l2tpip *)uaddr; struct net *net = sock_net(sk); int ret; int chk_addr_ret; @@ -285,8 +286,10 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) goto out; - if (addr->l2tp_addr.s_addr) - inet->inet_rcv_saddr = 
inet->inet_saddr = addr->l2tp_addr.s_addr; + if (addr->l2tp_addr.s_addr) { + inet->inet_rcv_saddr = addr->l2tp_addr.s_addr; + inet->inet_saddr = addr->l2tp_addr.s_addr; + } if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) inet->inet_saddr = 0; /* Use device */ @@ -316,7 +319,7 @@ out: static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { - struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *) uaddr; + struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *)uaddr; int rc; if (addr_len < sizeof(*lsa)) @@ -375,6 +378,7 @@ static int l2tp_ip_getname(struct socket *sock, struct sockaddr *uaddr, lsa->l2tp_addr.s_addr = inet->inet_daddr; } else { __be32 addr = inet->inet_rcv_saddr; + if (!addr) addr = inet->inet_saddr; lsa->l2tp_conn_id = lsk->conn_id; @@ -422,6 +426,7 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) /* Get and verify the address. */ if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_l2tpip *, lip, msg->msg_name); + rc = -EINVAL; if (msg->msg_namelen < sizeof(*lip)) goto out; @@ -456,7 +461,7 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) skb_reset_transport_header(skb); /* Insert 0 session_id */ - *((__be32 *) skb_put(skb, 4)) = 0; + *((__be32 *)skb_put(skb, 4)) = 0; /* Copy user data into skb */ rc = memcpy_from_msg(skb_put(skb, len), msg, len); @@ -467,10 +472,10 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl4 = &inet->cork.fl.u.ip4; if (connected) - rt = (struct rtable *) __sk_dst_check(sk, 0); + rt = (struct rtable *)__sk_dst_check(sk, 0); rcu_read_lock(); - if (rt == NULL) { + if (!rt) { const struct ip_options_rcu *inet_opt; inet_opt = rcu_dereference(inet->inet_opt); @@ -592,7 +597,7 @@ int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg) return put_user(amount, (int __user *)arg); } -EXPORT_SYMBOL(l2tp_ioctl); +EXPORT_SYMBOL_GPL(l2tp_ioctl); static struct proto l2tp_ip_prot = { .name = "L2TP/IP", @@ -612,10 +617,6 @@ static struct proto l2tp_ip_prot = { .hash = l2tp_ip_hash, .unhash = l2tp_ip_unhash, .obj_size = sizeof(struct l2tp_ip_sock), -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_ip_setsockopt, - .compat_getsockopt = compat_ip_getsockopt, -#endif }; static const struct proto_ops l2tp_ip_ops = { @@ -638,10 +639,6 @@ static const struct proto_ops l2tp_ip_ops = { .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, -#endif }; static struct inet_protosw l2tp_ip_protosw = { diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 526ed2c24dd5..bc757bc7e264 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* - * L2TPv3 IP encapsulation support for IPv6 +/* L2TPv3 IP encapsulation support for IPv6 * * Copyright (c) 2012 Katalix Systems Ltd */ @@ -38,7 +37,8 @@ struct l2tp_ip6_sock { u32 peer_conn_id; /* ipv6_pinfo has to be the last member of l2tp_ip6_sock, see - inet6_sk_generic */ + * inet6_sk_generic + */ struct ipv6_pinfo inet6; }; @@ -137,8 +137,9 @@ static int l2tp_ip6_recv(struct sk_buff *skb) goto discard; /* Point to L2TP header */ - optr = ptr = skb->data; - session_id = ntohl(*((__be32 *) ptr)); + optr = skb->data; + ptr = skb->data; + session_id = ntohl(*((__be32 *)ptr)); ptr += 4; /* RFC3931: L2TP/IP packets have the first 4 bytes containing @@ -166,7 +167,8 @@ 
static int l2tp_ip6_recv(struct sk_buff *skb) goto discard_sess; /* Point to L2TP header */ - optr = ptr = skb->data; + optr = skb->data; + ptr = skb->data; ptr += 4; pr_debug("%s: ip recv\n", tunnel->name); print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length); @@ -188,7 +190,7 @@ pass_up: if ((skb->data[0] & 0xc0) != 0xc0) goto discard; - tunnel_id = ntohl(*(__be32 *) &skb->data[4]); + tunnel_id = ntohl(*(__be32 *)&skb->data[4]); iph = ipv6_hdr(skb); read_lock_bh(&l2tp_ip6_lock); @@ -276,7 +278,7 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); - struct sockaddr_l2tpip6 *addr = (struct sockaddr_l2tpip6 *) uaddr; + struct sockaddr_l2tpip6 *addr = (struct sockaddr_l2tpip6 *)uaddr; struct net *net = sock_net(sk); __be32 v4addr = 0; int bound_dev_if; @@ -375,8 +377,8 @@ out_unlock: static int l2tp_ip6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { - struct sockaddr_l2tpip6 *lsa = (struct sockaddr_l2tpip6 *) uaddr; - struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; + struct sockaddr_l2tpip6 *lsa = (struct sockaddr_l2tpip6 *)uaddr; + struct sockaddr_in6 *usin = (struct sockaddr_in6 *)uaddr; struct in6_addr *daddr; int addr_type; int rc; @@ -486,7 +488,7 @@ static int l2tp_ip6_push_pending_frames(struct sock *sk) int err = 0; skb = skb_peek(&sk->sk_write_queue); - if (skb == NULL) + if (!skb) goto out; transhdr = (__be32 *)skb_transport_header(skb); @@ -519,7 +521,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int err; /* Rough check on arithmetic overflow, - better check is made in ip6_append_data(). + * better check is made in ip6_append_data(). */ if (len > INT_MAX) return -EMSGSIZE; @@ -528,9 +530,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; - /* - * Get and verify the address. - */ + /* Get and verify the address */ memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_mark = sk->sk_mark; @@ -548,15 +548,14 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) daddr = &lsa->l2tp_addr; if (np->sndflow) { fl6.flowlabel = lsa->l2tp_flowinfo & IPV6_FLOWINFO_MASK; - if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { + if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; } } - /* - * Otherwise it will be difficult to maintain + /* Otherwise it will be difficult to maintain * sk->sk_dst_cache. 
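 * (when sending to the connected peer, reuse the socket's own stored
 * address so the route cached on the socket stays usable)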
*/ if (sk->sk_state == TCP_ESTABLISHED && @@ -594,7 +593,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (IS_ERR(flowlabel)) return -EINVAL; } - if (!(opt->opt_nflen|opt->opt_flen)) + if (!(opt->opt_nflen | opt->opt_flen)) opt = NULL; } @@ -745,10 +744,6 @@ static struct proto l2tp_ip6_prot = { .hash = l2tp_ip6_hash, .unhash = l2tp_ip6_unhash, .obj_size = sizeof(struct l2tp_ip6_sock), -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_ipv6_setsockopt, - .compat_getsockopt = compat_ipv6_getsockopt, -#endif }; static const struct proto_ops l2tp_ip6_ops = { @@ -773,8 +768,6 @@ static const struct proto_ops l2tp_ip6_ops = { .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, #endif }; diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index ebb381c3f1b9..def78eebca4c 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* - * L2TP netlink layer, for management +/* L2TP netlink layer, for management * * Copyright (c) 2008,2009,2010 Katalix Systems Ltd * @@ -27,7 +26,6 @@ #include "l2tp_core.h" - static struct genl_family l2tp_nl_family; static const struct genl_multicast_group l2tp_multicast_group[] = { @@ -157,81 +155,82 @@ static int l2tp_session_notify(struct genl_family *family, return ret; } +static int l2tp_nl_cmd_tunnel_create_get_addr(struct nlattr **attrs, struct l2tp_tunnel_cfg *cfg) +{ + if (attrs[L2TP_ATTR_UDP_SPORT]) + cfg->local_udp_port = nla_get_u16(attrs[L2TP_ATTR_UDP_SPORT]); + if (attrs[L2TP_ATTR_UDP_DPORT]) + cfg->peer_udp_port = nla_get_u16(attrs[L2TP_ATTR_UDP_DPORT]); + cfg->use_udp_checksums = nla_get_flag(attrs[L2TP_ATTR_UDP_CSUM]); + + /* Must have either AF_INET or AF_INET6 address for source and destination */ +#if IS_ENABLED(CONFIG_IPV6) + if (attrs[L2TP_ATTR_IP6_SADDR] && attrs[L2TP_ATTR_IP6_DADDR]) { + cfg->local_ip6 = nla_data(attrs[L2TP_ATTR_IP6_SADDR]); + cfg->peer_ip6 = nla_data(attrs[L2TP_ATTR_IP6_DADDR]); + cfg->udp6_zero_tx_checksums = nla_get_flag(attrs[L2TP_ATTR_UDP_ZERO_CSUM6_TX]); + cfg->udp6_zero_rx_checksums = nla_get_flag(attrs[L2TP_ATTR_UDP_ZERO_CSUM6_RX]); + return 0; + } +#endif + if (attrs[L2TP_ATTR_IP_SADDR] && attrs[L2TP_ATTR_IP_DADDR]) { + cfg->local_ip.s_addr = nla_get_in_addr(attrs[L2TP_ATTR_IP_SADDR]); + cfg->peer_ip.s_addr = nla_get_in_addr(attrs[L2TP_ATTR_IP_DADDR]); + return 0; + } + return -EINVAL; +} + static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info) { u32 tunnel_id; u32 peer_tunnel_id; int proto_version; - int fd; + int fd = -1; int ret = 0; struct l2tp_tunnel_cfg cfg = { 0, }; struct l2tp_tunnel *tunnel; struct net *net = genl_info_net(info); + struct nlattr **attrs = info->attrs; - if (!info->attrs[L2TP_ATTR_CONN_ID]) { + if (!attrs[L2TP_ATTR_CONN_ID]) { ret = -EINVAL; goto out; } - tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); + tunnel_id = nla_get_u32(attrs[L2TP_ATTR_CONN_ID]); - if (!info->attrs[L2TP_ATTR_PEER_CONN_ID]) { + if (!attrs[L2TP_ATTR_PEER_CONN_ID]) { ret = -EINVAL; goto out; } - peer_tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_PEER_CONN_ID]); + peer_tunnel_id = nla_get_u32(attrs[L2TP_ATTR_PEER_CONN_ID]); - if (!info->attrs[L2TP_ATTR_PROTO_VERSION]) { + if (!attrs[L2TP_ATTR_PROTO_VERSION]) { ret = -EINVAL; goto out; } - proto_version = nla_get_u8(info->attrs[L2TP_ATTR_PROTO_VERSION]); + proto_version = 
nla_get_u8(attrs[L2TP_ATTR_PROTO_VERSION]); - if (!info->attrs[L2TP_ATTR_ENCAP_TYPE]) { + if (!attrs[L2TP_ATTR_ENCAP_TYPE]) { ret = -EINVAL; goto out; } - cfg.encap = nla_get_u16(info->attrs[L2TP_ATTR_ENCAP_TYPE]); - - fd = -1; - if (info->attrs[L2TP_ATTR_FD]) { - fd = nla_get_u32(info->attrs[L2TP_ATTR_FD]); + cfg.encap = nla_get_u16(attrs[L2TP_ATTR_ENCAP_TYPE]); + + /* Managed tunnels take the tunnel socket from userspace. + * Unmanaged tunnels must call out the source and destination addresses + * for the kernel to create the tunnel socket itself. + */ + if (attrs[L2TP_ATTR_FD]) { + fd = nla_get_u32(attrs[L2TP_ATTR_FD]); } else { -#if IS_ENABLED(CONFIG_IPV6) - if (info->attrs[L2TP_ATTR_IP6_SADDR] && - info->attrs[L2TP_ATTR_IP6_DADDR]) { - cfg.local_ip6 = nla_data( - info->attrs[L2TP_ATTR_IP6_SADDR]); - cfg.peer_ip6 = nla_data( - info->attrs[L2TP_ATTR_IP6_DADDR]); - } else -#endif - if (info->attrs[L2TP_ATTR_IP_SADDR] && - info->attrs[L2TP_ATTR_IP_DADDR]) { - cfg.local_ip.s_addr = nla_get_in_addr( - info->attrs[L2TP_ATTR_IP_SADDR]); - cfg.peer_ip.s_addr = nla_get_in_addr( - info->attrs[L2TP_ATTR_IP_DADDR]); - } else { - ret = -EINVAL; + ret = l2tp_nl_cmd_tunnel_create_get_addr(attrs, &cfg); + if (ret < 0) goto out; - } - if (info->attrs[L2TP_ATTR_UDP_SPORT]) - cfg.local_udp_port = nla_get_u16(info->attrs[L2TP_ATTR_UDP_SPORT]); - if (info->attrs[L2TP_ATTR_UDP_DPORT]) - cfg.peer_udp_port = nla_get_u16(info->attrs[L2TP_ATTR_UDP_DPORT]); - cfg.use_udp_checksums = nla_get_flag( - info->attrs[L2TP_ATTR_UDP_CSUM]); - -#if IS_ENABLED(CONFIG_IPV6) - cfg.udp6_zero_tx_checksums = nla_get_flag( - info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_TX]); - cfg.udp6_zero_rx_checksums = nla_get_flag( - info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_RX]); -#endif } - if (info->attrs[L2TP_ATTR_DEBUG]) - cfg.debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]); + if (attrs[L2TP_ATTR_DEBUG]) + cfg.debug = nla_get_u32(attrs[L2TP_ATTR_DEBUG]); ret = -EINVAL; switch (cfg.encap) { @@ -320,16 +319,79 @@ out: return ret; } +#if IS_ENABLED(CONFIG_IPV6) +static int l2tp_nl_tunnel_send_addr6(struct sk_buff *skb, struct sock *sk, + enum l2tp_encap_type encap) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + + switch (encap) { + case L2TP_ENCAPTYPE_UDP: + if (udp_get_no_check6_tx(sk) && + nla_put_flag(skb, L2TP_ATTR_UDP_ZERO_CSUM6_TX)) + return -1; + if (udp_get_no_check6_rx(sk) && + nla_put_flag(skb, L2TP_ATTR_UDP_ZERO_CSUM6_RX)) + return -1; + if (nla_put_u16(skb, L2TP_ATTR_UDP_SPORT, ntohs(inet->inet_sport)) || + nla_put_u16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport))) + return -1; + fallthrough; + case L2TP_ENCAPTYPE_IP: + if (nla_put_in6_addr(skb, L2TP_ATTR_IP6_SADDR, &np->saddr) || + nla_put_in6_addr(skb, L2TP_ATTR_IP6_DADDR, &sk->sk_v6_daddr)) + return -1; + break; + } + return 0; +} +#endif + +static int l2tp_nl_tunnel_send_addr4(struct sk_buff *skb, struct sock *sk, + enum l2tp_encap_type encap) +{ + struct inet_sock *inet = inet_sk(sk); + + switch (encap) { + case L2TP_ENCAPTYPE_UDP: + if (nla_put_u8(skb, L2TP_ATTR_UDP_CSUM, !sk->sk_no_check_tx) || + nla_put_u16(skb, L2TP_ATTR_UDP_SPORT, ntohs(inet->inet_sport)) || + nla_put_u16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport))) + return -1; + fallthrough; + case L2TP_ENCAPTYPE_IP: + if (nla_put_in_addr(skb, L2TP_ATTR_IP_SADDR, inet->inet_saddr) || + nla_put_in_addr(skb, L2TP_ATTR_IP_DADDR, inet->inet_daddr)) + return -1; + break; + } + + return 0; +} + +/* Append attributes for the tunnel address, handling the different attribute types + * 
used for different tunnel encapsulation and AF_INET v.s. AF_INET6. + */ +static int l2tp_nl_tunnel_send_addr(struct sk_buff *skb, struct l2tp_tunnel *tunnel) +{ + struct sock *sk = tunnel->sock; + + if (!sk) + return 0; + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + return l2tp_nl_tunnel_send_addr6(skb, sk, tunnel->encap); +#endif + return l2tp_nl_tunnel_send_addr4(skb, sk, tunnel->encap); +} + static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int flags, struct l2tp_tunnel *tunnel, u8 cmd) { void *hdr; struct nlattr *nest; - struct sock *sk = NULL; - struct inet_sock *inet; -#if IS_ENABLED(CONFIG_IPV6) - struct ipv6_pinfo *np = NULL; -#endif hdr = genlmsg_put(skb, portid, seq, &l2tp_nl_family, flags, cmd); if (!hdr) @@ -343,7 +405,7 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla goto nla_put_failure; nest = nla_nest_start_noflag(skb, L2TP_ATTR_STATS); - if (nest == NULL) + if (!nest) goto nla_put_failure; if (nla_put_u64_64bit(skb, L2TP_ATTR_TX_PACKETS, @@ -373,58 +435,9 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla goto nla_put_failure; nla_nest_end(skb, nest); - sk = tunnel->sock; - if (!sk) - goto out; - -#if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) - np = inet6_sk(sk); -#endif - - inet = inet_sk(sk); - - switch (tunnel->encap) { - case L2TP_ENCAPTYPE_UDP: - switch (sk->sk_family) { - case AF_INET: - if (nla_put_u8(skb, L2TP_ATTR_UDP_CSUM, !sk->sk_no_check_tx)) - goto nla_put_failure; - break; -#if IS_ENABLED(CONFIG_IPV6) - case AF_INET6: - if (udp_get_no_check6_tx(sk) && - nla_put_flag(skb, L2TP_ATTR_UDP_ZERO_CSUM6_TX)) - goto nla_put_failure; - if (udp_get_no_check6_rx(sk) && - nla_put_flag(skb, L2TP_ATTR_UDP_ZERO_CSUM6_RX)) - goto nla_put_failure; - break; -#endif - } - if (nla_put_u16(skb, L2TP_ATTR_UDP_SPORT, ntohs(inet->inet_sport)) || - nla_put_u16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport))) - goto nla_put_failure; - /* fall through */ - case L2TP_ENCAPTYPE_IP: -#if IS_ENABLED(CONFIG_IPV6) - if (np) { - if (nla_put_in6_addr(skb, L2TP_ATTR_IP6_SADDR, - &np->saddr) || - nla_put_in6_addr(skb, L2TP_ATTR_IP6_DADDR, - &sk->sk_v6_daddr)) - goto nla_put_failure; - } else -#endif - if (nla_put_in_addr(skb, L2TP_ATTR_IP_SADDR, - inet->inet_saddr) || - nla_put_in_addr(skb, L2TP_ATTR_IP_DADDR, - inet->inet_daddr)) - goto nla_put_failure; - break; - } + if (l2tp_nl_tunnel_send_addr(skb, tunnel)) + goto nla_put_failure; -out: genlmsg_end(skb, hdr); return 0; @@ -485,7 +498,7 @@ static int l2tp_nl_cmd_tunnel_dump(struct sk_buff *skb, struct netlink_callback for (;;) { tunnel = l2tp_tunnel_get_nth(net, ti); - if (tunnel == NULL) + if (!tunnel) goto out; if (l2tp_nl_tunnel_send(skb, NETLINK_CB(cb->skb).portid, @@ -570,6 +583,7 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf if (info->attrs[L2TP_ATTR_COOKIE]) { u16 len = nla_len(info->attrs[L2TP_ATTR_COOKIE]); + if (len > 8) { ret = -EINVAL; goto out_tunnel; @@ -579,6 +593,7 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf } if (info->attrs[L2TP_ATTR_PEER_COOKIE]) { u16 len = nla_len(info->attrs[L2TP_ATTR_PEER_COOKIE]); + if (len > 8) { ret = -EINVAL; goto out_tunnel; @@ -606,14 +621,13 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf cfg.reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]); #ifdef CONFIG_MODULES - if (l2tp_nl_cmd_ops[cfg.pw_type] == NULL) { + if 
(!l2tp_nl_cmd_ops[cfg.pw_type]) { genl_unlock(); request_module("net-l2tp-type-%u", cfg.pw_type); genl_lock(); } #endif - if ((l2tp_nl_cmd_ops[cfg.pw_type] == NULL) || - (l2tp_nl_cmd_ops[cfg.pw_type]->session_create == NULL)) { + if (!l2tp_nl_cmd_ops[cfg.pw_type] || !l2tp_nl_cmd_ops[cfg.pw_type]->session_create) { ret = -EPROTONOSUPPORT; goto out_tunnel; } @@ -645,7 +659,7 @@ static int l2tp_nl_cmd_session_delete(struct sk_buff *skb, struct genl_info *inf u16 pw_type; session = l2tp_nl_session_get(info); - if (session == NULL) { + if (!session) { ret = -ENODEV; goto out; } @@ -656,7 +670,7 @@ static int l2tp_nl_cmd_session_delete(struct sk_buff *skb, struct genl_info *inf pw_type = session->pwtype; if (pw_type < __L2TP_PWTYPE_MAX) if (l2tp_nl_cmd_ops[pw_type] && l2tp_nl_cmd_ops[pw_type]->session_delete) - ret = (*l2tp_nl_cmd_ops[pw_type]->session_delete)(session); + l2tp_nl_cmd_ops[pw_type]->session_delete(session); l2tp_session_dec_refcount(session); @@ -670,7 +684,7 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf struct l2tp_session *session; session = l2tp_nl_session_get(info); - if (session == NULL) { + if (!session) { ret = -ENODEV; goto out; } @@ -715,8 +729,7 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl if (nla_put_u32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id) || nla_put_u32(skb, L2TP_ATTR_SESSION_ID, session->session_id) || nla_put_u32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id) || - nla_put_u32(skb, L2TP_ATTR_PEER_SESSION_ID, - session->peer_session_id) || + nla_put_u32(skb, L2TP_ATTR_PEER_SESSION_ID, session->peer_session_id) || nla_put_u32(skb, L2TP_ATTR_DEBUG, session->debug) || nla_put_u16(skb, L2TP_ATTR_PW_TYPE, session->pwtype)) goto nla_put_failure; @@ -724,11 +737,9 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl if ((session->ifname[0] && nla_put_string(skb, L2TP_ATTR_IFNAME, session->ifname)) || (session->cookie_len && - nla_put(skb, L2TP_ATTR_COOKIE, session->cookie_len, - &session->cookie[0])) || + nla_put(skb, L2TP_ATTR_COOKIE, session->cookie_len, session->cookie)) || (session->peer_cookie_len && - nla_put(skb, L2TP_ATTR_PEER_COOKIE, session->peer_cookie_len, - &session->peer_cookie[0])) || + nla_put(skb, L2TP_ATTR_PEER_COOKIE, session->peer_cookie_len, session->peer_cookie)) || nla_put_u8(skb, L2TP_ATTR_RECV_SEQ, session->recv_seq) || nla_put_u8(skb, L2TP_ATTR_SEND_SEQ, session->send_seq) || nla_put_u8(skb, L2TP_ATTR_LNS_MODE, session->lns_mode) || @@ -740,7 +751,7 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl goto nla_put_failure; nest = nla_nest_start_noflag(skb, L2TP_ATTR_STATS); - if (nest == NULL) + if (!nest) goto nla_put_failure; if (nla_put_u64_64bit(skb, L2TP_ATTR_TX_PACKETS, @@ -785,7 +796,7 @@ static int l2tp_nl_cmd_session_get(struct sk_buff *skb, struct genl_info *info) int ret; session = l2tp_nl_session_get(info); - if (session == NULL) { + if (!session) { ret = -ENODEV; goto err; } @@ -824,14 +835,14 @@ static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback int si = cb->args[1]; for (;;) { - if (tunnel == NULL) { + if (!tunnel) { tunnel = l2tp_tunnel_get_nth(net, ti); - if (tunnel == NULL) + if (!tunnel) goto out; } session = l2tp_session_get_nth(tunnel, si); - if (session == NULL) { + if (!session) { ti++; l2tp_tunnel_dec_refcount(tunnel); tunnel = NULL; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index c54cb59593ef..13c3153b40d6 100644 --- 
a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -117,8 +117,7 @@ struct pppol2tp_session { int owner; /* pid that opened the socket */ struct mutex sk_lock; /* Protects .sk */ - struct sock __rcu *sk; /* Pointer to the session - * PPPoX socket */ + struct sock __rcu *sk; /* Pointer to the session PPPoX socket */ struct sock *__sk; /* Copy of .sk, for cleanup */ struct rcu_head rcu; /* For asynchronous release */ }; @@ -155,17 +154,20 @@ static inline struct l2tp_session *pppol2tp_sock_to_session(struct sock *sk) { struct l2tp_session *session; - if (sk == NULL) + if (!sk) return NULL; sock_hold(sk); session = (struct l2tp_session *)(sk->sk_user_data); - if (session == NULL) { + if (!session) { + sock_put(sk); + goto out; + } + if (WARN_ON(session->magic != L2TP_SESSION_MAGIC)) { + session = NULL; sock_put(sk); goto out; } - - BUG_ON(session->magic != L2TP_SESSION_MAGIC); out: return session; @@ -218,7 +220,7 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int */ rcu_read_lock(); sk = rcu_dereference(ps->sk); - if (sk == NULL) + if (!sk) goto no_sock; /* If the first two bytes are 0xFF03, consider that it is the PPP's @@ -286,7 +288,7 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, /* Get session and tunnel contexts */ error = -EBADF; session = pppol2tp_sock_to_session(sk); - if (session == NULL) + if (!session) goto error; tunnel = session->tunnel; @@ -351,7 +353,7 @@ error: */ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) { - struct sock *sk = (struct sock *) chan->private; + struct sock *sk = (struct sock *)chan->private; struct l2tp_session *session; struct l2tp_tunnel *tunnel; int uhlen, headroom; @@ -361,7 +363,7 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) /* Get session and tunnel contexts from the socket */ session = pppol2tp_sock_to_session(sk); - if (session == NULL) + if (!session) goto abort; tunnel = session->tunnel; @@ -420,7 +422,8 @@ static void pppol2tp_session_destruct(struct sock *sk) if (session) { sk->sk_user_data = NULL; - BUG_ON(session->magic != L2TP_SESSION_MAGIC); + if (WARN_ON(session->magic != L2TP_SESSION_MAGIC)) + return; l2tp_session_dec_refcount(session); } } @@ -704,7 +707,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, * tunnel id. */ if (!info.session_id && !info.peer_session_id) { - if (tunnel == NULL) { + if (!tunnel) { struct l2tp_tunnel_cfg tcfg = { .encap = L2TP_ENCAPTYPE_UDP, .debug = 0, @@ -739,11 +742,11 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, } else { /* Error if we can't find the tunnel */ error = -ENOENT; - if (tunnel == NULL) + if (!tunnel) goto end; /* Error if socket is not prepped */ - if (tunnel->sock == NULL) + if (!tunnel->sock) goto end; } @@ -803,8 +806,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, * the internal context for use by ioctl() and sockopt() * handlers. 
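 * Such tunnel-management sockets skip PPP channel registration,
 * hence the out_no_ppp label below.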
*/ - if ((session->session_id == 0) && - (session->peer_session_id == 0)) { + if (session->session_id == 0 && session->peer_session_id == 0) { error = 0; goto out_no_ppp; } @@ -912,22 +914,23 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, struct pppol2tp_session *pls; error = -ENOTCONN; - if (sk == NULL) + if (!sk) goto end; if (!(sk->sk_state & PPPOX_CONNECTED)) goto end; error = -EBADF; session = pppol2tp_sock_to_session(sk); - if (session == NULL) + if (!session) goto end; pls = l2tp_session_priv(session); tunnel = session->tunnel; inet = inet_sk(tunnel->sock); - if ((tunnel->version == 2) && (tunnel->sock->sk_family == AF_INET)) { + if (tunnel->version == 2 && tunnel->sock->sk_family == AF_INET) { struct sockaddr_pppol2tp sp; + len = sizeof(sp); memset(&sp, 0, len); sp.sa_family = AF_PPPOX; @@ -943,8 +946,7 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, sp.pppol2tp.addr.sin_addr.s_addr = inet->inet_daddr; memcpy(uaddr, &sp, len); #if IS_ENABLED(CONFIG_IPV6) - } else if ((tunnel->version == 2) && - (tunnel->sock->sk_family == AF_INET6)) { + } else if (tunnel->version == 2 && tunnel->sock->sk_family == AF_INET6) { struct sockaddr_pppol2tpin6 sp; len = sizeof(sp); @@ -962,8 +964,7 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, memcpy(&sp.pppol2tp.addr.sin6_addr, &tunnel->sock->sk_v6_daddr, sizeof(tunnel->sock->sk_v6_daddr)); memcpy(uaddr, &sp, len); - } else if ((tunnel->version == 3) && - (tunnel->sock->sk_family == AF_INET6)) { + } else if (tunnel->version == 3 && tunnel->sock->sk_family == AF_INET6) { struct sockaddr_pppol2tpv3in6 sp; len = sizeof(sp); @@ -984,6 +985,7 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, #endif } else if (tunnel->version == 3) { struct sockaddr_pppol2tpv3 sp; + len = sizeof(sp); memset(&sp, 0, len); sp.sa_family = AF_PPPOX; @@ -1178,7 +1180,7 @@ static int pppol2tp_session_setsockopt(struct sock *sk, switch (optname) { case PPPOL2TP_SO_RECVSEQ: - if ((val != 0) && (val != 1)) { + if (val != 0 && val != 1) { err = -EINVAL; break; } @@ -1189,7 +1191,7 @@ static int pppol2tp_session_setsockopt(struct sock *sk, break; case PPPOL2TP_SO_SENDSEQ: - if ((val != 0) && (val != 1)) { + if (val != 0 && val != 1) { err = -EINVAL; break; } @@ -1207,7 +1209,7 @@ static int pppol2tp_session_setsockopt(struct sock *sk, break; case PPPOL2TP_SO_LNSMODE: - if ((val != 0) && (val != 1)) { + if (val != 0 && val != 1) { err = -EINVAL; break; } @@ -1244,7 +1246,7 @@ static int pppol2tp_session_setsockopt(struct sock *sk, * session or the special tunnel type. 
*/ static int pppol2tp_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct l2tp_session *session; @@ -1258,23 +1260,22 @@ static int pppol2tp_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; err = -ENOTCONN; - if (sk->sk_user_data == NULL) + if (!sk->sk_user_data) goto end; /* Get session context from the socket */ err = -EBADF; session = pppol2tp_sock_to_session(sk); - if (session == NULL) + if (!session) goto end; /* Special case: if session_id == 0x0000, treat as operation on tunnel */ - if ((session->session_id == 0) && - (session->peer_session_id == 0)) { + if (session->session_id == 0 && session->peer_session_id == 0) { tunnel = session->tunnel; err = pppol2tp_tunnel_setsockopt(sk, tunnel, optname, val); } else { @@ -1343,7 +1344,7 @@ static int pppol2tp_session_getsockopt(struct sock *sk, break; case PPPOL2TP_SO_REORDERTO: - *val = (int) jiffies_to_msecs(session->reorder_timeout); + *val = (int)jiffies_to_msecs(session->reorder_timeout); l2tp_info(session, L2TP_MSG_CONTROL, "%s: get reorder_timeout=%d\n", session->name, *val); break; @@ -1381,18 +1382,17 @@ static int pppol2tp_getsockopt(struct socket *sock, int level, int optname, return -EINVAL; err = -ENOTCONN; - if (sk->sk_user_data == NULL) + if (!sk->sk_user_data) goto end; /* Get the session context */ err = -EBADF; session = pppol2tp_sock_to_session(sk); - if (session == NULL) + if (!session) goto end; /* Special case: if session_id == 0x0000, treat as operation on tunnel */ - if ((session->session_id == 0) && - (session->peer_session_id == 0)) { + if (session->session_id == 0 && session->peer_session_id == 0) { tunnel = session->tunnel; err = pppol2tp_tunnel_getsockopt(sk, tunnel, optname, &val); if (err) @@ -1407,7 +1407,7 @@ static int pppol2tp_getsockopt(struct socket *sock, int level, int optname, if (put_user(len, optlen)) goto end_put_sess; - if (copy_to_user((void __user *) optval, &val, len)) + if (copy_to_user((void __user *)optval, &val, len)) goto end_put_sess; err = 0; @@ -1463,7 +1463,7 @@ static void pppol2tp_next_session(struct net *net, struct pppol2tp_seq_data *pd) pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx); pd->session_idx++; - if (pd->session == NULL) { + if (!pd->session) { pd->session_idx = 0; pppol2tp_next_tunnel(net, pd); } @@ -1478,17 +1478,21 @@ static void *pppol2tp_seq_start(struct seq_file *m, loff_t *offs) if (!pos) goto out; - BUG_ON(m->private == NULL); + if (WARN_ON(!m->private)) { + pd = NULL; + goto out; + } + pd = m->private; net = seq_file_net(m); - if (pd->tunnel == NULL) + if (!pd->tunnel) pppol2tp_next_tunnel(net, pd); else pppol2tp_next_session(net, pd); /* NULL tunnel and session indicates end of list */ - if ((pd->tunnel == NULL) && (pd->session == NULL)) + if (!pd->tunnel && !pd->session) pd = NULL; out: @@ -1551,6 +1555,7 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) if (tunnel->sock) { struct inet_sock *inet = inet_sk(tunnel->sock); + ip = ntohl(inet->inet_saddr); port = ntohs(inet->inet_sport); } @@ -1564,8 +1569,7 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) user_data_ok = 'N'; } - seq_printf(m, " SESSION '%s' %08X/%d %04X/%04X -> " - "%04X/%04X %d %c\n", + seq_printf(m, " SESSION '%s' %08X/%d %04X/%04X -> %04X/%04X %d %c\n", 
session->name, ip, port, tunnel->tunnel_id, session->session_id, @@ -1604,8 +1608,7 @@ static int pppol2tp_seq_show(struct seq_file *m, void *v) seq_puts(m, "PPPoL2TP driver info, " PPPOL2TP_DRV_VERSION "\n"); seq_puts(m, "TUNNEL name, user-data-ok session-count\n"); seq_puts(m, " debug tx-pkts/bytes/errs rx-pkts/bytes/errs\n"); - seq_puts(m, " SESSION name, addr/port src-tid/sid " - "dest-tid/sid state user-data-ok\n"); + seq_puts(m, " SESSION name, addr/port src-tid/sid dest-tid/sid state user-data-ok\n"); seq_puts(m, " mtu/mru/rcvseq/sendseq/lns debug reorderto\n"); seq_puts(m, " nr/ns tx-pkts/bytes/errs rx-pkts/bytes/errs\n"); goto out; @@ -1638,7 +1641,7 @@ static __net_init int pppol2tp_init_net(struct net *net) int err = 0; pde = proc_create_net("pppol2tp", 0444, net->proc_net, - &pppol2tp_seq_ops, sizeof(struct pppol2tp_seq_data)); + &pppol2tp_seq_ops, sizeof(struct pppol2tp_seq_data)); if (!pde) { err = -ENOMEM; goto out; diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index f35899d45a9a..e71ca5aec684 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -9,6 +9,99 @@ #include <net/fib_rules.h> #include <net/l3mdev.h> +static DEFINE_SPINLOCK(l3mdev_lock); + +struct l3mdev_handler { + lookup_by_table_id_t dev_lookup; +}; + +static struct l3mdev_handler l3mdev_handlers[L3MDEV_TYPE_MAX + 1]; + +static int l3mdev_check_type(enum l3mdev_type l3type) +{ + if (l3type <= L3MDEV_TYPE_UNSPEC || l3type > L3MDEV_TYPE_MAX) + return -EINVAL; + + return 0; +} + +int l3mdev_table_lookup_register(enum l3mdev_type l3type, + lookup_by_table_id_t fn) +{ + struct l3mdev_handler *hdlr; + int res; + + res = l3mdev_check_type(l3type); + if (res) + return res; + + hdlr = &l3mdev_handlers[l3type]; + + spin_lock(&l3mdev_lock); + + if (hdlr->dev_lookup) { + res = -EBUSY; + goto unlock; + } + + hdlr->dev_lookup = fn; + res = 0; + +unlock: + spin_unlock(&l3mdev_lock); + + return res; +} +EXPORT_SYMBOL_GPL(l3mdev_table_lookup_register); + +void l3mdev_table_lookup_unregister(enum l3mdev_type l3type, + lookup_by_table_id_t fn) +{ + struct l3mdev_handler *hdlr; + + if (l3mdev_check_type(l3type)) + return; + + hdlr = &l3mdev_handlers[l3type]; + + spin_lock(&l3mdev_lock); + + if (hdlr->dev_lookup == fn) + hdlr->dev_lookup = NULL; + + spin_unlock(&l3mdev_lock); +} +EXPORT_SYMBOL_GPL(l3mdev_table_lookup_unregister); + +int l3mdev_ifindex_lookup_by_table_id(enum l3mdev_type l3type, + struct net *net, u32 table_id) +{ + lookup_by_table_id_t lookup; + struct l3mdev_handler *hdlr; + int ifindex = -EINVAL; + int res; + + res = l3mdev_check_type(l3type); + if (res) + return res; + + hdlr = &l3mdev_handlers[l3type]; + + spin_lock(&l3mdev_lock); + + lookup = hdlr->dev_lookup; + if (!lookup) + goto unlock; + + ifindex = lookup(net, table_id); + +unlock: + spin_unlock(&l3mdev_lock); + + return ifindex; +} +EXPORT_SYMBOL_GPL(l3mdev_ifindex_lookup_by_table_id); + /** * l3mdev_master_ifindex - get index of L3 master device * @dev: targeted interface diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 6e53e43c1907..7180979114e4 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -984,7 +984,6 @@ out: * llc_ui_getname - return the address info of a socket * @sock: Socket to get address of. * @uaddr: Address structure to return information. - * @uaddrlen: Length of address structure. * @peer: Does user want local or remote address information. * * Return the address information of a socket. 
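
/* The next hunk continues this series' conversion of setsockopt handlers
 * from `char __user *optval` to sockptr_t, so one handler serves both user
 * and kernel buffers. A minimal sketch of the pattern (hypothetical handler,
 * not part of this patch):
 */
static int example_setsockopt(struct socket *sock, int level, int optname,
			      sockptr_t optval, unsigned int optlen)
{
	int val;

	if (optlen < sizeof(int))
		return -EINVAL;

	/* copy_from_sockptr() replaces get_user()/copy_from_user() and
	 * works whether optval wraps a user or a kernel pointer
	 */
	if (copy_from_sockptr(&val, optval, sizeof(val)))
		return -EFAULT;

	/* ... validate and apply val ... */
	return 0;
}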
@@ -1054,7 +1053,7 @@ static int llc_ui_ioctl(struct socket *sock, unsigned int cmd, * Set various connection specific parameters. */ static int llc_ui_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); @@ -1064,7 +1063,7 @@ static int llc_ui_setsockopt(struct socket *sock, int level, int optname, lock_sock(sk); if (unlikely(level != SOL_LLC || optlen != sizeof(int))) goto out; - rc = get_user(opt, (int __user *)optval); + rc = copy_from_sockptr(&opt, optval, sizeof(opt)); if (rc) goto out; rc = -EINVAL; diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 7b620acaca9e..1144cda2a0fc 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -284,8 +284,8 @@ out:; /** * llc_conn_remove_acked_pdus - Removes acknowledged pdus from tx queue * @sk: active connection - * nr: NR - * how_many_unacked: size of pdu_unack_q after removing acked pdus + * @nr: NR + * @how_many_unacked: size of pdu_unack_q after removing acked pdus * * Removes acknowledged pdus from transmit queue (pdu_unack_q). Returns * the number of pdus that removed from queue. @@ -906,6 +906,7 @@ static void llc_sk_init(struct sock *sk) /** * llc_sk_alloc - Allocates LLC sock + * @net: network namespace * @family: upper layer protocol family * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * @@ -951,7 +952,7 @@ void llc_sk_stop_all_timers(struct sock *sk, bool sync) /** * llc_sk_free - Frees a LLC socket - * @sk - socket to free + * @sk: - socket to free * * Frees a LLC socket */ diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index 82cb93f66b9b..c309b72a5877 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c @@ -144,6 +144,7 @@ static inline int llc_fixup_skb(struct sk_buff *skb) * @skb: received pdu * @dev: device that receive pdu * @pt: packet type + * @orig_dev: the original receive net device * * When the system receives a 802.2 frame this function is called. It * checks SAP and connection of received pdu and passes frame to diff --git a/net/llc/llc_pdu.c b/net/llc/llc_pdu.c index 2e6cb79196bb..792d195c8bae 100644 --- a/net/llc/llc_pdu.c +++ b/net/llc/llc_pdu.c @@ -25,7 +25,7 @@ void llc_pdu_set_cmd_rsp(struct sk_buff *skb, u8 pdu_type) /** * pdu_set_pf_bit - sets poll/final bit in LLC header - * @pdu_frame: input frame that p/f bit must be set into it. + * @skb: Frame to set bit in * @bit_value: poll/final bit (0 or 1). * * This function sets poll/final bit in LLC header (based on type of PDU). diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index be419062e19a..6805ce43a055 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -37,6 +37,7 @@ static int llc_mac_header_len(unsigned short devtype) /** * llc_alloc_frame - allocates sk_buff for frame + * @sk: socket to allocate frame to * @dev: network device this skb will be sent over * @type: pdu type to allocate * @data_size: data size to allocate @@ -273,6 +274,7 @@ void llc_build_and_send_xid_pkt(struct llc_sap *sap, struct sk_buff *skb, * llc_sap_rcv - sends received pdus to the sap state machine * @sap: current sap component structure. * @skb: received frame. + * @sk: socket to associate to frame * * Sends received pdus to the sap state machine. */ @@ -379,6 +381,7 @@ static void llc_do_mcast(struct llc_sap *sap, struct sk_buff *skb, * llc_sap_mcast - Deliver multicast PDU's to all matching datagram sockets. 
* @sap: SAP * @laddr: address of local LLC (MAC + SAP) + * @skb: PDU to deliver * * Search socket list of the SAP and finds connections with same sap. * Deliver clone to each. diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 7f245e9f114c..313ba97acae3 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -477,7 +477,7 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, size_t len) { u16 capab, tid, timeout, ba_policy, buf_size, start_seq_num; - struct ieee802_11_elems elems = { 0 }; + struct ieee802_11_elems elems = { }; u8 dialog_token; int ies_len; diff --git a/net/mac80211/airtime.c b/net/mac80211/airtime.c index 9fc2968856c0..366f76c9003d 100644 --- a/net/mac80211/airtime.c +++ b/net/mac80211/airtime.c @@ -551,7 +551,7 @@ EXPORT_SYMBOL_GPL(ieee80211_calc_tx_airtime); u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *pubsta, - int len) + int len, bool ampdu) { struct ieee80211_supported_band *sband; struct ieee80211_chanctx_conf *conf; @@ -572,10 +572,26 @@ u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw, if (pubsta) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); - - return ieee80211_calc_tx_airtime_rate(hw, - &sta->tx_stats.last_rate, - band, len); + struct ieee80211_tx_rate *rate = &sta->tx_stats.last_rate; + u32 airtime; + + if (!(rate->flags & (IEEE80211_TX_RC_VHT_MCS | + IEEE80211_TX_RC_MCS))) + ampdu = false; + + /* + * Assume that HT/VHT transmission on any AC except VO will + * use aggregation. Since we don't have reliable reporting + * of aggregation length, assume an average of 16. + * This will not be very accurate, but much better than simply + * assuming un-aggregated tx. + */ + airtime = ieee80211_calc_tx_airtime_rate(hw, rate, band, + ampdu ? 
len * 16 : len); + if (ampdu) + airtime /= 16; + + return airtime; } if (!conf) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 1079a07e43e4..87fddd84c621 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -608,12 +608,12 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, case WLAN_CIPHER_SUITE_BIP_CMAC_256: BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != offsetof(typeof(kseq), aes_cmac)); - /* fall through */ + fallthrough; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != offsetof(typeof(kseq), aes_gmac)); - /* fall through */ + fallthrough; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != @@ -991,9 +991,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, BSS_CHANGED_SSID | BSS_CHANGED_P2P_PS | BSS_CHANGED_TXPOWER | - BSS_CHANGED_TWT | - BSS_CHANGED_HE_OBSS_PD | - BSS_CHANGED_HE_BSS_COLOR; + BSS_CHANGED_TWT; int i, err; int prev_beacon_int; @@ -1019,6 +1017,10 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, sdata->vif.bss_conf.frame_time_rts_th = le32_get_bits(params->he_oper->he_oper_params, IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK); + changed |= BSS_CHANGED_HE_OBSS_PD; + + if (params->he_bss_color.enabled) + changed |= BSS_CHANGED_HE_BSS_COLOR; } mutex_lock(&local->mtx); @@ -2126,6 +2128,11 @@ static int ieee80211_update_mesh_config(struct wiphy *wiphy, if (_chg_mesh_attr(NL80211_MESHCONF_CONNECTED_TO_GATE, mask)) conf->dot11MeshConnectedToMeshGate = nconf->dot11MeshConnectedToMeshGate; + if (_chg_mesh_attr(NL80211_MESHCONF_NOLEARN, mask)) + conf->dot11MeshNolearn = nconf->dot11MeshNolearn; + if (_chg_mesh_attr(NL80211_MESHCONF_CONNECTED_TO_AS, mask)) + conf->dot11MeshConnectedToAuthServer = + nconf->dot11MeshConnectedToAuthServer; ieee80211_mbss_info_change_notify(sdata, BSS_CHANGED_BEACON); return 0; } @@ -2337,7 +2344,7 @@ static int ieee80211_scan(struct wiphy *wiphy, * for now fall through to allow scanning only when * beaconing hasn't been configured yet */ - /* fall through */ + fallthrough; case NL80211_IFTYPE_AP: /* * If the scan has been forced (and the driver supports @@ -3584,7 +3591,7 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, } local_bh_disable(); - ieee80211_xmit(sdata, sta, skb, 0); + ieee80211_xmit(sdata, sta, skb); local_bh_enable(); ret = 0; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index e6e192f53e4e..bdc0f29dc6cd 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -313,9 +313,14 @@ void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, lockdep_assert_held(&local->chanctx_mtx); - /* don't optimize 5MHz, 10MHz, and radar_enabled confs */ + /* don't optimize non-20MHz based and radar_enabled confs */ if (ctx->conf.def.width == NL80211_CHAN_WIDTH_5 || ctx->conf.def.width == NL80211_CHAN_WIDTH_10 || + ctx->conf.def.width == NL80211_CHAN_WIDTH_1 || + ctx->conf.def.width == NL80211_CHAN_WIDTH_2 || + ctx->conf.def.width == NL80211_CHAN_WIDTH_4 || + ctx->conf.def.width == NL80211_CHAN_WIDTH_8 || + ctx->conf.def.width == NL80211_CHAN_WIDTH_16 || ctx->conf.radar_enabled) { ctx->conf.min_def = ctx->conf.def; return; @@ -743,7 +748,7 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, default: WARN_ONCE(1, "Invalid SMPS mode %d\n", sdata->smps_mode); - /* fall through */ + fallthrough; case IEEE80211_SMPS_OFF: needed_static = sdata->needed_rx_chains; 
needed_dynamic = sdata->needed_rx_chains; diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index d7e955127d5c..fe8a7a87e513 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -638,6 +638,9 @@ IEEE80211_IF_FILE(dot11MeshAwakeWindowDuration, u.mesh.mshcfg.dot11MeshAwakeWindowDuration, DEC); IEEE80211_IF_FILE(dot11MeshConnectedToMeshGate, u.mesh.mshcfg.dot11MeshConnectedToMeshGate, DEC); +IEEE80211_IF_FILE(dot11MeshNolearn, u.mesh.mshcfg.dot11MeshNolearn, DEC); +IEEE80211_IF_FILE(dot11MeshConnectedToAuthServer, + u.mesh.mshcfg.dot11MeshConnectedToAuthServer, DEC); #endif #define DEBUGFS_ADD_MODE(name, mode) \ @@ -762,6 +765,8 @@ static void add_mesh_config(struct ieee80211_sub_if_data *sdata) MESHPARAMS_ADD(power_mode); MESHPARAMS_ADD(dot11MeshAwakeWindowDuration); MESHPARAMS_ADD(dot11MeshConnectedToMeshGate); + MESHPARAMS_ADD(dot11MeshNolearn); + MESHPARAMS_ADD(dot11MeshConnectedToAuthServer); #undef MESHPARAMS_ADD } #endif diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index de69fc9c4f07..41d495d73d3a 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -12,12 +12,11 @@ #include "ieee80211_i.h" #include "trace.h" -static inline bool check_sdata_in_driver(struct ieee80211_sub_if_data *sdata) -{ - return !WARN(!(sdata->flags & IEEE80211_SDATA_IN_DRIVER), - "%s: Failed check-sdata-in-driver check, flags: 0x%x\n", - sdata->dev ? sdata->dev->name : sdata->name, sdata->flags); -} +#define check_sdata_in_driver(sdata) ({ \ + !WARN_ONCE(!(sdata->flags & IEEE80211_SDATA_IN_DRIVER), \ + "%s: Failed check-sdata-in-driver check, flags: 0x%x\n", \ + sdata->dev ? sdata->dev->name : sdata->name, sdata->flags); \ +}) static inline struct ieee80211_sub_if_data * get_bss_sdata(struct ieee80211_sub_if_data *sdata) diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index e32906202575..3d62a80b5790 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -250,7 +250,7 @@ bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata, switch (sdata->vif.bss_conf.chandef.width) { default: WARN_ON_ONCE(1); - /* fall through */ + fallthrough; case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: bw = IEEE80211_STA_RX_BW_20; @@ -517,7 +517,7 @@ int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, case IEEE80211_SMPS_AUTOMATIC: case IEEE80211_SMPS_NUM_MODES: WARN_ON(1); - /* fall through */ + fallthrough; case IEEE80211_SMPS_OFF: action_frame->u.action.u.ht_smps.smps_control = WLAN_HT_SMPS_CONTROL_DISABLED; diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 81d26fef41e9..53632c2f5217 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -791,7 +791,7 @@ ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata, case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_20_NOHT: sta_flags |= IEEE80211_STA_DISABLE_HT; - /* fall through */ + fallthrough; case NL80211_CHAN_WIDTH_20: sta_flags |= IEEE80211_STA_DISABLE_40MHZ; break; @@ -1401,7 +1401,7 @@ ieee80211_ibss_setup_scan_channels(struct wiphy *wiphy, break; case NL80211_CHAN_WIDTH_80P80: cf2 = chandef->center_freq2; - /* fall through */ + fallthrough; case NL80211_CHAN_WIDTH_80: width = 80; break; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index ec1a71ac65f2..0b1eaec6649f 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -164,7 +164,6 @@ typedef unsigned __bitwise ieee80211_tx_result; #define TX_DROP ((__force ieee80211_tx_result) 1u) #define TX_QUEUED 
((__force ieee80211_tx_result) 2u) -#define IEEE80211_TX_NO_SEQNO BIT(0) #define IEEE80211_TX_UNICAST BIT(1) #define IEEE80211_TX_PS_BUFFERED BIT(2) @@ -218,7 +217,7 @@ enum ieee80211_rx_flags { }; struct ieee80211_rx_data { - struct napi_struct *napi; + struct list_head *list; struct sk_buff *skb; struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; @@ -1967,12 +1966,11 @@ void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata, void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata, bool bss_notify, bool enable_qos); void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, struct sk_buff *skb, - u32 txdata_flags); + struct sta_info *sta, struct sk_buff *skb); void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, - enum nl80211_band band, u32 txdata_flags); + enum nl80211_band band); /* sta_out needs to be checked for ERR_PTR() before using */ int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, @@ -1982,10 +1980,10 @@ int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, static inline void ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, - enum nl80211_band band, u32 txdata_flags) + enum nl80211_band band) { rcu_read_lock(); - __ieee80211_tx_skb_tid_band(sdata, skb, tid, band, txdata_flags); + __ieee80211_tx_skb_tid_band(sdata, skb, tid, band); rcu_read_unlock(); } @@ -2003,7 +2001,7 @@ static inline void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata, } __ieee80211_tx_skb_tid_band(sdata, skb, tid, - chanctx_conf->def.chan->band, 0); + chanctx_conf->def.chan->band); rcu_read_unlock(); } @@ -2290,7 +2288,7 @@ extern const struct ethtool_ops ieee80211_ethtool_ops; u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *pubsta, - int len); + int len, bool ampdu); #ifdef CONFIG_MAC80211_NOINLINE #define debug_noinline noinline #else diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index f900c84fb40f..9740ae8fa697 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -978,7 +978,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, case NL80211_IFTYPE_P2P_DEVICE: /* relies on synchronize_rcu() below */ RCU_INIT_POINTER(local->p2p_sdata, NULL); - /* fall through */ + fallthrough; default: cancel_work_sync(&sdata->work); /* @@ -1048,7 +1048,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)) break; - /* fall through */ + fallthrough; default: if (going_down) drv_remove_interface(local, sdata); @@ -1183,17 +1183,24 @@ static u16 ieee80211_monitor_select_queue(struct net_device *dev, { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr; - struct ieee80211_radiotap_header *rtap = (void *)skb->data; + int len_rthdr; if (local->hw.queues < IEEE80211_NUM_ACS) return 0; - if (skb->len < 4 || - skb->len < le16_to_cpu(rtap->it_len) + 2 /* frame control */) + /* reset flags and info before parsing radiotap header */ + memset(info, 0, sizeof(*info)); + + if (!ieee80211_parse_tx_radiotap(skb, dev)) return 0; /* doesn't matter, frame will be dropped */ - hdr = (void *)((u8 *)skb->data + le16_to_cpu(rtap->it_len)); + len_rthdr = ieee80211_get_radiotap_len(skb->data); + hdr = (struct ieee80211_hdr *)(skb->data + 
len_rthdr); + if (skb->len < len_rthdr + 2 || + skb->len < len_rthdr + ieee80211_hdrlen(hdr->frame_control)) + return 0; /* doesn't matter, frame will be dropped */ return ieee80211_select_queue_80211(sdata, skb, hdr); } @@ -1497,7 +1504,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, type = NL80211_IFTYPE_AP; sdata->vif.type = type; sdata->vif.p2p = true; - /* fall through */ + fallthrough; case NL80211_IFTYPE_AP: skb_queue_head_init(&sdata->u.ap.ps.bc_buf); INIT_LIST_HEAD(&sdata->u.ap.vlans); @@ -1507,7 +1514,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, type = NL80211_IFTYPE_STATION; sdata->vif.type = type; sdata->vif.p2p = true; - /* fall through */ + fallthrough; case NL80211_IFTYPE_STATION: sdata->vif.bss_conf.bssid = sdata->u.mgd.bssid; ieee80211_sta_setup_sdata(sdata); @@ -1703,7 +1710,7 @@ static void ieee80211_assign_perm_addr(struct ieee80211_local *local, goto out_unlock; } } - /* fall through */ + fallthrough; default: /* assign a new address if possible -- try n_addresses first */ for (i = 0; i < local->hw.wiphy->n_addresses; i++) { diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 8f403c1bb908..9c2888004878 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -225,7 +225,7 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) */ if (sdata->hw_80211_encap) return -EINVAL; - /* Fall through */ + fallthrough; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index e88beb3ff6db..7ecd801a943b 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -260,6 +260,7 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata, bool is_connected_to_gate = ifmsh->num_gates > 0 || ifmsh->mshcfg.dot11MeshGateAnnouncementProtocol || ifmsh->mshcfg.dot11MeshConnectedToMeshGate; + bool is_connected_to_as = ifmsh->mshcfg.dot11MeshConnectedToAuthServer; if (skb_tailroom(skb) < 2 + meshconf_len) return -ENOMEM; @@ -284,7 +285,9 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata, /* Mesh Formation Info - number of neighbors */ neighbors = atomic_read(&ifmsh->estab_plinks); neighbors = min_t(int, neighbors, IEEE80211_MAX_MESH_PEERINGS); - *pos++ = (neighbors << 1) | is_connected_to_gate; + *pos++ = (is_connected_to_as << 7) | + (neighbors << 1) | + is_connected_to_gate; /* Mesh capability */ *pos = 0x00; *pos |= ifmsh->mshcfg.dot11MeshForwarding ? @@ -1107,10 +1110,10 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata, switch (sdata->vif.bss_conf.chandef.width) { case NL80211_CHAN_WIDTH_20_NOHT: sta_flags |= IEEE80211_STA_DISABLE_HT; - /* fall through */ + fallthrough; case NL80211_CHAN_WIDTH_20: sta_flags |= IEEE80211_STA_DISABLE_40MHZ; - /* fall through */ + fallthrough; case NL80211_CHAN_WIDTH_40: sta_flags |= IEEE80211_STA_DISABLE_VHT; break; diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 02cde0fd08fe..bec23d2eee7a 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -1173,6 +1173,40 @@ int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata, } /** + * mesh_nexthop_lookup_nolearn - try to set next hop without path discovery + * @skb: 802.11 frame to be sent + * @sdata: network subif the frame will be sent through + * + * Check if the meshDA (addr3) of a unicast frame is a direct neighbor. + * And if so, set the RA (addr1) to it to transmit to this node directly, + * avoiding PREQ/PREP path discovery. 
+ * + * Returns: 0 if the next hop was found and -ENOENT otherwise. + */ +static int mesh_nexthop_lookup_nolearn(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; + struct sta_info *sta; + + if (is_multicast_ether_addr(hdr->addr1)) + return -ENOENT; + + rcu_read_lock(); + sta = sta_info_get(sdata, hdr->addr3); + + if (!sta || sta->mesh->plink_state != NL80211_PLINK_ESTAB) { + rcu_read_unlock(); + return -ENOENT; + } + rcu_read_unlock(); + + memcpy(hdr->addr1, hdr->addr3, ETH_ALEN); + memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); + return 0; +} + +/** * mesh_nexthop_lookup - put the appropriate next hop on a mesh frame. Calling * this function is considered "using" the associated mpath, so preempt a path * refresh if this mpath expires soon. @@ -1185,11 +1219,16 @@ int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata, int mesh_nexthop_lookup(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct mesh_path *mpath; struct sta_info *next_hop; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; u8 *target_addr = hdr->addr3; + if (ifmsh->mshcfg.dot11MeshNolearn && + !mesh_nexthop_lookup_nolearn(sdata, skb)) + return 0; + mpath = mesh_path_lookup(sdata, target_addr); if (!mpath || !(mpath->flags & MESH_PATH_ACTIVE)) return -ENOENT; @@ -1266,7 +1305,7 @@ void mesh_path_tx_root_frame(struct ieee80211_sub_if_data *sdata) break; case IEEE80211_PROACTIVE_PREQ_WITH_PREP: flags |= IEEE80211_PREQ_PROACTIVE_PREP_FLAG; - /* fall through */ + fallthrough; case IEEE80211_PROACTIVE_PREQ_NO_PREP: interval = ifmsh->mshcfg.dot11MeshHWMPactivePathToRootTimeout; target_flags |= IEEE80211_PREQ_TO_FLAG | diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index aca608ae313f..48f31ac9233c 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -72,7 +72,6 @@ static void mesh_table_free(struct mesh_table *tbl) } /** - * * mesh_path_assign_nexthop - update mesh path next hop * * @mpath: mesh path to update @@ -140,7 +139,6 @@ static void prepare_for_gate(struct sk_buff *skb, char *dst_addr, } /** - * * mesh_path_move_to_queue - Move or copy frames from one mpath queue to another * * This function is used to transfer or copy frames from an unresolved mpath to @@ -152,7 +150,7 @@ static void prepare_for_gate(struct sk_buff *skb, char *dst_addr, * * The gate mpath must be an active mpath with a valid mpath->next_hop. * - * @mpath: An active mpath the frames will be sent to (i.e. the gate) + * @gate_mpath: An active mpath the frames will be sent to (i.e. the gate) * @from_mpath: The failed mpath * @copy: When true, copy all the frames to the new mpath queue. When false, * move them. 
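The dot11MeshNolearn path added above changes the next-hop decision order: when the knob is set, a unicast frame whose final mesh destination (addr3) is itself an established peer is addressed straight to that peer, and the HWMP path table (with its PREQ/PREP discovery) is consulted only when the shortcut fails. A condensed sketch of that ordering, with peer_is_established() and mpath_lookup() as stubbed stand-ins for the sta_info_get()/plink-state check and mesh_path_lookup():

#include <stdbool.h>
#include <string.h>

#define ETH_ALEN 6

static const unsigned char gate[ETH_ALEN] = { 0x02, 0, 0, 0, 0, 0x01 };

/* stub: pretend only the "gate" node is an established plink peer */
static bool peer_is_established(const unsigned char *addr)
{
	return memcmp(addr, gate, ETH_ALEN) == 0;
}

/* stub: pretend HWMP resolved every destination via the gate */
static const unsigned char *mpath_lookup(const unsigned char *mesh_da)
{
	(void)mesh_da;
	return gate;
}

/* fill ra[] (802.11 addr1) for a frame destined to mesh_da (addr3);
 * returns -1 when the caller must kick off PREQ/PREP path discovery
 */
static int mesh_next_hop(bool nolearn, const unsigned char *mesh_da,
			 unsigned char ra[ETH_ALEN])
{
	const unsigned char *hop;

	if (nolearn && peer_is_established(mesh_da)) {
		memcpy(ra, mesh_da, ETH_ALEN);	/* direct neighbor, no HWMP */
		return 0;
	}

	hop = mpath_lookup(mesh_da);
	if (!hop)
		return -1;

	memcpy(ra, hop, ETH_ALEN);
	return 0;
}

The likely trade-off, reading the diff: skipping discovery saves HWMP management traffic in dense, effectively single-hop meshes, at the cost of ignoring link metrics, so a marginal direct link can be preferred over a better multi-hop path.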
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 798e4b6b383f..15f2fc658f70 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -699,7 +699,7 @@ void mesh_plink_timer(struct timer_list *t) break; } reason = WLAN_REASON_MESH_MAX_RETRIES; - /* fall through */ + fallthrough; case NL80211_PLINK_CNF_RCVD: /* confirm timer */ if (!reason) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index b2a9d47cf86d..ac870309b911 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -533,7 +533,7 @@ static void ieee80211_add_ht_ie(struct ieee80211_sub_if_data *sdata, case IEEE80211_SMPS_AUTOMATIC: case IEEE80211_SMPS_NUM_MODES: WARN_ON(1); - /* fall through */ + fallthrough; case IEEE80211_SMPS_OFF: cap |= WLAN_HT_CAP_SM_PS_DISABLED << IEEE80211_HT_CAP_SM_PS_SHIFT; @@ -1529,7 +1529,7 @@ ieee80211_find_80211h_pwr_constr(struct ieee80211_sub_if_data *sdata, switch (channel->band) { default: WARN_ON_ONCE(1); - /* fall through */ + fallthrough; case NL80211_BAND_2GHZ: case NL80211_BAND_60GHZ: chan_increment = 1; @@ -2988,7 +2988,10 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); if (auth_alg == WLAN_AUTH_SAE && - status_code == WLAN_STATUS_ANTI_CLOG_REQUIRED) + (status_code == WLAN_STATUS_ANTI_CLOG_REQUIRED || + (auth_transaction == 1 && + (status_code == WLAN_STATUS_SAE_HASH_TO_ELEMENT || + status_code == WLAN_STATUS_SAE_PK)))) return; sdata_info(sdata, "%pM denied authentication (status %d)\n", @@ -3460,10 +3463,12 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, bss_conf->he_bss_color.partial = le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR); - bss_conf->he_bss_color.disabled = - le32_get_bits(elems->he_operation->he_oper_params, - IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED); - changed |= BSS_CHANGED_HE_BSS_COLOR; + bss_conf->he_bss_color.enabled = + !le32_get_bits(elems->he_operation->he_oper_params, + IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED); + + if (bss_conf->he_bss_color.enabled) + changed |= BSS_CHANGED_HE_BSS_COLOR; bss_conf->htc_trig_based_pkt_ext = le32_get_bits(elems->he_operation->he_oper_params, @@ -4558,6 +4563,9 @@ static void ieee80211_sta_bcn_mon_timer(struct timer_list *t) if (sdata->vif.csa_active && !ifmgd->csa_waiting_bcn) return; + if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER) + return; + sdata->u.mgd.connection_loss = false; ieee80211_queue_work(&sdata->local->hw, &sdata->u.mgd.beacon_connection_loss_work); diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index db3b8bf75656..f470d1a7ce9b 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -264,7 +264,7 @@ static void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc, if (roc->mgmt_tx_cookie) { if (!WARN_ON(!roc->frame)) { ieee80211_tx_skb_tid_band(roc->sdata, roc->frame, 7, - roc->chan->band, 0); + roc->chan->band); roc->frame = NULL; } } else { @@ -808,13 +808,13 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, if (!sdata->vif.bss_conf.ibss_joined) need_offchan = true; #ifdef CONFIG_MAC80211_MESH - /* fall through */ + fallthrough; case NL80211_IFTYPE_MESH_POINT: if (ieee80211_vif_is_mesh(&sdata->vif) && !sdata->u.mesh.mesh_id_len) need_offchan = true; #endif - /* fall through */ + fallthrough; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_P2P_GO: diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 
5c5af4b5fc08..836cde516a18 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2578,8 +2578,8 @@ static void ieee80211_deliver_skb_to_local_stack(struct sk_buff *skb, memset(skb->cb, 0, sizeof(skb->cb)); /* deliver to local stack */ - if (rx->napi) - napi_gro_receive(rx->napi, skb); + if (rx->list) + list_add_tail(&skb->list, rx->list); else netif_receive_skb(skb); } @@ -3591,7 +3591,7 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) } __ieee80211_tx_skb_tid_band(rx->sdata, nskb, 7, - status->band, 0); + status->band); } dev_kfree_skb(rx->skb); return RX_QUEUED; @@ -3736,7 +3736,7 @@ static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx, I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop); if (rx->sta) rx->sta->rx_stats.dropped++; - /* fall through */ + fallthrough; case RX_CONTINUE: { struct ieee80211_rate *rate = NULL; struct ieee80211_supported_band *sband; @@ -3869,7 +3869,6 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) /* This is OK -- must be QoS data frame */ .security_idx = tid, .seqno_idx = tid, - .napi = NULL, /* must be NULL to not have races */ }; struct tid_ampdu_rx *tid_agg_rx; @@ -4479,8 +4478,8 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, /* deliver to local stack */ skb->protocol = eth_type_trans(skb, fast_rx->dev); memset(skb->cb, 0, sizeof(skb->cb)); - if (rx->napi) - napi_gro_receive(rx->napi, skb); + if (rx->list) + list_add_tail(&skb->list, rx->list); else netif_receive_skb(skb); @@ -4547,7 +4546,7 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx, static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, struct sk_buff *skb, - struct napi_struct *napi) + struct list_head *list) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata; @@ -4562,7 +4561,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, memset(&rx, 0, sizeof(rx)); rx.skb = skb; rx.local = local; - rx.napi = napi; + rx.list = list; if (ieee80211_is_data(fc) || ieee80211_is_mgmt(fc)) I802_DEBUG_INC(local->dot11ReceivedFragmentCount); @@ -4670,8 +4669,8 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, * This is the receive path handler. It is called by a low level driver when an * 802.11 MPDU is received from the hardware. */ -void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, - struct sk_buff *skb, struct napi_struct *napi) +void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, + struct sk_buff *skb, struct list_head *list) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_rate *rate = NULL; @@ -4752,7 +4751,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, break; default: WARN_ON_ONCE(1); - /* fall through */ + fallthrough; case RX_ENC_LEGACY: if (WARN_ON(status->rate_idx >= sband->n_bitrates)) goto drop; @@ -4763,36 +4762,53 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, status->rx_flags = 0; /* - * key references and virtual interfaces are protected using RCU - * and this requires that we are in a read-side RCU section during - * receive processing - */ - rcu_read_lock(); - - /* * Frames with failed FCS/PLCP checksum are not returned, * all other frames are returned without radiotap header * if it was previously present. * Also, frames with less than 16 bytes are dropped. 
*/ skb = ieee80211_rx_monitor(local, skb, rate); - if (!skb) { - rcu_read_unlock(); + if (!skb) return; - } ieee80211_tpt_led_trig_rx(local, ((struct ieee80211_hdr *)skb->data)->frame_control, skb->len); - __ieee80211_rx_handle_packet(hw, pubsta, skb, napi); - - rcu_read_unlock(); + __ieee80211_rx_handle_packet(hw, pubsta, skb, list); return; drop: kfree_skb(skb); } +EXPORT_SYMBOL(ieee80211_rx_list); + +void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, + struct sk_buff *skb, struct napi_struct *napi) +{ + struct sk_buff *tmp; + LIST_HEAD(list); + + + /* + * key references and virtual interfaces are protected using RCU + * and this requires that we are in a read-side RCU section during + * receive processing + */ + rcu_read_lock(); + ieee80211_rx_list(hw, pubsta, skb, &list); + rcu_read_unlock(); + + if (!napi) { + netif_receive_skb_list(&list); + return; + } + + list_for_each_entry_safe(skb, tmp, &list, list) { + skb_list_del_init(skb); + napi_gro_receive(napi, skb); + } +} EXPORT_SYMBOL(ieee80211_rx_napi); /* This is a version of the rx handler that can be called from hard irq diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index ad90bbe57457..5ac2785cdc7b 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -591,7 +591,6 @@ static void ieee80211_send_scan_probe_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel) { struct sk_buff *skb; - u32 txdata_flags = 0; skb = ieee80211_build_probe_req(sdata, src, dst, ratemask, channel, ssid, ssid_len, @@ -600,15 +599,15 @@ static void ieee80211_send_scan_probe_req(struct ieee80211_sub_if_data *sdata, if (skb) { if (flags & IEEE80211_PROBE_FLAG_RANDOM_SN) { struct ieee80211_hdr *hdr = (void *)skb->data; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u16 sn = get_random_u32(); - txdata_flags |= IEEE80211_TX_NO_SEQNO; + info->control.flags |= IEEE80211_TX_CTRL_NO_SEQNO; hdr->seq_ctrl = cpu_to_le16(IEEE80211_SN_TO_SEQ(sn)); } IEEE80211_SKB_CB(skb)->flags |= tx_flags; - ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band, - txdata_flags); + ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band); } } @@ -913,6 +912,7 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, case NL80211_BSS_CHAN_WIDTH_10: local->scan_chandef.width = NL80211_CHAN_WIDTH_10; break; + default: case NL80211_BSS_CHAN_WIDTH_20: /* If scanning on oper channel, use whatever channel-type * is currently in use. 
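The rx.c rework above swaps the napi pointer in ieee80211_rx_data for a list head: frames are queued onto a caller-supplied list and handed to the stack in one batch, and ieee80211_rx_napi() becomes a wrapper that collects into a local list under rcu_read_lock() before flushing via napi_gro_receive() or netif_receive_skb_list(). A hedged sketch of a driver RX-completion path calling the new ieee80211_rx_list() entry point directly, where driver_next_rx_skb() is a made-up ring-walk helper:

#include <net/mac80211.h>

struct sk_buff *driver_next_rx_skb(void *ring);	/* hypothetical helper */

static void driver_rx_poll(struct ieee80211_hw *hw, void *ring)
{
	struct sk_buff *skb;
	LIST_HEAD(list);

	/* key references and vif lookups inside the RX path rely on RCU,
	 * so the whole batch is collected under one read-side section
	 */
	rcu_read_lock();
	while ((skb = driver_next_rx_skb(ring)) != NULL)
		ieee80211_rx_list(hw, NULL, skb, &list);
	rcu_read_unlock();

	/* one batched traversal of the network stack for all frames */
	netif_receive_skb_list(&list);
}

Passing NULL for pubsta keeps the sketch generic; a driver that has already looked up the station would pass it through, just as with the old ieee80211_rx_napi() call.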
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index af4cc5fb678e..f2840d1d95cf 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1050,7 +1050,7 @@ static void __sta_info_destroy_part2(struct sta_info *sta) might_sleep(); lockdep_assert_held(&local->sta_mtx); - while (sta->sta_state == IEEE80211_STA_AUTHORIZED) { + if (sta->sta_state == IEEE80211_STA_AUTHORIZED) { ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC); WARN_ON_ONCE(ret); } @@ -1455,7 +1455,7 @@ static void ieee80211_send_null_response(struct sta_info *sta, int tid, } info->band = chanctx_conf->def.chan->band; - ieee80211_xmit(sdata, sta, skb, 0); + ieee80211_xmit(sdata, sta, skb); rcu_read_unlock(); } @@ -2424,7 +2424,8 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, BIT_ULL(NL80211_STA_INFO_LOCAL_PM) | BIT_ULL(NL80211_STA_INFO_PEER_PM) | BIT_ULL(NL80211_STA_INFO_NONPEER_PM) | - BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_GATE); + BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_GATE) | + BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_AS); sinfo->llid = sta->mesh->llid; sinfo->plid = sta->mesh->plid; @@ -2437,6 +2438,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, sinfo->peer_pm = sta->mesh->peer_pm; sinfo->nonpeer_pm = sta->mesh->nonpeer_pm; sinfo->connected_to_gate = sta->mesh->connected_to_gate; + sinfo->connected_to_as = sta->mesh->connected_to_as; #endif } diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 49728047dfad..9d398c9daa4c 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -385,6 +385,7 @@ DECLARE_EWMA(mesh_tx_rate_avg, 8, 16) * @processed_beacon: set to true after peer rates and capabilities are * processed * @connected_to_gate: true if mesh STA has a path to a mesh gate + * @connected_to_as: true if mesh STA has a path to an authentication server * @fail_avg: moving percentage of failed MSDUs * @tx_rate_avg: moving average of tx bitrate */ @@ -404,6 +405,7 @@ struct mesh_sta { bool processed_beacon; bool connected_to_gate; + bool connected_to_as; enum nl80211_plink_state plink_state; u32 plink_timeout; diff --git a/net/mac80211/status.c b/net/mac80211/status.c index cbc40b358ba2..adb1d30ce06e 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -799,7 +799,6 @@ static int ieee80211_tx_get_rates(struct ieee80211_hw *hw, struct ieee80211_tx_info *info, int *retry_count) { - int rates_idx = -1; int count = -1; int i; @@ -821,13 +820,12 @@ static int ieee80211_tx_get_rates(struct ieee80211_hw *hw, count += info->status.rates[i].count; } - rates_idx = i - 1; if (count < 0) count = 0; *retry_count = count; - return rates_idx; + return i - 1; } void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 4b0cff4a07bd..e01e4daeb8cd 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -239,7 +239,7 @@ static enum ieee80211_ac_numbers ieee80211_ac_from_wmm(int ac) switch (ac) { default: WARN_ON_ONCE(1); - /* fall through */ + fallthrough; case 0: return IEEE80211_AC_BE; case 1: @@ -952,7 +952,7 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev, set_sta_flag(sta, WLAN_STA_TDLS_INITIATOR); sta->sta.tdls_initiator = false; } - /* fall-through */ + fallthrough; case WLAN_TDLS_SETUP_CONFIRM: case WLAN_TDLS_DISCOVERY_REQUEST: initiator = true; break; @@ -967,7 +967,7 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev, clear_sta_flag(sta, WLAN_STA_TDLS_INITIATOR); 
sta->sta.tdls_initiator = true; } - /* fall-through */ + fallthrough; case WLAN_PUB_ACTION_TDLS_DISCOVER_RES: initiator = false; break; @@ -1222,7 +1222,7 @@ int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev, * by the AP. */ drv_mgd_protect_tdls_discover(sdata->local, sdata); - /* fall-through */ + fallthrough; case WLAN_TDLS_SETUP_CONFIRM: case WLAN_PUB_ACTION_TDLS_DISCOVER_RES: /* no special handling */ diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 1b4709694d2a..50ab5b9d8eab 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -22,7 +22,8 @@ #define LOCAL_PR_ARG __entry->wiphy_name #define STA_ENTRY __array(char, sta_addr, ETH_ALEN) -#define STA_ASSIGN (sta ? memcpy(__entry->sta_addr, sta->addr, ETH_ALEN) : memset(__entry->sta_addr, 0, ETH_ALEN)) +#define STA_ASSIGN (sta ? memcpy(__entry->sta_addr, sta->addr, ETH_ALEN) : \ + eth_zero_addr(__entry->sta_addr)) #define STA_NAMED_ASSIGN(s) memcpy(__entry->sta_addr, (s)->addr, ETH_ALEN) #define STA_PR_FMT " sta:%pM" #define STA_PR_ARG __entry->sta_addr diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 3529d1368068..dca01d7e6e3e 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -166,6 +166,7 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, if (r->flags & IEEE80211_RATE_MANDATORY_A) mrate = r->bitrate; break; + case NL80211_BAND_S1GHZ: case NL80211_BAND_60GHZ: /* TODO, for now fall through */ case NUM_NL80211_BANDS: @@ -824,6 +825,9 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) if (ieee80211_is_qos_nullfunc(hdr->frame_control)) return TX_CONTINUE; + if (info->control.flags & IEEE80211_TX_CTRL_NO_SEQNO) + return TX_CONTINUE; + /* * Anything but QoS data that has a sequence number field * (is long enough) gets a sequence number from the global @@ -832,8 +836,6 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) */ if (!ieee80211_is_data_qos(hdr->frame_control) || is_multicast_ether_addr(hdr->addr1)) { - if (tx->flags & IEEE80211_TX_NO_SEQNO) - return TX_CONTINUE; /* driver should assign sequence number */ info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ; /* for pure STA mode without beacons, we can do it */ @@ -1739,7 +1741,7 @@ static bool __ieee80211_tx(struct ieee80211_local *local, case NL80211_IFTYPE_AP_VLAN: sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); - /* fall through */ + fallthrough; default: vif = &sdata->vif; break; @@ -1890,7 +1892,7 @@ EXPORT_SYMBOL(ieee80211_tx_prepare_skb); */ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb, - bool txpending, u32 txdata_flags) + bool txpending) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_data tx; @@ -1908,8 +1910,6 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, led_len = skb->len; res_prepare = ieee80211_tx_prepare(sdata, &tx, sta, skb); - tx.flags |= txdata_flags; - if (unlikely(res_prepare == TX_DROP)) { ieee80211_free_txskb(&local->hw, skb); return true; @@ -1977,8 +1977,7 @@ static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata, } void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, struct sk_buff *skb, - u32 txdata_flags) + struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); @@ -2013,12 +2012,13 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, } ieee80211_set_qos_hdr(sdata, skb); - ieee80211_tx(sdata, sta, skb, false, txdata_flags); + 
ieee80211_tx(sdata, sta, skb, false); } -static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local, - struct sk_buff *skb) +bool ieee80211_parse_tx_radiotap(struct sk_buff *skb, + struct net_device *dev) { + struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); struct ieee80211_radiotap_iterator iterator; struct ieee80211_radiotap_header *rthdr = (struct ieee80211_radiotap_header *) skb->data; @@ -2037,6 +2037,18 @@ static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local, u8 vht_mcs = 0, vht_nss = 0; int i; + /* check for not even having the fixed radiotap header part */ + if (unlikely(skb->len < sizeof(struct ieee80211_radiotap_header))) + return false; /* too short to be possibly valid */ + + /* is it a header version we can trust to find length from? */ + if (unlikely(rthdr->it_version)) + return false; /* only version 0 is supported */ + + /* does the skb contain enough to deliver on the alleged length? */ + if (unlikely(skb->len < ieee80211_get_radiotap_len(skb->data))) + return false; /* skb too short for claimed rt header extent */ + info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | IEEE80211_TX_CTL_DONTFRAG; @@ -2084,6 +2096,8 @@ static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local, txflags = get_unaligned_le16(iterator.this_arg); if (txflags & IEEE80211_RADIOTAP_F_TX_NOACK) info->flags |= IEEE80211_TX_CTL_NO_ACK; + if (txflags & IEEE80211_RADIOTAP_F_TX_NOSEQNO) + info->control.flags |= IEEE80211_TX_CTRL_NO_SEQNO; break; case IEEE80211_RADIOTAP_RATE: @@ -2188,13 +2202,6 @@ static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local, local->hw.max_rate_tries); } - /* - * remove the radiotap header - * iterator->_max_length was sanity-checked against - * skb->len by iterator init - */ - skb_pull(skb, iterator._max_length); - return true; } @@ -2203,8 +2210,6 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, { struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); struct ieee80211_chanctx_conf *chanctx_conf; - struct ieee80211_radiotap_header *prthdr = - (struct ieee80211_radiotap_header *)skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr; struct ieee80211_sub_if_data *tmp_sdata, *sdata; @@ -2212,21 +2217,17 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, u16 len_rthdr; int hdrlen; - /* check for not even having the fixed radiotap header part */ - if (unlikely(skb->len < sizeof(struct ieee80211_radiotap_header))) - goto fail; /* too short to be possibly valid */ + memset(info, 0, sizeof(*info)); + info->flags = IEEE80211_TX_CTL_REQ_TX_STATUS | + IEEE80211_TX_CTL_INJECTED; - /* is it a header version we can trust to find length from? */ - if (unlikely(prthdr->it_version)) - goto fail; /* only version 0 is supported */ + /* Sanity-check and process the injection radiotap header */ + if (!ieee80211_parse_tx_radiotap(skb, dev)) + goto fail; - /* then there must be a radiotap header with a length we can use */ + /* we now know there is a radiotap header with a length we can use */ len_rthdr = ieee80211_get_radiotap_len(skb->data); - /* does the skb contain enough to deliver on the alleged length? */ - if (unlikely(skb->len < len_rthdr)) - goto fail; /* skb too short for claimed rt header extent */ - /* * fix up the pointers accounting for the radiotap * header still being in there. 
We are being given @@ -2272,11 +2273,6 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, skb->priority = *p & IEEE80211_QOS_CTL_TAG1D_MASK; } - memset(info, 0, sizeof(*info)); - - info->flags = IEEE80211_TX_CTL_REQ_TX_STATUS | - IEEE80211_TX_CTL_INJECTED; - rcu_read_lock(); /* @@ -2342,11 +2338,10 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, info->band = chandef->chan->band; - /* process and remove the injection radiotap header */ - if (!ieee80211_parse_tx_radiotap(local, skb)) - goto fail_rcu; + /* remove the injection radiotap header */ + skb_pull(skb, len_rthdr); - ieee80211_xmit(sdata, NULL, skb, 0); + ieee80211_xmit(sdata, NULL, skb); rcu_read_unlock(); return NETDEV_TX_OK; @@ -2382,7 +2377,7 @@ int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, } else if (sdata->wdev.use_4addr) { return -ENOLINK; } - /* fall through */ + fallthrough; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_OCB: case NL80211_IFTYPE_ADHOC: @@ -2552,7 +2547,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, band = chanctx_conf->def.chan->band; if (sdata->wdev.use_4addr) break; - /* fall through */ + fallthrough; case NL80211_IFTYPE_AP: if (sdata->vif.type == NL80211_IFTYPE_AP) chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); @@ -2999,7 +2994,7 @@ void ieee80211_check_fast_xmit(struct sta_info *sta) build.hdr_len = 30; break; } - /* fall through */ + fallthrough; case NL80211_IFTYPE_AP: fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS); /* DA BSSID SA */ @@ -3618,7 +3613,7 @@ begin: tx.skb = skb; tx.sdata = vif_to_sdata(info->control.vif); - if (txq->sta) { + if (txq->sta && !(info->flags & IEEE80211_TX_CTL_INJECTED)) { tx.sta = container_of(txq->sta, struct sta_info, sta); /* * Drop unicast frames to unauthorised stations unless they are @@ -3710,7 +3705,7 @@ begin: case NL80211_IFTYPE_AP_VLAN: tx.sdata = container_of(tx.sdata->bss, struct ieee80211_sub_if_data, u.ap); - /* fall through */ + fallthrough; default: vif = &tx.sdata->vif; break; @@ -3721,10 +3716,11 @@ encap_out: if (vif && wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) { + bool ampdu = txq->ac != IEEE80211_AC_VO; u32 airtime; airtime = ieee80211_calc_expected_tx_airtime(hw, vif, txq->sta, - skb->len); + skb->len, ampdu); if (airtime) { airtime = ieee80211_info_set_tx_time_est(info, airtime); ieee80211_sta_update_pending_airtime(local, tx.sta, @@ -3950,6 +3946,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, if (local->ops->wake_tx_queue) { u16 queue = __ieee80211_select_queue(sdata, sta, skb); skb_set_queue_mapping(skb, queue); + skb_get_hash(skb); } if (sta) { @@ -4008,7 +4005,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, ieee80211_tx_stats(dev, skb->len); - ieee80211_xmit(sdata, sta, skb, 0); + ieee80211_xmit(sdata, sta, skb); } goto out; out_free: @@ -4049,7 +4046,7 @@ static bool ieee80211_multicast_to_unicast(struct sk_buff *skb, return false; if (sdata->wdev.use_4addr) return false; - /* fall through */ + fallthrough; case NL80211_IFTYPE_AP: /* check runtime toggle for this bss */ if (!sdata->bss->multicast_to_unicast) @@ -4375,7 +4372,7 @@ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local, return true; } info->band = chanctx_conf->def.chan->band; - result = ieee80211_tx(sdata, NULL, skb, true, 0); + result = ieee80211_tx(sdata, NULL, skb, true); } else if (info->control.flags & IEEE80211_TX_CTRL_HW_80211_ENCAP) { if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) { dev_kfree_skb(skb); @@ -5334,7 +5331,7 @@ 
EXPORT_SYMBOL(ieee80211_unreserve_tid); void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, - enum nl80211_band band, u32 txdata_flags) + enum nl80211_band band) { int ac = ieee80211_ac_from_tid(tid); @@ -5351,7 +5348,7 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, */ local_bh_disable(); IEEE80211_SKB_CB(skb)->band = band; - ieee80211_xmit(sdata, NULL, skb, txdata_flags); + ieee80211_xmit(sdata, NULL, skb); local_bh_enable(); } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index dd9f5c7a1ade..c8504ffc71a1 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -2347,10 +2347,10 @@ int ieee80211_reconfig(struct ieee80211_local *local) case NL80211_IFTYPE_ADHOC: if (sdata->vif.bss_conf.ibss_joined) WARN_ON(drv_join_ibss(local, sdata)); - /* fall through */ + fallthrough; default: ieee80211_reconfig_stations(sdata); - /* fall through */ + fallthrough; case NL80211_IFTYPE_AP: /* AP stations are handled later */ for (i = 0; i < IEEE80211_NUM_ACS; i++) drv_conf_tx(local, sdata, i, @@ -2399,7 +2399,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) break; case NL80211_IFTYPE_ADHOC: changed |= BSS_CHANGED_IBSS; - /* fall through */ + fallthrough; case NL80211_IFTYPE_AP: changed |= BSS_CHANGED_SSID | BSS_CHANGED_P2P_PS; @@ -2414,8 +2414,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) if (rcu_access_pointer(sdata->u.ap.beacon)) drv_start_ap(local, sdata); } - - /* fall through */ + fallthrough; case NL80211_IFTYPE_MESH_POINT: if (sdata->vif.bss_conf.enable_beacon) { changed |= BSS_CHANGED_BEACON | @@ -2889,7 +2888,7 @@ void ieee80211_ie_build_he_6ghz_cap(struct ieee80211_sub_if_data *sdata, case IEEE80211_SMPS_AUTOMATIC: case IEEE80211_SMPS_NUM_MODES: WARN_ON(1); - /* fall through */ + fallthrough; case IEEE80211_SMPS_OFF: cap |= u16_encode_bits(WLAN_HT_CAP_SM_PS_DISABLED, IEEE80211_HE_6GHZ_CAP_SM_PS); @@ -3200,7 +3199,7 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info, break; case 0x01: support_80_80 = false; - /* fall through */ + fallthrough; case 0x02: case 0x03: ccf1 = ccfs2; @@ -3570,7 +3569,7 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, break; default: WARN_ON(1); - /* fall through */ + fallthrough; case RX_ENC_LEGACY: { struct ieee80211_supported_band *sband; int shift = 0; @@ -3734,6 +3733,11 @@ u32 ieee80211_chandef_downgrade(struct cfg80211_chan_def *c) c->width = NL80211_CHAN_WIDTH_20_NOHT; ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT; break; + case NL80211_CHAN_WIDTH_1: + case NL80211_CHAN_WIDTH_2: + case NL80211_CHAN_WIDTH_4: + case NL80211_CHAN_WIDTH_8: + case NL80211_CHAN_WIDTH_16: case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: WARN_ON_ONCE(1); diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c index 72920d82928c..2fb99325135a 100644 --- a/net/mac80211/wme.c +++ b/net/mac80211/wme.c @@ -198,7 +198,7 @@ u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, sta = rcu_dereference(sdata->u.vlan.sta); if (sta) break; - /* fall through */ + fallthrough; case NL80211_IFTYPE_AP: ra = skb->data; break; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index fd30ea61336e..6fdd0c9f865a 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1584,21 +1584,10 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, unsigned int flags; if (event == NETDEV_REGISTER) { + mdev = mpls_add_dev(dev); + if (IS_ERR(mdev)) + return notifier_from_errno(PTR_ERR(mdev)); - /* For now 
just support Ethernet, IPGRE, IP6GRE, SIT and - * IPIP devices - */ - if (dev->type == ARPHRD_ETHER || - dev->type == ARPHRD_LOOPBACK || - dev->type == ARPHRD_IPGRE || - dev->type == ARPHRD_IP6GRE || - dev->type == ARPHRD_SIT || - dev->type == ARPHRD_TUNNEL || - dev->type == ARPHRD_TUNNEL6) { - mdev = mpls_add_dev(dev); - if (IS_ERR(mdev)) - return notifier_from_errno(PTR_ERR(mdev)); - } return NOTIFY_OK; } diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig index a9ed3bf1d93f..698bc3525160 100644 --- a/net/mptcp/Kconfig +++ b/net/mptcp/Kconfig @@ -13,17 +13,29 @@ config MPTCP if MPTCP +config INET_MPTCP_DIAG + depends on INET_DIAG + def_tristate INET_DIAG + config MPTCP_IPV6 bool "MPTCP: IPv6 support for Multipath TCP" select IPV6 default y -config MPTCP_HMAC_TEST - bool "Tests for MPTCP HMAC implementation" +endif + +config MPTCP_KUNIT_TESTS + tristate "This builds the MPTCP KUnit tests" if !KUNIT_ALL_TESTS + select MPTCP + depends on KUNIT + default KUNIT_ALL_TESTS help - This option enable boot time self-test for the HMAC implementation - used by the MPTCP code + Currently covers the MPTCP crypto and token helpers. + Only useful for kernel devs running the KUnit test harness; not + intended for inclusion in a production build. - Say N if you are unsure. + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation in Documentation/dev-tools/kunit/. + + If unsure, say N. -endif diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index baa0640527c7..a611968be4d7 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -3,3 +3,10 @@ obj-$(CONFIG_MPTCP) += mptcp.o mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ mib.o pm_netlink.o + +obj-$(CONFIG_SYN_COOKIES) += syncookies.o +obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o + +mptcp_crypto_test-objs := crypto_test.o +mptcp_token_test-objs := token_test.o +obj-$(CONFIG_MPTCP_KUNIT_TESTS) += mptcp_crypto_test.o mptcp_token_test.o diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c index 82bd2b54d741..05d398d3fde4 100644 --- a/net/mptcp/crypto.c +++ b/net/mptcp/crypto.c @@ -78,65 +78,6 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac) sha256(input, SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE, hmac); } -#ifdef CONFIG_MPTCP_HMAC_TEST -struct test_cast { - char *key; - char *msg; - char *result; -}; - -/* we can't reuse RFC 4231 test vectors, as we have constraint on the - * input and key size. 
- */ -static struct test_cast tests[] = { - { - .key = "0b0b0b0b0b0b0b0b", - .msg = "48692054", - .result = "8385e24fb4235ac37556b6b886db106284a1da671699f46db1f235ec622dcafa", - }, - { - .key = "aaaaaaaaaaaaaaaa", - .msg = "dddddddd", - .result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492984e1eb71aff9022f71046e9", - }, - { - .key = "0102030405060708", - .msg = "cdcdcdcd", - .result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6f23b4d8c4da736a5dbbc6e7d", - }, -}; - -static int __init test_mptcp_crypto(void) -{ - char hmac[32], hmac_hex[65]; - u32 nonce1, nonce2; - u64 key1, key2; - u8 msg[8]; - int i, j; - - for (i = 0; i < ARRAY_SIZE(tests); ++i) { - /* mptcp hmap will convert to be before computing the hmac */ - key1 = be64_to_cpu(*((__be64 *)&tests[i].key[0])); - key2 = be64_to_cpu(*((__be64 *)&tests[i].key[8])); - nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0])); - nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4])); - - put_unaligned_be32(nonce1, &msg[0]); - put_unaligned_be32(nonce2, &msg[4]); - - mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); - for (j = 0; j < 32; ++j) - sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff); - hmac_hex[64] = 0; - - if (memcmp(hmac_hex, tests[i].result, 64)) - pr_err("test %d failed, got %s expected %s", i, - hmac_hex, tests[i].result); - else - pr_info("test %d [ ok ]", i); - } - return 0; -} - -late_initcall(test_mptcp_crypto); +#if IS_MODULE(CONFIG_MPTCP_KUNIT_TESTS) +EXPORT_SYMBOL_GPL(mptcp_crypto_hmac_sha); #endif diff --git a/net/mptcp/crypto_test.c b/net/mptcp/crypto_test.c new file mode 100644 index 000000000000..017248dea038 --- /dev/null +++ b/net/mptcp/crypto_test.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <kunit/test.h> + +#include "protocol.h" + +struct test_case { + char *key; + char *msg; + char *result; +}; + +/* we can't reuse RFC 4231 test vectors, as we have constraint on the + * input and key size. 
+ */ +static struct test_case tests[] = { + { + .key = "0b0b0b0b0b0b0b0b", + .msg = "48692054", + .result = "8385e24fb4235ac37556b6b886db106284a1da671699f46db1f235ec622dcafa", + }, + { + .key = "aaaaaaaaaaaaaaaa", + .msg = "dddddddd", + .result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492984e1eb71aff9022f71046e9", + }, + { + .key = "0102030405060708", + .msg = "cdcdcdcd", + .result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6f23b4d8c4da736a5dbbc6e7d", + }, +}; + +static void mptcp_crypto_test_basic(struct kunit *test) +{ + char hmac[32], hmac_hex[65]; + u32 nonce1, nonce2; + u64 key1, key2; + u8 msg[8]; + int i, j; + + for (i = 0; i < ARRAY_SIZE(tests); ++i) { + /* the mptcp hmac code will convert to big-endian before computing the hmac */ + key1 = be64_to_cpu(*((__be64 *)&tests[i].key[0])); + key2 = be64_to_cpu(*((__be64 *)&tests[i].key[8])); + nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0])); + nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4])); + + put_unaligned_be32(nonce1, &msg[0]); + put_unaligned_be32(nonce2, &msg[4]); + + mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); + for (j = 0; j < 32; ++j) + sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff); + hmac_hex[64] = 0; + + KUNIT_EXPECT_STREQ(test, &hmac_hex[0], tests[i].result); + } +} + +static struct kunit_case mptcp_crypto_test_cases[] = { + KUNIT_CASE(mptcp_crypto_test_basic), + {} +}; + +static struct kunit_suite mptcp_crypto_suite = { + .name = "mptcp-crypto", + .test_cases = mptcp_crypto_test_cases, +}; + +kunit_test_suite(mptcp_crypto_suite); + +MODULE_LICENSE("GPL"); diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 8e39585d37f3..54b888f94009 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -112,6 +112,7 @@ static struct pernet_operations mptcp_pernet_ops = { void __init mptcp_init(void) { + mptcp_join_cookie_init(); mptcp_proto_init(); if (register_pernet_subsys(&mptcp_pernet_ops) < 0) diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c new file mode 100644 index 000000000000..5f390a97f556 --- /dev/null +++ b/net/mptcp/mptcp_diag.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 +/* MPTCP socket monitoring support + * + * Copyright (c) 2020 Red Hat + * + * Author: Paolo Abeni <pabeni@redhat.com> + */ + +#include <linux/kernel.h> +#include <linux/net.h> +#include <linux/inet_diag.h> +#include <net/netlink.h> +#include <uapi/linux/mptcp.h> +#include "protocol.h" + +static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + const struct inet_diag_req_v2 *req, + struct nlattr *bc, bool net_admin) +{ + if (!inet_diag_bc_sk(bc, sk)) + return 0; + + return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, req, NLM_F_MULTI, + net_admin); +} + +static int mptcp_diag_dump_one(struct netlink_callback *cb, + const struct inet_diag_req_v2 *req) +{ + struct sk_buff *in_skb = cb->skb; + struct mptcp_sock *msk = NULL; + struct sk_buff *rep; + int err = -ENOENT; + struct net *net; + struct sock *sk; + + net = sock_net(in_skb->sk); + msk = mptcp_token_get_sock(req->id.idiag_cookie[0]); + if (!msk) + goto out_nosk; + + err = -ENOMEM; + sk = (struct sock *)msk; + rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) + + inet_diag_msg_attrs_size() + + nla_total_size(sizeof(struct mptcp_info)) + + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64, + GFP_KERNEL); + if (!rep) + goto out; + + err = inet_sk_diag_fill(sk, inet_csk(sk), rep, cb, req, 0, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + kfree_skb(rep); + goto out; + } + err = 
netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid, + MSG_DONTWAIT); + if (err > 0) + err = 0; +out: + sock_put(sk); + +out_nosk: + return err; +} + +static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, + const struct inet_diag_req_v2 *r) +{ + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); + struct net *net = sock_net(skb->sk); + struct inet_diag_dump_data *cb_data; + struct mptcp_sock *msk; + struct nlattr *bc; + + cb_data = cb->data; + bc = cb_data->inet_diag_nla_bc; + + while ((msk = mptcp_token_iter_next(net, &cb->args[0], &cb->args[1])) != + NULL) { + struct inet_sock *inet = (struct inet_sock *)msk; + struct sock *sk = (struct sock *)msk; + int ret = 0; + + if (!(r->idiag_states & (1 << sk->sk_state))) + goto next; + if (r->sdiag_family != AF_UNSPEC && + sk->sk_family != r->sdiag_family) + goto next; + if (r->id.idiag_sport != inet->inet_sport && + r->id.idiag_sport) + goto next; + if (r->id.idiag_dport != inet->inet_dport && + r->id.idiag_dport) + goto next; + + ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin); +next: + sock_put(sk); + if (ret < 0) { + /* will retry on the same position */ + cb->args[1]--; + break; + } + cond_resched(); + } +} + +static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + void *_info) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_info *info = _info; + u32 flags = 0; + bool slow; + u8 val; + + r->idiag_rqueue = sk_rmem_alloc_get(sk); + r->idiag_wqueue = sk_wmem_alloc_get(sk); + if (!info) + return; + + slow = lock_sock_fast(sk); + info->mptcpi_subflows = READ_ONCE(msk->pm.subflows); + info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); + info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); + info->mptcpi_subflows_max = READ_ONCE(msk->pm.subflows_max); + val = READ_ONCE(msk->pm.add_addr_signal_max); + info->mptcpi_add_addr_signal_max = val; + val = READ_ONCE(msk->pm.add_addr_accept_max); + info->mptcpi_add_addr_accepted_max = val; + if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) + flags |= MPTCP_INFO_FLAG_FALLBACK; + if (READ_ONCE(msk->can_ack)) + flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED; + info->mptcpi_flags = flags; + info->mptcpi_token = READ_ONCE(msk->token); + info->mptcpi_write_seq = READ_ONCE(msk->write_seq); + info->mptcpi_snd_una = atomic64_read(&msk->snd_una); + info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); + unlock_sock_fast(sk, slow); +} + +static const struct inet_diag_handler mptcp_diag_handler = { + .dump = mptcp_diag_dump, + .dump_one = mptcp_diag_dump_one, + .idiag_get_info = mptcp_diag_get_info, + .idiag_type = IPPROTO_MPTCP, + .idiag_info_size = sizeof(struct mptcp_info), +}; + +static int __init mptcp_diag_init(void) +{ + return inet_diag_register(&mptcp_diag_handler); +} + +static void __exit mptcp_diag_exit(void) +{ + inet_diag_unregister(&mptcp_diag_handler); +} + +module_init(mptcp_diag_init); +module_exit(mptcp_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-262 /* AF_INET - IPPROTO_MPTCP */); diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 8f940be42f98..7fa822b55c34 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -451,6 +451,8 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, struct sk_buff *skb, struct mptcp_ext *ext) { + u64 data_fin_tx_seq = READ_ONCE(mptcp_sk(subflow->conn)->write_seq); + if (!ext->use_map || !skb->len) 
{ /* RFC6824 requires a DSS mapping with specific values * if DATA_FIN is set but no data payload is mapped @@ -458,10 +460,13 @@ static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, ext->data_fin = 1; ext->use_map = 1; ext->dsn64 = 1; - ext->data_seq = subflow->data_fin_tx_seq; + /* The write_seq value has already been incremented, so + * the actual sequence number for the DATA_FIN is one less. + */ + ext->data_seq = data_fin_tx_seq - 1; ext->subflow_seq = 0; ext->data_len = 1; - } else if (ext->data_seq + ext->data_len == subflow->data_fin_tx_seq) { + } else if (ext->data_seq + ext->data_len == data_fin_tx_seq) { /* If there's an existing DSS mapping and it is the * final mapping, DATA_FIN consumes 1 additional byte of * mapping space. @@ -477,22 +482,17 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); unsigned int dss_size = 0; + u64 snd_data_fin_enable; struct mptcp_ext *mpext; - struct mptcp_sock *msk; unsigned int ack_size; bool ret = false; - u8 tcp_fin; - if (skb) { - mpext = mptcp_get_ext(skb); - tcp_fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; - } else { - mpext = NULL; - tcp_fin = 0; - } + mpext = skb ? mptcp_get_ext(skb) : NULL; + snd_data_fin_enable = READ_ONCE(msk->snd_data_fin_enable); - if (!skb || (mpext && mpext->use_map) || tcp_fin) { + if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) { unsigned int map_size; map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; @@ -502,7 +502,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, if (mpext) opts->ext_copy = *mpext; - if (skb && tcp_fin && subflow->data_fin_tx_enable) + if (skb && snd_data_fin_enable) mptcp_write_data_fin(subflow, skb, &opts->ext_copy); ret = true; } @@ -511,7 +511,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, * if the first subflow may have the already the remote key handy */ opts->ext_copy.use_ack = 0; - msk = mptcp_sk(subflow->conn); if (!READ_ONCE(msk->can_ack)) { *size = ALIGN(dss_size, 4); return ret; @@ -624,6 +623,9 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, opts->suboptions = 0; + if (unlikely(mptcp_check_fallback(sk))) + return false; + if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts)) ret = true; else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining, @@ -706,6 +708,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, * additional ack. */ subflow->fully_established = 1; + WRITE_ONCE(msk->fully_established, true); goto fully_established; } @@ -714,15 +717,14 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, */ if (!mp_opt->mp_capable) { subflow->mp_capable = 0; - tcp_sk(sk)->is_mptcp = 0; + pr_fallback(msk); + __mptcp_do_fallback(msk); return false; } if (unlikely(!READ_ONCE(msk->pm.server_side))) pr_warn_once("bogus mpc option on established client sk"); - subflow->fully_established = 1; - subflow->remote_key = mp_opt->sndr_key; - subflow->can_ack = 1; + mptcp_subflow_fully_established(subflow, mp_opt); fully_established: if (likely(subflow->pm_notified)) @@ -780,6 +782,22 @@ static void update_una(struct mptcp_sock *msk, } } +bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq) +{ + /* Skip if DATA_FIN was already received. 
+ * If updating simultaneously with the recvmsg loop, values + * should match. If they mismatch, the peer is misbehaving and + * we will prefer the most recent information. + */ + if (READ_ONCE(msk->rcv_data_fin) || !READ_ONCE(msk->first)) + return false; + + WRITE_ONCE(msk->rcv_data_fin_seq, data_fin_seq); + WRITE_ONCE(msk->rcv_data_fin, 1); + + return true; +} + static bool add_addr_hmac_valid(struct mptcp_sock *msk, struct mptcp_options_received *mp_opt) { @@ -814,6 +832,9 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, struct mptcp_options_received mp_opt; struct mptcp_ext *mpext; + if (__mptcp_check_fallback(msk)) + return; + mptcp_get_options(skb, &mp_opt); if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) return; @@ -847,6 +868,20 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, if (mp_opt.use_ack) update_una(msk, &mp_opt); + /* Zero-data-length packets are dropped by the caller and not + * propagated to the MPTCP layer, so the skb extension does not + * need to be allocated or populated. DATA_FIN information, if + * present, needs to be updated here before the skb is freed. + */ + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { + if (mp_opt.data_fin && mp_opt.data_len == 1 && + mptcp_update_rcv_data_fin(msk, mp_opt.data_seq) && + schedule_work(&msk->work)) + sock_hold(subflow->conn); + + return; + } + mpext = skb_ext_add(skb, SKB_EXT_MPTCP); if (!mpext) return; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 977d9c8b1453..a8ad20559aaa 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -10,8 +10,6 @@ #include <net/mptcp.h> #include "protocol.h" -static struct workqueue_struct *pm_wq; - /* path manager command handlers */ int mptcp_pm_announce_addr(struct mptcp_sock *msk, @@ -78,7 +76,7 @@ static bool mptcp_pm_schedule_work(struct mptcp_sock *msk, return false; msk->pm.status |= BIT(new_status); - if (queue_work(pm_wq, &msk->pm.work)) + if (schedule_work(&msk->work)) sock_hold((struct sock *)msk); return true; } @@ -181,35 +179,6 @@ int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) return mptcp_pm_nl_get_local_id(msk, skc); } -static void pm_worker(struct work_struct *work) -{ - struct mptcp_pm_data *pm = container_of(work, struct mptcp_pm_data, - work); - struct mptcp_sock *msk = container_of(pm, struct mptcp_sock, pm); - struct sock *sk = (struct sock *)msk; - - lock_sock(sk); - spin_lock_bh(&msk->pm.lock); - - pr_debug("msk=%p status=%x", msk, pm->status); - if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { - pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); - mptcp_pm_nl_add_addr_received(msk); - } - if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { - pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); - mptcp_pm_nl_fully_established(msk); - } - if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { - pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); - mptcp_pm_nl_subflow_established(msk); - } - - spin_unlock_bh(&msk->pm.lock); - release_sock(sk); - sock_put(sk); -} - void mptcp_pm_data_init(struct mptcp_sock *msk) { msk->pm.add_addr_signaled = 0; @@ -223,22 +192,11 @@ void mptcp_pm_data_init(struct mptcp_sock *msk) msk->pm.status = 0; spin_lock_init(&msk->pm.lock); - INIT_WORK(&msk->pm.work, pm_worker); mptcp_pm_nl_data_init(msk); } -void mptcp_pm_close(struct mptcp_sock *msk) -{ - if (cancel_work_sync(&msk->pm.work)) - sock_put((struct sock *)msk); -} - -void mptcp_pm_init(void) +void __init mptcp_pm_init(void) { - pm_wq = alloc_workqueue("pm_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8); - if (!pm_wq) - 
panic("Failed to allocate workqueue"); - mptcp_pm_nl_init(); } diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index b78edf237ba0..c8820c4156e6 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -851,7 +851,7 @@ static struct pernet_operations mptcp_pm_pernet_ops = { .size = sizeof(struct pm_nl_pernet), }; -void mptcp_pm_nl_init(void) +void __init mptcp_pm_nl_init(void) { if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0) panic("Failed to register MPTCP PM pernet subsystem.\n"); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index c0abe738e7d3..8c1d1a595701 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -16,6 +16,7 @@ #include <net/inet_hashtables.h> #include <net/protocol.h> #include <net/tcp.h> +#include <net/tcp_states.h> #if IS_ENABLED(CONFIG_MPTCP_IPV6) #include <net/transp_v6.h> #endif @@ -52,18 +53,10 @@ static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) return msk->subflow; } -static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk) -{ - return msk->first && !sk_is_mptcp(msk->first); -} - -static struct socket *mptcp_is_tcpsk(struct sock *sk) +static bool mptcp_is_tcpsk(struct sock *sk) { struct socket *sock = sk->sk_socket; - if (sock->sk != sk) - return NULL; - if (unlikely(sk->sk_prot == &tcp_prot)) { /* we are being invoked after mptcp_accept() has * accepted a non-mp-capable flow: sk is a tcp_sk, @@ -73,59 +66,37 @@ static struct socket *mptcp_is_tcpsk(struct sock *sk) * bypass mptcp. */ sock->ops = &inet_stream_ops; - return sock; + return true; #if IS_ENABLED(CONFIG_MPTCP_IPV6) } else if (unlikely(sk->sk_prot == &tcpv6_prot)) { sock->ops = &inet6_stream_ops; - return sock; + return true; #endif } - return NULL; + return false; } -static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk) +static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk) { - struct socket *sock; - sock_owned_by_me((const struct sock *)msk); - sock = mptcp_is_tcpsk((struct sock *)msk); - if (unlikely(sock)) - return sock; - - if (likely(!__mptcp_needs_tcp_fallback(msk))) + if (likely(!__mptcp_check_fallback(msk))) return NULL; - return msk->subflow; -} - -static bool __mptcp_can_create_subflow(const struct mptcp_sock *msk) -{ - return !msk->first; + return msk->first; } -static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state) +static int __mptcp_socket_create(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; struct socket *ssock; int err; - ssock = __mptcp_tcp_fallback(msk); - if (unlikely(ssock)) - return ssock; - - ssock = __mptcp_nmpc_socket(msk); - if (ssock) - goto set_state; - - if (!__mptcp_can_create_subflow(msk)) - return ERR_PTR(-EINVAL); - err = mptcp_subflow_create_socket(sk, &ssock); if (err) - return ERR_PTR(err); + return err; msk->first = ssock->sk; msk->subflow = ssock; @@ -133,10 +104,12 @@ static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state) list_add(&subflow->node, &msk->conn_list); subflow->request_mptcp = 1; -set_state: - if (state != MPTCP_SAME_STATE) - inet_sk_state_store(sk, state); - return ssock; + /* accept() will wait on first subflow sk_wq, and we always wake up + * via msk->sk_socket + */ + RCU_INIT_POINTER(msk->first->sk_wq, &sk->sk_socket->wq); + + return 0; } static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, @@ -170,6 +143,14 @@ static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, MPTCP_SKB_CB(skb)->offset = offset; }
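
The pm_wq removal above folds all deferred MPTCP processing into the single per-connection work item: whichever path successfully schedules the work takes a socket reference, and the worker drops it when done. A minimal userspace model of that ownership rule; fake_sock, schedule_work() and worker() here are stand-ins, not the kernel APIs:

#include <stdbool.h>
#include <stdio.h>

struct fake_sock {
	int refcnt;		/* models sk_refcnt */
	bool work_pending;	/* models the WORK_STRUCT_PENDING bit */
};

static void sock_hold(struct fake_sock *sk) { sk->refcnt++; }
static void sock_put(struct fake_sock *sk)  { sk->refcnt--; }

/* schedule_work() returns false when the item is already queued */
static bool schedule_work(struct fake_sock *sk)
{
	if (sk->work_pending)
		return false;
	sk->work_pending = true;
	return true;
}

static void worker(struct fake_sock *sk)
{
	sk->work_pending = false;
	/* ... clean snd_una, handle DATA_FIN, run PM status bits ... */
	sock_put(sk);	/* drop the reference the scheduler took */
}

int main(void)
{
	struct fake_sock sk = { .refcnt = 1, .work_pending = false };

	if (schedule_work(&sk))
		sock_hold(&sk);	/* first caller owns a reference */
	if (schedule_work(&sk))
		sock_hold(&sk);	/* second caller: already pending, no ref */

	worker(&sk);
	printf("refcnt=%d\n", sk.refcnt);	/* back to 1 */
	return 0;
}
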
+static void mptcp_stop_timer(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + sk_stop_timer(sk, &icsk->icsk_retransmit_timer); + mptcp_sk(sk)->timer_ival = 0; +} + /* both sockets must be locked */ static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk, struct sock *ssk) @@ -191,6 +172,139 @@ static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk, return mptcp_subflow_data_available(ssk); } +static void mptcp_check_data_fin_ack(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (__mptcp_check_fallback(msk)) + return; + + /* Look for an acknowledged DATA_FIN */ + if (((1 << sk->sk_state) & + (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) && + msk->write_seq == atomic64_read(&msk->snd_una)) { + mptcp_stop_timer(sk); + + WRITE_ONCE(msk->snd_data_fin_enable, 0); + + switch (sk->sk_state) { + case TCP_FIN_WAIT1: + inet_sk_state_store(sk, TCP_FIN_WAIT2); + sk->sk_state_change(sk); + break; + case TCP_CLOSING: + fallthrough; + case TCP_LAST_ACK: + inet_sk_state_store(sk, TCP_CLOSE); + sk->sk_state_change(sk); + break; + } + + if (sk->sk_shutdown == SHUTDOWN_MASK || + sk->sk_state == TCP_CLOSE) + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + else + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + } +} + +static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (READ_ONCE(msk->rcv_data_fin) && + ((1 << sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) { + u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq); + + if (msk->ack_seq == rcv_data_fin_seq) { + if (seq) + *seq = rcv_data_fin_seq; + + return true; + } + } + + return false; +} + +static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk) +{ + long tout = ssk && inet_csk(ssk)->icsk_pending ? + inet_csk(ssk)->icsk_timeout - jiffies : 0; + + if (tout <= 0) + tout = mptcp_sk(sk)->timer_ival; + mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN; +} + +static void mptcp_check_data_fin(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + u64 rcv_data_fin_seq; + + if (__mptcp_check_fallback(msk) || !msk->first) + return; + + /* Need to ack a DATA_FIN received from a peer while this side + * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2. + * msk->rcv_data_fin was set when parsing the incoming options + * at the subflow level and the msk lock was not held, so this + * is the first opportunity to act on the DATA_FIN and change + * the msk state. + * + * If we are caught up to the sequence number of the incoming + * DATA_FIN, send the DATA_ACK now and do state transition. If + * not caught up, do nothing and let the recv code send DATA_ACK + * when catching up. + */ + + if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) { + struct mptcp_subflow_context *subflow; + + msk->ack_seq++; + WRITE_ONCE(msk->rcv_data_fin, 0); + + sk->sk_shutdown |= RCV_SHUTDOWN; + smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ + set_bit(MPTCP_DATA_READY, &msk->flags); + + switch (sk->sk_state) { + case TCP_ESTABLISHED: + inet_sk_state_store(sk, TCP_CLOSE_WAIT); + break; + case TCP_FIN_WAIT1: + inet_sk_state_store(sk, TCP_CLOSING); + break; + case TCP_FIN_WAIT2: + inet_sk_state_store(sk, TCP_CLOSE); + // @@ Close subflows now? 
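
mptcp_check_data_fin() above only runs once the msk has caught up to the DATA_FIN sequence number; the state switch then mirrors TCP's FIN handling at the connection level. The transitions in isolation, as a compilable sketch (enum names are local to this model):

#include <stdio.h>

enum st { ESTABLISHED, FIN_WAIT1, FIN_WAIT2, CLOSE_WAIT, CLOSING, CLOSED };

/* state after acking a peer DATA_FIN, per the switch above */
static enum st rcv_data_fin(enum st s)
{
	switch (s) {
	case ESTABLISHED: return CLOSE_WAIT;	/* peer closed first */
	case FIN_WAIT1:   return CLOSING;	/* simultaneous close */
	case FIN_WAIT2:   return CLOSED;	/* our FIN already acked */
	default:          return s;		/* unexpected: WARN_ON_ONCE */
	}
}

int main(void)
{
	printf("%d %d %d\n", rcv_data_fin(ESTABLISHED),
	       rcv_data_fin(FIN_WAIT1), rcv_data_fin(FIN_WAIT2));
	return 0;
}
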
+ break; + default: + /* Other states not expected */ + WARN_ON_ONCE(1); + break; + } + + mptcp_set_timeout(sk, NULL); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(ssk); + tcp_send_ack(ssk); + release_sock(ssk); + } + + sk->sk_state_change(sk); + + if (sk->sk_shutdown == SHUTDOWN_MASK || + sk->sk_state == TCP_CLOSE) + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + else + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + } +} + static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, struct sock *ssk, unsigned int *bytes) @@ -207,13 +321,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, return false; } - if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int rcvbuf = max(ssk->sk_rcvbuf, sk->sk_rcvbuf); - - if (rcvbuf > sk->sk_rcvbuf) - sk->sk_rcvbuf = rcvbuf; - } - tp = tcp_sk(ssk); do { u32 map_remaining, offset; @@ -229,6 +336,15 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, if (!skb) break; + if (__mptcp_check_fallback(msk)) { + /* if we are running under the workqueue, TCP could have + * collapsed skbs between dummy map creation and now + * be sure to adjust the size + */ + map_remaining = skb->len; + subflow->map_data_len = skb->len; + } + offset = seq - TCP_SKB_CB(skb)->seq; fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (fin) { @@ -265,6 +381,15 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, *bytes = moved; + /* If the moves have caught up with the DATA_FIN sequence number + * it's time to ack the DATA_FIN and change socket state, but + * this is not a good place to change state. Let the workqueue + * do it. + */ + if (mptcp_pending_data_fin(sk, NULL) && + schedule_work(&msk->work)) + sock_hold(sk); + return done; } @@ -329,16 +454,6 @@ static void __mptcp_flush_join_list(struct mptcp_sock *msk) spin_unlock_bh(&msk->join_list_lock); } -static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk) -{ - long tout = ssk && inet_csk(ssk)->icsk_pending ? - inet_csk(ssk)->icsk_timeout - jiffies : 0; - - if (tout <= 0) - tout = mptcp_sk(sk)->timer_ival; - mptcp_sk(sk)->timer_ival = tout > 0 ? 
tout : TCP_RTO_MIN; -} - static bool mptcp_timer_pending(struct sock *sk) { return timer_pending(&inet_csk(sk)->icsk_retransmit_timer); @@ -360,7 +475,8 @@ void mptcp_data_acked(struct sock *sk) { mptcp_reset_timer(sk); - if (!sk_stream_is_writeable(sk) && + if ((!sk_stream_is_writeable(sk) || + (inet_sk_state_load(sk) != TCP_ESTABLISHED)) && schedule_work(&mptcp_sk(sk)->work)) sock_hold(sk); } @@ -395,14 +511,6 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk) } } -static void mptcp_stop_timer(struct sock *sk) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - - sk_stop_timer(sk, &icsk->icsk_retransmit_timer); - mptcp_sk(sk)->timer_ival = 0; -} - static bool mptcp_ext_cache_refill(struct mptcp_sock *msk) { const struct sock *sk = (const struct sock *)msk; @@ -466,8 +574,15 @@ static void mptcp_clean_una(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; - u64 snd_una = atomic64_read(&msk->snd_una); bool cleaned = false; + u64 snd_una; + + /* on fallback we just need to ignore snd_una, as this is really + * plain TCP + */ + if (__mptcp_check_fallback(msk)) + atomic64_set(&msk->snd_una, msk->write_seq); + snd_una = atomic64_read(&msk->snd_una); list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) @@ -479,15 +594,20 @@ static void mptcp_clean_una(struct sock *sk) dfrag = mptcp_rtx_head(sk); if (dfrag && after64(snd_una, dfrag->data_seq)) { - u64 delta = dfrag->data_seq + dfrag->data_len - snd_una; + u64 delta = snd_una - dfrag->data_seq; + + if (WARN_ON_ONCE(delta > dfrag->data_len)) + goto out; dfrag->data_seq += delta; + dfrag->offset += delta; dfrag->data_len -= delta; dfrag_uncharge(sk, delta); cleaned = true; } +out: if (cleaned) { sk_mem_reclaim_partial(sk); @@ -673,7 +793,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, out: if (!retransmission) pfrag->offset += frag_truesize; - *write_seq += ret; + WRITE_ONCE(*write_seq, *write_seq + ret); mptcp_subflow_ctx(ssk)->rel_write_seq += ret; return ret; @@ -740,7 +860,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int mss_now = 0, size_goal = 0, ret = 0; struct mptcp_sock *msk = mptcp_sk(sk); struct page_frag *pfrag; - struct socket *ssock; size_t copied = 0; struct sock *ssk; bool tx_ok; @@ -759,19 +878,15 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto out; } -fallback: - ssock = __mptcp_tcp_fallback(msk); - if (unlikely(ssock)) { - release_sock(sk); - pr_debug("fallback passthrough"); - ret = sock_sendmsg(ssock, msg); - return ret >= 0 ? ret + copied : (copied ? copied : ret); - } - pfrag = sk_page_frag(sk); restart: mptcp_clean_una(sk); + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { + ret = -EPIPE; + goto out; + } + wait_for_sndbuf: __mptcp_flush_join_list(msk); ssk = mptcp_subflow_get_send(msk); @@ -819,17 +934,6 @@ wait_for_sndbuf: } break; } - if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) { - /* Can happen for passive sockets: - * 3WHS negotiated MPTCP, but first packet after is - * plain TCP (e.g. due to middlebox filtering unknown - * options). - * - * Fall back to TCP. 
- */ - release_sock(ssk); - goto fallback; - } copied += ret; @@ -880,7 +984,6 @@ wait_for_sndbuf: mptcp_set_timeout(sk, ssk); if (copied) { - ret = copied; tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, size_goal); @@ -893,7 +996,7 @@ wait_for_sndbuf: release_sock(ssk); out: release_sock(sk); - return ret; + return copied ? : ret; } static void mptcp_wait_data(struct sock *sk, long *timeo) @@ -949,6 +1052,100 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, return copied; } +/* receive buffer autotuning. See tcp_rcv_space_adjust for more information. + * + * Only difference: Use highest rtt estimate of the subflows in use. + */ +static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + u32 time, advmss = 1; + u64 rtt_us, mstamp; + + sock_owned_by_me(sk); + + if (copied <= 0) + return; + + msk->rcvq_space.copied += copied; + + mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC); + time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time); + + rtt_us = msk->rcvq_space.rtt_us; + if (rtt_us && time < (rtt_us >> 3)) + return; + + rtt_us = 0; + mptcp_for_each_subflow(msk, subflow) { + const struct tcp_sock *tp; + u64 sf_rtt_us; + u32 sf_advmss; + + tp = tcp_sk(mptcp_subflow_tcp_sock(subflow)); + + sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us); + sf_advmss = READ_ONCE(tp->advmss); + + rtt_us = max(sf_rtt_us, rtt_us); + advmss = max(sf_advmss, advmss); + } + + msk->rcvq_space.rtt_us = rtt_us; + if (time < (rtt_us >> 3) || rtt_us == 0) + return; + + if (msk->rcvq_space.copied <= msk->rcvq_space.space) + goto new_measure; + + if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && + !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { + int rcvmem, rcvbuf; + u64 rcvwin, grow; + + rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; + + grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space); + + do_div(grow, msk->rcvq_space.space); + rcvwin += (grow << 1); + + rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER); + while (tcp_win_from_space(sk, rcvmem) < advmss) + rcvmem += 128; + + do_div(rcvwin, advmss); + rcvbuf = min_t(u64, rcvwin * rcvmem, + sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); + + if (rcvbuf > sk->sk_rcvbuf) { + u32 window_clamp; + + window_clamp = tcp_win_from_space(sk, rcvbuf); + WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); + + /* Make subflows follow along. If we do not do this, we + * get drops at subflow level if skbs can't be moved to + * the mptcp rx queue fast enough (announced rcv_win can + * exceed ssk->sk_rcvbuf). 
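
The computation in mptcp_rcv_space_adjust() above follows tcp_rcv_space_adjust(): size the window to twice the bytes consumed in the last RTT, grow it by the measured increase, convert to skb truesize units, and clamp at tcp_rmem[2]. A rough userspace rendering of the arithmetic; all values are made-up samples, and the truesize estimate stands in for SKB_TRUESIZE():

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t copied   = 256 * 1024;	/* bytes copied this window */
	uint64_t space    = 128 * 1024;	/* previous measurement */
	uint64_t advmss   = 1460;	/* largest subflow advmss */
	uint64_t rmem_max = 6291456;	/* sample tcp_rmem[2] */

	/* rcvwin = 2 * copied + slack, then grown by the copied/space ratio */
	uint64_t rcvwin = (copied << 1) + 16 * advmss;
	uint64_t grow = rcvwin * (copied - space) / space;

	rcvwin += grow << 1;

	uint64_t rcvmem = advmss + 256;	/* crude stand-in for SKB_TRUESIZE() */
	uint64_t rcvbuf = rcvwin / advmss * rcvmem;

	if (rcvbuf > rmem_max)
		rcvbuf = rmem_max;

	printf("rcvbuf=%llu\n", (unsigned long long)rcvbuf);
	return 0;
}
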
+ */ + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk; + + ssk = mptcp_subflow_tcp_sock(subflow); + WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf); + tcp_sk(ssk)->window_clamp = window_clamp; + } + } + } + + msk->rcvq_space.space = msk->rcvq_space.copied; +new_measure: + msk->rcvq_space.copied = 0; + msk->rcvq_space.time = mstamp; +} + static bool __mptcp_move_skbs(struct mptcp_sock *msk) { unsigned int moved = 0; @@ -972,7 +1169,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *ssock; int copied = 0; int target; long timeo; @@ -981,16 +1177,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return -EOPNOTSUPP; lock_sock(sk); - ssock = __mptcp_tcp_fallback(msk); - if (unlikely(ssock)) { -fallback: - release_sock(sk); - pr_debug("fallback-read subflow=%p", - mptcp_subflow_ctx(ssock->sk)); - copied = sock_recvmsg(ssock, msg, flags); - return copied; - } - timeo = sock_rcvtimeo(sk, nonblock); len = min_t(size_t, len, INT_MAX); @@ -1056,9 +1242,6 @@ fallback: pr_debug("block timeout %ld", timeo); mptcp_wait_data(sk, &timeo); - ssock = __mptcp_tcp_fallback(msk); - if (unlikely(ssock)) - goto fallback; } if (skb_queue_empty(&sk->sk_receive_queue)) { @@ -1075,6 +1258,8 @@ fallback: set_bit(MPTCP_DATA_READY, &msk->flags); } out_err: + mptcp_rcv_space_adjust(msk, copied); + release_sock(sk); return copied; } @@ -1083,7 +1268,7 @@ static void mptcp_retransmit_handler(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - if (atomic64_read(&msk->snd_una) == msk->write_seq) { + if (atomic64_read(&msk->snd_una) == READ_ONCE(msk->write_seq)) { mptcp_stop_timer(sk); } else { set_bit(MPTCP_WORK_RTX, &msk->flags); @@ -1172,6 +1357,29 @@ static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) return 0; } +static void pm_work(struct mptcp_sock *msk) +{ + struct mptcp_pm_data *pm = &msk->pm; + + spin_lock_bh(&msk->pm.lock); + + pr_debug("msk=%p status=%x", msk, pm->status); + if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { + pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); + mptcp_pm_nl_add_addr_received(msk); + } + if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { + pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); + mptcp_pm_nl_fully_established(msk); + } + if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { + pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); + mptcp_pm_nl_subflow_established(msk); + } + + spin_unlock_bh(&msk->pm.lock); +} + static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); @@ -1185,12 +1393,18 @@ static void mptcp_worker(struct work_struct *work) lock_sock(sk); mptcp_clean_una(sk); + mptcp_check_data_fin_ack(sk); __mptcp_flush_join_list(msk); __mptcp_move_skbs(msk); + if (msk->pm.status) + pm_work(msk); + if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) mptcp_check_for_eof(msk); + mptcp_check_data_fin(sk); + if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) goto unlock; @@ -1283,7 +1497,12 @@ static int mptcp_init_sock(struct sock *sk) if (ret) return ret; + ret = __mptcp_socket_create(mptcp_sk(sk)); + if (ret) + return ret; + sk_sockets_allocated_inc(sk); + sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2]; return 0; @@ -1308,8 +1527,7 @@ static void mptcp_cancel_work(struct sock *sk) sock_put(sk); } -static void mptcp_subflow_shutdown(struct sock *ssk, int how, - bool data_fin_tx_enable, u64 
data_fin_tx_seq) +static void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) { lock_sock(ssk); @@ -1322,36 +1540,84 @@ static void mptcp_subflow_shutdown(struct sock *ssk, int how, tcp_disconnect(ssk, O_NONBLOCK); break; default: - if (data_fin_tx_enable) { - struct mptcp_subflow_context *subflow; - - subflow = mptcp_subflow_ctx(ssk); - subflow->data_fin_tx_seq = data_fin_tx_seq; - subflow->data_fin_tx_enable = 1; + if (__mptcp_check_fallback(mptcp_sk(sk))) { + pr_debug("Fallback"); + ssk->sk_shutdown |= how; + tcp_shutdown(ssk, how); + } else { + pr_debug("Sending DATA_FIN on subflow %p", ssk); + mptcp_set_timeout(sk, ssk); + tcp_send_ack(ssk); } - - ssk->sk_shutdown |= how; - tcp_shutdown(ssk, how); break; } - /* Wake up anyone sleeping in poll. */ - ssk->sk_state_change(ssk); release_sock(ssk); } -/* Called with msk lock held, releases such lock before returning */ +static const unsigned char new_state[16] = { + /* current state: new state: action: */ + [0 /* (Invalid) */] = TCP_CLOSE, + [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, + [TCP_SYN_SENT] = TCP_CLOSE, + [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, + [TCP_FIN_WAIT1] = TCP_FIN_WAIT1, + [TCP_FIN_WAIT2] = TCP_FIN_WAIT2, + [TCP_TIME_WAIT] = TCP_CLOSE, /* should not happen ! */ + [TCP_CLOSE] = TCP_CLOSE, + [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, + [TCP_LAST_ACK] = TCP_LAST_ACK, + [TCP_LISTEN] = TCP_CLOSE, + [TCP_CLOSING] = TCP_CLOSING, + [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ +}; + +static int mptcp_close_state(struct sock *sk) +{ + int next = (int)new_state[sk->sk_state]; + int ns = next & TCP_STATE_MASK; + + inet_sk_state_store(sk, ns); + + return next & TCP_ACTION_FIN; +} + static void mptcp_close(struct sock *sk, long timeout) { struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); LIST_HEAD(conn_list); - u64 data_fin_tx_seq; lock_sock(sk); + sk->sk_shutdown = SHUTDOWN_MASK; + + if (sk->sk_state == TCP_LISTEN) { + inet_sk_state_store(sk, TCP_CLOSE); + goto cleanup; + } else if (sk->sk_state == TCP_CLOSE) { + goto cleanup; + } + + if (__mptcp_check_fallback(msk)) { + goto update_state; + } else if (mptcp_close_state(sk)) { + pr_debug("Sending DATA_FIN sk=%p", sk); + WRITE_ONCE(msk->write_seq, msk->write_seq + 1); + WRITE_ONCE(msk->snd_data_fin_enable, 1); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + + mptcp_subflow_shutdown(sk, tcp_sk, SHUTDOWN_MASK); + } + } + + sk_stream_wait_close(sk, timeout); +update_state: inet_sk_state_store(sk, TCP_CLOSE); +cleanup: /* be sure to always acquire the join list lock, to sync vs * mptcp_finish_join(). 
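
new_state[] above is the same close table TCP uses in tcp_close_state(): the low nibble is the next state, and TCP_ACTION_FIN in the high bits tells the caller that a (DATA_)FIN must be sent for this transition. The lookup, reduced to a compilable excerpt with the constants inlined:

#include <stdio.h>

enum { TCP_ESTABLISHED = 1, TCP_SYN_SENT, TCP_SYN_RECV, TCP_FIN_WAIT1,
       TCP_FIN_WAIT2, TCP_TIME_WAIT, TCP_CLOSE, TCP_CLOSE_WAIT,
       TCP_LAST_ACK, TCP_LISTEN, TCP_CLOSING, TCP_NEW_SYN_RECV };

#define TCP_STATE_MASK	0xF
#define TCP_ACTION_FIN	(1 << 7)

static const unsigned char new_state[16] = {
	[0 /* invalid */]  = TCP_CLOSE,
	[TCP_ESTABLISHED]  = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
	[TCP_SYN_SENT]     = TCP_CLOSE,
	[TCP_SYN_RECV]     = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
	[TCP_FIN_WAIT1]    = TCP_FIN_WAIT1,
	[TCP_FIN_WAIT2]    = TCP_FIN_WAIT2,
	[TCP_TIME_WAIT]    = TCP_CLOSE,
	[TCP_CLOSE]        = TCP_CLOSE,
	[TCP_CLOSE_WAIT]   = TCP_LAST_ACK | TCP_ACTION_FIN,
	[TCP_LAST_ACK]     = TCP_LAST_ACK,
	[TCP_LISTEN]       = TCP_CLOSE,
	[TCP_CLOSING]      = TCP_CLOSING,
	[TCP_NEW_SYN_RECV] = TCP_CLOSE,
};

int main(void)
{
	int next = new_state[TCP_ESTABLISHED];

	printf("next=%d send_fin=%d\n",
	       next & TCP_STATE_MASK, !!(next & TCP_ACTION_FIN));
	return 0;
}
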
*/ @@ -1360,22 +1626,16 @@ static void mptcp_close(struct sock *sk, long timeout) spin_unlock_bh(&msk->join_list_lock); list_splice_init(&msk->conn_list, &conn_list); - data_fin_tx_seq = msk->write_seq; - __mptcp_clear_xmit(sk); release_sock(sk); list_for_each_entry_safe(subflow, tmp, &conn_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - - subflow->data_fin_tx_seq = data_fin_tx_seq; - subflow->data_fin_tx_enable = 1; __mptcp_close_ssk(sk, ssk, subflow, timeout); } mptcp_cancel_work(sk); - mptcp_pm_close(msk); __skb_queue_purge(&sk->sk_receive_queue); @@ -1447,20 +1707,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk->local_key = subflow_req->local_key; msk->token = subflow_req->token; msk->subflow = NULL; - - if (unlikely(mptcp_token_new_accept(subflow_req->token, nsk))) { - nsk->sk_state = TCP_CLOSE; - bh_unlock_sock(nsk); - - /* we can't call into mptcp_close() here - possible BH context - * free the sock directly. - * sk_clone_lock() sets nsk refcnt to two, hence call sk_free() - * too. - */ - sk_common_release(nsk); - sk_free(nsk); - return NULL; - } + WRITE_ONCE(msk->fully_established, false); msk->write_seq = subflow_req->idsn + 1; atomic64_set(&msk->snd_una, msk->write_seq); @@ -1482,6 +1729,22 @@ struct sock *mptcp_sk_clone(const struct sock *sk, return nsk; } +void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) +{ + const struct tcp_sock *tp = tcp_sk(ssk); + + msk->rcvq_space.copied = 0; + msk->rcvq_space.rtt_us = 0; + + msk->rcvq_space.time = tp->tcp_mstamp; + + /* initial rcv_space offering made to peer */ + msk->rcvq_space.space = min_t(u32, tp->rcv_wnd, + TCP_INIT_CWND * tp->advmss); + if (msk->rcvq_space.space == 0) + msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; +} + static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, bool kern) { @@ -1501,7 +1764,6 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, return NULL; pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk)); - if (sk_is_mptcp(newsk)) { struct mptcp_subflow_context *subflow; struct sock *new_mptcp_sock; @@ -1529,8 +1791,8 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, newsk = new_mptcp_sock; mptcp_copy_inaddrs(newsk, ssk); list_add(&subflow->node, &msk->conn_list); - inet_sk_state_store(newsk, TCP_ESTABLISHED); + mptcp_rcv_space_init(msk, ssk); bh_unlock_sock(new_mptcp_sock); __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK); @@ -1547,21 +1809,82 @@ static void mptcp_destroy(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - mptcp_token_destroy(msk->token); + mptcp_token_destroy(msk); if (msk->cached_ext) __skb_ext_put(msk->cached_ext); sk_sockets_allocated_dec(sk); } +static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct sock *sk = (struct sock *)msk; + struct socket *ssock; + int ret; + + switch (optname) { + case SO_REUSEPORT: + case SO_REUSEADDR: + lock_sock(sk); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + release_sock(sk); + return -EINVAL; + } + + ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen); + if (ret == 0) { + if (optname == SO_REUSEPORT) + sk->sk_reuseport = ssock->sk->sk_reuseport; + else if (optname == SO_REUSEADDR) + sk->sk_reuse = ssock->sk->sk_reuse; + } + release_sock(sk); + return ret; + } + + return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen); +} + +static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int 
optname, + sockptr_t optval, unsigned int optlen) +{ + struct sock *sk = (struct sock *)msk; + int ret = -EOPNOTSUPP; + struct socket *ssock; + + switch (optname) { + case IPV6_V6ONLY: + lock_sock(sk); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + release_sock(sk); + return -EINVAL; + } + + ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen); + if (ret == 0) + sk->sk_ipv6only = ssock->sk->sk_ipv6only; + + release_sock(sk); + break; + } + + return ret; +} + static int mptcp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *ssock; + struct sock *ssk; pr_debug("msk=%p", msk); + if (level == SOL_SOCKET) + return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen); + /* @@ the meaning of setsockopt() when the socket is connected and * there are multiple subflows is not yet defined. It is up to the * MPTCP-level socket to configure the subflows until the subflow @@ -1569,11 +1892,13 @@ static int mptcp_setsockopt(struct sock *sk, int level, int optname, * to the one remaining subflow. */ lock_sock(sk); - ssock = __mptcp_tcp_fallback(msk); + ssk = __mptcp_tcp_fallback(msk); release_sock(sk); - if (ssock) - return tcp_setsockopt(ssock->sk, level, optname, optval, - optlen); + if (ssk) + return tcp_setsockopt(ssk, level, optname, optval, optlen); + + if (level == SOL_IPV6) + return mptcp_setsockopt_v6(msk, optname, optval, optlen); return -EOPNOTSUPP; } @@ -1582,7 +1907,7 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *option) { struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *ssock; + struct sock *ssk; pr_debug("msk=%p", msk); @@ -1593,11 +1918,10 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname, * to the one remaining subflow. 
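
mptcp_setsockopt_sol_socket() above applies SO_REUSEADDR/SO_REUSEPORT on the initial subflow socket and then mirrors the resulting flag back onto the MPTCP socket, so both levels stay consistent. The same set-then-read-back pattern against a plain socket, as a userspace sketch:

#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int one = 1, got = 0;
	socklen_t len = sizeof(got);

	if (fd < 0 ||
	    setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) < 0) {
		perror("setsockopt");
		return 1;
	}

	/* read the option back, as the msk mirrors ssock->sk->sk_reuse */
	getsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &got, &len);
	printf("SO_REUSEADDR=%d\n", got);

	close(fd);
	return 0;
}
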
*/ lock_sock(sk); - ssock = __mptcp_tcp_fallback(msk); + ssk = __mptcp_tcp_fallback(msk); release_sock(sk); - if (ssock) - return tcp_getsockopt(ssock->sk, level, optname, optval, - option); + if (ssk) + return tcp_getsockopt(ssk, level, optname, optval, option); return -EOPNOTSUPP; } @@ -1636,6 +1960,20 @@ static void mptcp_release_cb(struct sock *sk) } } +static int mptcp_hash(struct sock *sk) +{ + /* should never be called, + * we hash the TCP subflows not the master socket + */ + WARN_ON_ONCE(1); + return 0; +} + +static void mptcp_unhash(struct sock *sk) +{ + /* called from sk_common_release(), but nothing to do here */ +} + static int mptcp_get_port(struct sock *sk, unsigned short snum) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -1660,32 +1998,26 @@ void mptcp_finish_connect(struct sock *ssk) sk = subflow->conn; msk = mptcp_sk(sk); - if (!subflow->mp_capable) { - MPTCP_INC_STATS(sock_net(sk), - MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); - return; - } - pr_debug("msk=%p, token=%u", sk, subflow->token); mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq); ack_seq++; subflow->map_seq = ack_seq; subflow->map_subflow_seq = 1; - subflow->rel_write_seq = 1; /* the socket is not connected yet, no msk/subflow ops can access/race * accessing the field below */ WRITE_ONCE(msk->remote_key, subflow->remote_key); WRITE_ONCE(msk->local_key, subflow->local_key); - WRITE_ONCE(msk->token, subflow->token); WRITE_ONCE(msk->write_seq, subflow->idsn + 1); WRITE_ONCE(msk->ack_seq, ack_seq); WRITE_ONCE(msk->can_ack, 1); atomic64_set(&msk->snd_una, msk->write_seq); mptcp_pm_new_connection(msk, 0); + + mptcp_rcv_space_init(msk, ssk); } static void mptcp_sock_graft(struct sock *sk, struct socket *parent) @@ -1708,7 +2040,7 @@ bool mptcp_finish_join(struct sock *sk) pr_debug("msk=%p, subflow=%p", msk, subflow); /* mptcp socket already closing? 
*/ - if (inet_sk_state_load(parent) != TCP_ESTABLISHED) + if (!mptcp_is_fully_established(parent)) return false; if (!msk->pm.server_side) @@ -1761,8 +2093,8 @@ static struct proto mptcp_prot = { .sendmsg = mptcp_sendmsg, .recvmsg = mptcp_recvmsg, .release_cb = mptcp_release_cb, - .hash = inet_hash, - .unhash = inet_unhash, + .hash = mptcp_hash, + .unhash = mptcp_unhash, .get_port = mptcp_get_port, .sockets_allocated = &mptcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, @@ -1771,6 +2103,7 @@ static struct proto mptcp_prot = { .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_mem = sysctl_tcp_mem, .obj_size = sizeof(struct mptcp_sock), + .slab_flags = SLAB_TYPESAFE_BY_RCU, .no_autobind = true, }; @@ -1781,9 +2114,9 @@ static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) int err; lock_sock(sock->sk); - ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); - if (IS_ERR(ssock)) { - err = PTR_ERR(ssock); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + err = -EINVAL; goto unlock; } @@ -1796,10 +2129,18 @@ unlock: return err; } +static void mptcp_subflow_early_fallback(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow) +{ + subflow->request_mptcp = 0; + __mptcp_do_fallback(msk); +} + static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct mptcp_subflow_context *subflow; struct socket *ssock; int err; @@ -1812,19 +2153,24 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, goto do_connect; } - ssock = __mptcp_socket_create(msk, TCP_SYN_SENT); - if (IS_ERR(ssock)) { - err = PTR_ERR(ssock); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + err = -EINVAL; goto unlock; } + mptcp_token_destroy(msk); + inet_sk_state_store(sock->sk, TCP_SYN_SENT); + subflow = mptcp_subflow_ctx(ssock->sk); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of * TCP option space. */ if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) - mptcp_subflow_ctx(ssock->sk)->request_mptcp = 0; + mptcp_subflow_early_fallback(msk, subflow); #endif + if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) + mptcp_subflow_early_fallback(msk, subflow); do_connect: err = ssock->ops->connect(ssock, uaddr, addr_len, flags); @@ -1843,42 +2189,6 @@ unlock: return err; } -static int mptcp_v4_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) -{ - if (sock->sk->sk_prot == &tcp_prot) { - /* we are being invoked from __sys_accept4, after - * mptcp_accept() has just accepted a non-mp-capable - * flow: sk is a tcp_sk, not an mptcp one. - * - * Hand the socket over to tcp so all further socket ops - * bypass mptcp. - */ - sock->ops = &inet_stream_ops; - } - - return inet_getname(sock, uaddr, peer); -} - -#if IS_ENABLED(CONFIG_MPTCP_IPV6) -static int mptcp_v6_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) -{ - if (sock->sk->sk_prot == &tcpv6_prot) { - /* we are being invoked from __sys_accept4 after - * mptcp_accept() has accepted a non-mp-capable - * subflow: sk is a tcp_sk, not mptcp. - * - * Hand the socket over to tcp so all further - * socket ops bypass mptcp. 
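
mptcp_subflow_early_fallback() above clears request_mptcp and latches the fallback state before connect() even starts, so MD5SIG or token-allocation failure degrades the socket to plain TCP up front. The latch itself is a single one-way flag bit; modeled here with C11 atomics instead of the kernel's test_bit/set_bit:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define MPTCP_FALLBACK_DONE 4	/* same bit number as in protocol.h */

static atomic_ulong flags;

static bool check_fallback(void)
{
	return atomic_load(&flags) & (1UL << MPTCP_FALLBACK_DONE);
}

static void do_fallback(void)
{
	if (check_fallback())
		return;	/* one-way: never cleared once set */
	atomic_fetch_or(&flags, 1UL << MPTCP_FALLBACK_DONE);
}

int main(void)
{
	do_fallback();
	do_fallback();	/* idempotent */
	printf("fallen back: %d\n", check_fallback());
	return 0;
}
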
- */ - sock->ops = &inet6_stream_ops; - } - - return inet6_getname(sock, uaddr, peer); -} -#endif - static int mptcp_listen(struct socket *sock, int backlog) { struct mptcp_sock *msk = mptcp_sk(sock->sk); @@ -1888,12 +2198,14 @@ static int mptcp_listen(struct socket *sock, int backlog) pr_debug("msk=%p", msk); lock_sock(sock->sk); - ssock = __mptcp_socket_create(msk, TCP_LISTEN); - if (IS_ERR(ssock)) { - err = PTR_ERR(ssock); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + err = -EINVAL; goto unlock; } + mptcp_token_destroy(msk); + inet_sk_state_store(sock->sk, TCP_LISTEN); sock_set_flag(sock->sk, SOCK_RCU_FREE); err = ssock->ops->listen(ssock, backlog); @@ -1906,15 +2218,6 @@ unlock: return err; } -static bool is_tcp_proto(const struct proto *p) -{ -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - return p == &tcp_prot || p == &tcpv6_prot; -#else - return p == &tcp_prot; -#endif -} - static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { @@ -1932,11 +2235,12 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (!ssock) goto unlock_fail; + clear_bit(MPTCP_DATA_READY, &msk->flags); sock_hold(ssock->sk); release_sock(sock->sk); err = ssock->ops->accept(sock, newsock, flags, kern); - if (err == 0 && !is_tcp_proto(newsock->sk->sk_prot)) { + if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) { struct mptcp_sock *msk = mptcp_sk(newsock->sk); struct mptcp_subflow_context *subflow; @@ -1944,7 +2248,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, * This is needed so NOSPACE flag can be set from tcp stack. */ __mptcp_flush_join_list(msk); - list_for_each_entry(subflow, &msk->conn_list, node) { + mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (!ssk->sk_socket) @@ -1952,6 +2256,8 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, } } + if (inet_csk_listen_poll(ssock->sk)) + set_bit(MPTCP_DATA_READY, &msk->flags); sock_put(ssock->sk); return err; @@ -1960,39 +2266,36 @@ unlock_fail: return -EINVAL; } +static __poll_t mptcp_check_readable(struct mptcp_sock *msk) +{ + return test_bit(MPTCP_DATA_READY, &msk->flags) ? 
EPOLLIN | EPOLLRDNORM : + 0; +} + static __poll_t mptcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { struct sock *sk = sock->sk; struct mptcp_sock *msk; - struct socket *ssock; __poll_t mask = 0; + int state; msk = mptcp_sk(sk); - lock_sock(sk); - ssock = __mptcp_tcp_fallback(msk); - if (!ssock) - ssock = __mptcp_nmpc_socket(msk); - if (ssock) { - mask = ssock->ops->poll(file, ssock, wait); - release_sock(sk); - return mask; - } - - release_sock(sk); sock_poll_wait(file, sock, wait); - lock_sock(sk); - if (test_bit(MPTCP_DATA_READY, &msk->flags)) - mask = EPOLLIN | EPOLLRDNORM; - if (sk_stream_is_writeable(sk) && - test_bit(MPTCP_SEND_SPACE, &msk->flags)) - mask |= EPOLLOUT | EPOLLWRNORM; + state = inet_sk_state_load(sk); + if (state == TCP_LISTEN) + return mptcp_check_readable(msk); + + if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { + mask |= mptcp_check_readable(msk); + if (sk_stream_is_writeable(sk) && + test_bit(MPTCP_SEND_SPACE, &msk->flags)) + mask |= EPOLLOUT | EPOLLWRNORM; + } if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - release_sock(sk); - return mask; } @@ -2000,23 +2303,13 @@ static int mptcp_shutdown(struct socket *sock, int how) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct mptcp_subflow_context *subflow; - struct socket *ssock; int ret = 0; pr_debug("sk=%p, how=%d", msk, how); lock_sock(sock->sk); - ssock = __mptcp_tcp_fallback(msk); - if (ssock) { - release_sock(sock->sk); - return inet_shutdown(ssock, how); - } - - if (how == SHUT_WR || how == SHUT_RDWR) - inet_sk_state_store(sock->sk, TCP_FIN_WAIT1); how++; - if ((how & ~SHUTDOWN_MASK) || !how) { ret = -EINVAL; goto out_unlock; @@ -2030,13 +2323,36 @@ static int mptcp_shutdown(struct socket *sock, int how) sock->state = SS_CONNECTED; } - __mptcp_flush_join_list(msk); - mptcp_for_each_subflow(msk, subflow) { - struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + /* If we've already sent a FIN, or it's a closed state, skip this. */ + if (__mptcp_check_fallback(msk)) { + if (how == SHUT_WR || how == SHUT_RDWR) + inet_sk_state_store(sock->sk, TCP_FIN_WAIT1); - mptcp_subflow_shutdown(tcp_sk, how, 1, msk->write_seq); + mptcp_for_each_subflow(msk, subflow) { + struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + + mptcp_subflow_shutdown(sock->sk, tcp_sk, how); + } + } else if ((how & SEND_SHUTDOWN) && + ((1 << sock->sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_SYN_SENT | + TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) && + mptcp_close_state(sock->sk)) { + __mptcp_flush_join_list(msk); + + WRITE_ONCE(msk->write_seq, msk->write_seq + 1); + WRITE_ONCE(msk->snd_data_fin_enable, 1); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + + mptcp_subflow_shutdown(sock->sk, tcp_sk, how); + } } + /* Wake up anyone sleeping in poll. 
*/ + sock->sk->sk_state_change(sock->sk); + out_unlock: release_sock(sock->sk); @@ -2051,7 +2367,7 @@ static const struct proto_ops mptcp_stream_ops = { .connect = mptcp_stream_connect, .socketpair = sock_no_socketpair, .accept = mptcp_stream_accept, - .getname = mptcp_v4_getname, + .getname = inet_getname, .poll = mptcp_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, @@ -2063,10 +2379,6 @@ static const struct proto_ops mptcp_stream_ops = { .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = inet_sendpage, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, -#endif }; static struct inet_protosw mptcp_protosw = { @@ -2077,7 +2389,7 @@ static struct inet_protosw mptcp_protosw = { .flags = INET_PROTOSW_ICSK, }; -void mptcp_proto_init(void) +void __init mptcp_proto_init(void) { mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; @@ -2086,6 +2398,7 @@ void mptcp_proto_init(void) mptcp_subflow_init(); mptcp_pm_init(); + mptcp_token_init(); if (proto_register(&mptcp_prot, 1) != 0) panic("Failed to register MPTCP proto.\n"); @@ -2104,7 +2417,7 @@ static const struct proto_ops mptcp_v6_stream_ops = { .connect = mptcp_stream_connect, .socketpair = sock_no_socketpair, .accept = mptcp_stream_accept, - .getname = mptcp_v6_getname, + .getname = inet6_getname, .poll = mptcp_poll, .ioctl = inet6_ioctl, .gettstamp = sock_gettstamp, @@ -2118,8 +2431,6 @@ static const struct proto_ops mptcp_v6_stream_ops = { .sendpage = inet_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, #endif }; @@ -2139,7 +2450,7 @@ static struct inet_protosw mptcp_v6_protosw = { .flags = INET_PROTOSW_ICSK, }; -int mptcp_proto_v6_init(void) +int __init mptcp_proto_v6_init(void) { int err; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index c6eeaf3e8dcb..60b27d44c184 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -89,6 +89,7 @@ #define MPTCP_SEND_SPACE 1 #define MPTCP_WORK_RTX 2 #define MPTCP_WORK_EOF 3 +#define MPTCP_FALLBACK_DONE 4 struct mptcp_options_received { u64 sndr_key; @@ -173,8 +174,6 @@ struct mptcp_pm_data { u8 local_addr_max; u8 subflows_max; u8 status; - - struct work_struct work; }; struct mptcp_data_frag { @@ -194,11 +193,15 @@ struct mptcp_sock { u64 remote_key; u64 write_seq; u64 ack_seq; + u64 rcv_data_fin_seq; atomic64_t snd_una; unsigned long timer_ival; u32 token; unsigned long flags; bool can_ack; + bool fully_established; + bool rcv_data_fin; + bool snd_data_fin_enable; spinlock_t join_list_lock; struct work_struct work; struct list_head conn_list; @@ -208,6 +211,12 @@ struct mptcp_sock { struct socket *subflow; /* outgoing connect/listener/!mp_capable */ struct sock *first; struct mptcp_pm_data pm; + struct { + u32 space; /* bytes copied in last measurement window */ + u32 copied; /* bytes copied in this measurement window */ + u64 time; /* start time of measurement window */ + u64 rtt_us; /* last maximum rtt of subflows */ + } rcvq_space; }; #define mptcp_for_each_subflow(__msk, __subflow) \ @@ -250,6 +259,7 @@ struct mptcp_subflow_request_sock { u32 local_nonce; u32 remote_nonce; struct mptcp_sock *msk; + struct hlist_nulls_node token_node; }; static inline struct mptcp_subflow_request_sock * @@ -284,10 +294,8 @@ struct mptcp_subflow_context { backup : 1, data_avail : 1, rx_eof : 1, - data_fin_tx_enable : 1, use_64bit_ack : 1, /* Set when we received a 64-bit DSN */ can_ack 
: 1; /* only after processing the remote key */ u32 remote_nonce; u64 thmac; u32 local_nonce; @@ -336,8 +344,10 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) } int mptcp_is_enabled(struct net *net); +void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, + struct mptcp_options_received *mp_opt); bool mptcp_subflow_data_available(struct sock *sk); -void mptcp_subflow_init(void); +void __init mptcp_subflow_init(void); /* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, int ifindex, @@ -355,14 +365,9 @@ static inline void mptcp_subflow_tcp_fallback(struct sock *sk, inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops; } -extern const struct inet_connection_sock_af_ops ipv4_specific; +void __init mptcp_proto_init(void); #if IS_ENABLED(CONFIG_MPTCP_IPV6) -extern const struct inet_connection_sock_af_ops ipv6_specific; -#endif - -void mptcp_proto_init(void); -#if IS_ENABLED(CONFIG_MPTCP_IPV6) -int mptcp_proto_v6_init(void); +int __init mptcp_proto_v6_init(void); #endif struct sock *mptcp_sk_clone(const struct sock *sk, @@ -372,36 +377,41 @@ void mptcp_get_options(const struct sk_buff *skb, struct mptcp_options_received *mp_opt); void mptcp_finish_connect(struct sock *sk); +static inline bool mptcp_is_fully_established(struct sock *sk) +{ + return inet_sk_state_load(sk) == TCP_ESTABLISHED && + READ_ONCE(mptcp_sk(sk)->fully_established); +} +void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); void mptcp_data_acked(struct sock *sk); void mptcp_subflow_eof(struct sock *sk); +bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq); + +void __init mptcp_token_init(void); +static inline void mptcp_token_init_request(struct request_sock *req) +{ + mptcp_subflow_rsk(req)->token_node.pprev = NULL; +} int mptcp_token_new_request(struct request_sock *req); -void mptcp_token_destroy_request(u32 token); +void mptcp_token_destroy_request(struct request_sock *req); int mptcp_token_new_connect(struct sock *sk); -int mptcp_token_new_accept(u32 token, struct sock *conn); +void mptcp_token_accept(struct mptcp_subflow_request_sock *r, + struct mptcp_sock *msk); +bool mptcp_token_exists(u32 token); struct mptcp_sock *mptcp_token_get_sock(u32 token); -void mptcp_token_destroy(u32 token); +struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot, + long *s_num); +void mptcp_token_destroy(struct mptcp_sock *msk); void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn); -static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) -{ - /* we might consider a faster version that computes the key as a - * hash of some information available in the MPTCP socket. Use - * random data at the moment, as it's probably the safest option - * in case multiple sockets are opened in different namespaces at - * the same time.
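
The removed mptcp_crypto_key_gen_sha() helper shows the shape of key-material derivation: hash the random 64-bit key, take 32 bits of the digest as the connection token and 64 bits as the initial data sequence number. The kernel uses SHA-256; the mixer below is an illustrative stand-in, not the kernel function:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* splitmix64: illustrative substitute for the SHA-256 digest */
static uint64_t mix64(uint64_t x)
{
	x += 0x9e3779b97f4a7c15ULL;
	x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL;
	x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL;
	return x ^ (x >> 31);
}

static void key_sha(uint64_t key, uint32_t *token, uint64_t *idsn)
{
	uint64_t h = mix64(key);

	if (token)
		*token = (uint32_t)(h >> 32);	/* 32 bits of the digest */
	if (idsn)
		*idsn = mix64(h);		/* 64 more digest bits */
}

int main(void)
{
	uint64_t key = ((uint64_t)rand() << 32) | (uint64_t)rand();
	uint64_t idsn;
	uint32_t token;

	key_sha(key, &token, &idsn);
	printf("key=%llx token=%x idsn=%llx\n", (unsigned long long)key,
	       (unsigned)token, (unsigned long long)idsn);
	return 0;
}
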
- */ - get_random_bytes(key, sizeof(u64)); - mptcp_crypto_key_sha(*key, token, idsn); -} void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac); -void mptcp_pm_init(void); +void __init mptcp_pm_init(void); void mptcp_pm_data_init(struct mptcp_sock *msk); -void mptcp_pm_close(struct mptcp_sock *msk); void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side); void mptcp_pm_fully_established(struct mptcp_sock *msk); bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); @@ -433,7 +443,7 @@ bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_addr_info *saddr); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); -void mptcp_pm_nl_init(void); +void __init mptcp_pm_nl_init(void); void mptcp_pm_nl_data_init(struct mptcp_sock *msk); void mptcp_pm_nl_fully_established(struct mptcp_sock *msk); void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk); @@ -454,4 +464,66 @@ static inline bool before64(__u64 seq1, __u64 seq2) void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops); +static inline bool __mptcp_check_fallback(struct mptcp_sock *msk) +{ + return test_bit(MPTCP_FALLBACK_DONE, &msk->flags); +} + +static inline bool mptcp_check_fallback(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + + return __mptcp_check_fallback(msk); +} + +static inline void __mptcp_do_fallback(struct mptcp_sock *msk) +{ + if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) { + pr_debug("TCP fallback already done (msk=%p)", msk); + return; + } + set_bit(MPTCP_FALLBACK_DONE, &msk->flags); +} + +static inline void mptcp_do_fallback(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + + __mptcp_do_fallback(msk); +} + +#define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)", __func__, a) + +static inline bool subflow_simultaneous_connect(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct sock *parent = subflow->conn; + + return sk->sk_state == TCP_ESTABLISHED && + !mptcp_sk(parent)->pm.server_side && + !subflow->conn_finished; +} + +#ifdef CONFIG_SYN_COOKIES +void subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req, + struct sk_buff *skb); +bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req, + struct sk_buff *skb); +void __init mptcp_join_cookie_init(void); +#else +static inline void +subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req, + struct sk_buff *skb) {} +static inline bool +mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req, + struct sk_buff *skb) +{ + return false; +} + +static inline void mptcp_join_cookie_init(void) {} +#endif + #endif /* __MPTCP_PROTOCOL_H */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 3838a0b3a21f..96f4f2fe50ad 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -29,40 +29,6 @@ static void SUBFLOW_REQ_INC_STATS(struct request_sock *req, MPTCP_INC_STATS(sock_net(req_to_sk(req)), field); } -static int subflow_rebuild_header(struct sock *sk) -{ - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - int local_id, err = 0; - - if (subflow->request_mptcp && !subflow->token) { - pr_debug("subflow=%p", sk); - err = mptcp_token_new_connect(sk); - } else if (subflow->request_join && !subflow->local_nonce) { - struct 
mptcp_sock *msk = (struct mptcp_sock *)subflow->conn; - - pr_debug("subflow=%p", sk); - - do { - get_random_bytes(&subflow->local_nonce, sizeof(u32)); - } while (!subflow->local_nonce); - - if (subflow->local_id) - goto out; - - local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)sk); - if (local_id < 0) - return -EINVAL; - - subflow->local_id = local_id; - } - -out: - if (err) - return err; - - return subflow->icsk_af_ops->rebuild_header(sk); -} - static void subflow_req_destructor(struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); @@ -72,8 +38,7 @@ static void subflow_req_destructor(struct request_sock *req) if (subflow_req->msk) sock_put((struct sock *)subflow_req->msk); - if (subflow_req->mp_capable) - mptcp_token_destroy_request(subflow_req->token); + mptcp_token_destroy_request(req); tcp_request_sock_ops.destructor(req); } @@ -88,6 +53,12 @@ static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2, mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); } +static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk) +{ + return mptcp_is_fully_established((void *)msk) && + READ_ONCE(msk->pm.accept_subflow); +} + /* validate received token and create truncated hmac and nonce for SYN-ACK */ static struct mptcp_sock *subflow_token_join_request(struct request_sock *req, const struct sk_buff *skb) @@ -120,30 +91,43 @@ static struct mptcp_sock *subflow_token_join_request(struct request_sock *req, return msk; } -static void subflow_init_req(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb) +static int __subflow_init_req(struct request_sock *req, const struct sock *sk_listener) { - struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); - struct mptcp_options_received mp_opt; - - pr_debug("subflow_req=%p, listener=%p", subflow_req, listener); - - mptcp_get_options(skb, &mp_opt); subflow_req->mp_capable = 0; subflow_req->mp_join = 0; subflow_req->msk = NULL; + mptcp_token_init_request(req); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of * TCP option space. 
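
subflow_init_req() below retries key generation a few times when the derived token collides with a live connection (mptcp_token_exists()), then gives up and lets the connection proceed as plain TCP. The bounded-retry shape, reduced to userspace; the table is a toy stand-in for the token hash buckets:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define SLOTS 64
static bool table[SLOTS];	/* toy stand-in for the token hash table */

static bool token_exists(uint32_t token) { return table[token % SLOTS]; }

static int new_token(uint32_t *out)
{
	int retries = 4;

	do {
		uint32_t token = (uint32_t)rand();

		if (token != 0 && !token_exists(token)) {
			table[token % SLOTS] = true;
			*out = token;
			return 0;
		}
	} while (retries-- > 0);

	return -1;	/* caller falls back to plain TCP */
}

int main(void)
{
	uint32_t token;

	if (new_token(&token) == 0)
		printf("token=%u\n", token);
	return 0;
}
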
*/ if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) - return; + return -EINVAL; #endif + return 0; +} + +static void subflow_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) +{ + struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + struct mptcp_options_received mp_opt; + int ret; + + pr_debug("subflow_req=%p, listener=%p", subflow_req, listener); + + ret = __subflow_init_req(req, sk_listener); + if (ret) + return; + + mptcp_get_options(skb, &mp_opt); + if (mp_opt.mp_capable) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); @@ -154,13 +138,33 @@ static void subflow_init_req(struct request_sock *req, } if (mp_opt.mp_capable && listener->request_mptcp) { - int err; + int err, retries = 4; + + subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; +again: + do { + get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key)); + } while (subflow_req->local_key == 0); + + if (unlikely(req->syncookie)) { + mptcp_crypto_key_sha(subflow_req->local_key, + &subflow_req->token, + &subflow_req->idsn); + if (mptcp_token_exists(subflow_req->token)) { + if (retries-- > 0) + goto again; + } else { + subflow_req->mp_capable = 1; + } + return; + } err = mptcp_token_new_request(req); if (err == 0) subflow_req->mp_capable = 1; + else if (retries-- > 0) + goto again; - subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; } else if (mp_opt.mp_join && listener->request_mptcp) { subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; subflow_req->mp_join = 1; @@ -169,11 +173,60 @@ static void subflow_init_req(struct request_sock *req, subflow_req->token = mp_opt.token; subflow_req->remote_nonce = mp_opt.nonce; subflow_req->msk = subflow_token_join_request(req, skb); + + if (unlikely(req->syncookie) && subflow_req->msk) { + if (mptcp_can_accept_new_subflow(subflow_req->msk)) + subflow_init_req_cookie_join_save(subflow_req, skb); + } + pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token, subflow_req->remote_nonce, subflow_req->msk); } } +int mptcp_subflow_init_cookie_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) +{ + struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + struct mptcp_options_received mp_opt; + int err; + + err = __subflow_init_req(req, sk_listener); + if (err) + return err; + + mptcp_get_options(skb, &mp_opt); + + if (mp_opt.mp_capable && mp_opt.mp_join) + return -EINVAL; + + if (mp_opt.mp_capable && listener->request_mptcp) { + if (mp_opt.sndr_key == 0) + return -EINVAL; + + subflow_req->local_key = mp_opt.rcvr_key; + err = mptcp_token_new_request(req); + if (err) + return err; + + subflow_req->mp_capable = 1; + subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1; + } else if (mp_opt.mp_join && listener->request_mptcp) { + if (!mptcp_token_join_cookie_init_state(subflow_req, skb)) + return -EINVAL; + + if (mptcp_can_accept_new_subflow(subflow_req->msk)) + subflow_req->mp_join = 1; + + subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1; + } + + return 0; +} +EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req); + static void subflow_v4_init_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) @@ -222,7 +275,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_options_received 
mp_opt; struct sock *parent = subflow->conn; - struct tcp_sock *tp = tcp_sk(sk); subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); @@ -235,46 +287,40 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) if (subflow->conn_finished) return; + subflow->rel_write_seq = 1; subflow->conn_finished = 1; + subflow->ssn_offset = TCP_SKB_CB(skb)->seq; + pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset); mptcp_get_options(skb, &mp_opt); - if (subflow->request_mptcp && mp_opt.mp_capable) { + if (subflow->request_mptcp) { + if (!mp_opt.mp_capable) { + MPTCP_INC_STATS(sock_net(sk), + MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); + mptcp_do_fallback(sk); + pr_fallback(mptcp_sk(subflow->conn)); + goto fallback; + } + subflow->mp_capable = 1; subflow->can_ack = 1; subflow->remote_key = mp_opt.sndr_key; pr_debug("subflow=%p, remote_key=%llu", subflow, subflow->remote_key); - } else if (subflow->request_join && mp_opt.mp_join) { - subflow->mp_join = 1; + mptcp_finish_connect(sk); + } else if (subflow->request_join) { + u8 hmac[SHA256_DIGEST_SIZE]; + + if (!mp_opt.mp_join) + goto do_reset; + subflow->thmac = mp_opt.thmac; subflow->remote_nonce = mp_opt.nonce; pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow, subflow->thmac, subflow->remote_nonce); - } else if (subflow->request_mptcp) { - tp->is_mptcp = 0; - } - if (!tp->is_mptcp) - return; - - if (subflow->mp_capable) { - pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk), - subflow->remote_key); - mptcp_finish_connect(sk); - - if (skb) { - pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq); - subflow->ssn_offset = TCP_SKB_CB(skb)->seq; - } - } else if (subflow->mp_join) { - u8 hmac[SHA256_DIGEST_SIZE]; - - pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", - subflow, subflow->thmac, - subflow->remote_nonce); if (!subflow_thmac_valid(subflow)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC); - subflow->mp_join = 0; goto do_reset; } @@ -282,24 +328,26 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->local_nonce, subflow->remote_nonce, hmac); - memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN); - if (skb) - subflow->ssn_offset = TCP_SKB_CB(skb)->seq; - if (!mptcp_finish_join(sk)) goto do_reset; + subflow->mp_join = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); - } else { -do_reset: - tcp_send_active_reset(sk, GFP_ATOMIC); - tcp_done(sk); + } else if (mptcp_check_fallback(sk)) { +fallback: + mptcp_rcv_space_init(mptcp_sk(parent), sk); } + return; + +do_reset: + tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_done(sk); } -static struct request_sock_ops subflow_request_sock_ops; +struct request_sock_ops mptcp_subflow_request_sock_ops; +EXPORT_SYMBOL_GPL(mptcp_subflow_request_sock_ops); static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops; static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) @@ -312,7 +360,7 @@ static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) goto drop; - return tcp_conn_request(&subflow_request_sock_ops, + return tcp_conn_request(&mptcp_subflow_request_sock_ops, &subflow_request_sock_ipv4_ops, sk, skb); drop: @@ -337,7 +385,7 @@ static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (!ipv6_unicast_destination(skb)) goto drop; - return tcp_conn_request(&subflow_request_sock_ops, + return tcp_conn_request(&mptcp_subflow_request_sock_ops, &subflow_request_sock_ipv6_ops, sk, skb); drop: @@ -386,7 +434,7 @@ 
static void mptcp_sock_destruct(struct sock *sk) sock_orphan(sk); } - mptcp_token_destroy(mptcp_sk(sk)->token); + mptcp_token_destroy(mptcp_sk(sk)); inet_sock_destruct(sk); } @@ -421,6 +469,17 @@ static void subflow_drop_ctx(struct sock *ssk) kfree_rcu(ctx, rcu); } +void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, + struct mptcp_options_received *mp_opt) +{ + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + + subflow->remote_key = mp_opt->sndr_key; + subflow->fully_established = 1; + subflow->can_ack = 1; + WRITE_ONCE(msk->fully_established, true); +} + static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, @@ -444,7 +503,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, /* hopefully temporary handling for MP_JOIN+syncookie */ subflow_req = mptcp_subflow_rsk(req); - fallback_is_fatal = subflow_req->mp_join; + fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join; fallback = !tcp_rsk(req)->is_mptcp; if (fallback) goto create_child; @@ -472,6 +531,7 @@ create_msk: } else if (subflow_req->mp_join) { mptcp_get_options(skb, &mp_opt); if (!mp_opt.mp_join || + !mptcp_can_accept_new_subflow(subflow_req->msk) || !subflow_hmac_valid(req, &mp_opt)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); fallback = true; @@ -500,20 +560,25 @@ create_child: } if (ctx->mp_capable) { + /* this can't race with mptcp_close(), as the msk is + * not yet exposed to user-space + */ + inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED); + /* new mpc subflow takes ownership of the newly * created mptcp socket */ new_msk->sk_destruct = mptcp_sock_destruct; mptcp_pm_new_connection(mptcp_sk(new_msk), 1); + mptcp_token_accept(subflow_req, mptcp_sk(new_msk)); ctx->conn = new_msk; new_msk = NULL; /* with OoO packets we can reach here without ingress * mpc option */ - ctx->remote_key = mp_opt.sndr_key; - ctx->fully_established = mp_opt.mp_capable; - ctx->can_ack = mp_opt.mp_capable; + if (mp_opt.mp_capable) + mptcp_subflow_fully_established(ctx, &mp_opt); } else if (ctx->mp_join) { struct mptcp_sock *owner; @@ -548,9 +613,9 @@ out: dispose_child: subflow_drop_ctx(child); tcp_rsk(req)->drop_req = true; - tcp_send_active_reset(child, GFP_ATOMIC); inet_csk_prepare_for_destroy_sock(child); tcp_done(child); + req->rsk_ops->send_reset(sk, skb); /* The last child reference will be released by the caller */ return child; @@ -562,7 +627,8 @@ enum mapping_status { MAPPING_OK, MAPPING_INVALID, MAPPING_EMPTY, - MAPPING_DATA_FIN + MAPPING_DATA_FIN, + MAPPING_DUMMY }; static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq) @@ -614,7 +680,8 @@ static bool validate_mapping(struct sock *ssk, struct sk_buff *skb) return true; } -static enum mapping_status get_mapping_status(struct sock *ssk) +static enum mapping_status get_mapping_status(struct sock *ssk, + struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_ext *mpext; @@ -626,6 +693,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk) if (!skb) return MAPPING_EMPTY; + if (mptcp_check_fallback(ssk)) + return MAPPING_DUMMY; + mpext = mptcp_get_ext(skb); if (!mpext || !mpext->use_map) { if (!subflow->map_valid && !skb->len) { @@ -661,7 +731,8 @@ static enum mapping_status get_mapping_status(struct sock *ssk) if (mpext->data_fin == 1) { if (data_len == 1) { - pr_debug("DATA_FIN with no payload"); + mptcp_update_rcv_data_fin(msk, mpext->data_seq); + pr_debug("DATA_FIN with no payload seq=%llu",
mpext->data_seq); if (subflow->map_valid) { /* A DATA_FIN might arrive in a DSS * option before the previous mapping @@ -673,6 +744,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk) } else { return MAPPING_DATA_FIN; } + } else { + mptcp_update_rcv_data_fin(msk, mpext->data_seq + data_len); + pr_debug("DATA_FIN with mapping seq=%llu", mpext->data_seq + data_len); } /* Adjust for DATA_FIN using 1 byte of sequence space */ @@ -761,12 +835,22 @@ static bool subflow_check_data_avail(struct sock *ssk) u64 ack_seq; u64 old_ack; - status = get_mapping_status(ssk); + status = get_mapping_status(ssk, msk); pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status); if (status == MAPPING_INVALID) { ssk->sk_err = EBADMSG; goto fatal; } + if (status == MAPPING_DUMMY) { + __mptcp_do_fallback(msk); + skb = skb_peek(&ssk->sk_receive_queue); + subflow->map_valid = 1; + subflow->map_seq = READ_ONCE(msk->ack_seq); + subflow->map_data_len = skb->len; + subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - + subflow->ssn_offset; + return true; + } if (status != MAPPING_OK) return false; @@ -889,15 +973,20 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space) static void subflow_data_ready(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + u16 state = 1 << inet_sk_state_load(sk); struct sock *parent = subflow->conn; + struct mptcp_sock *msk; - if (!subflow->mp_capable && !subflow->mp_join) { - subflow->tcp_data_ready(sk); - + msk = mptcp_sk(parent); + if (state & TCPF_LISTEN) { + set_bit(MPTCP_DATA_READY, &msk->flags); parent->sk_data_ready(parent); return; } + WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable && + !subflow->mp_join && !(state & TCPF_CLOSE)); + if (mptcp_subflow_data_available(sk)) mptcp_data_ready(parent, sk); } @@ -974,19 +1063,34 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex, struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_subflow_context *subflow; struct sockaddr_storage addr; + int local_id = loc->id; struct socket *sf; + struct sock *ssk; u32 remote_token; int addrlen; int err; - if (sk->sk_state != TCP_ESTABLISHED) + if (!mptcp_is_fully_established(sk)) return -ENOTCONN; err = mptcp_subflow_create_socket(sk, &sf); if (err) return err; - subflow = mptcp_subflow_ctx(sf->sk); + ssk = sf->sk; + subflow = mptcp_subflow_ctx(ssk); + do { + get_random_bytes(&subflow->local_nonce, sizeof(u32)); + } while (!subflow->local_nonce); + + if (!local_id) { + err = mptcp_pm_get_local_id(msk, (struct sock_common *)ssk); + if (err < 0) + goto failed; + + local_id = err; + } + subflow->remote_key = msk->remote_key; subflow->local_key = msk->local_key; subflow->token = msk->token; @@ -997,15 +1101,16 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex, if (loc->family == AF_INET6) addrlen = sizeof(struct sockaddr_in6); #endif - sf->sk->sk_bound_dev_if = ifindex; + ssk->sk_bound_dev_if = ifindex; err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen); if (err) goto failed; mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL); - pr_debug("msk=%p remote_token=%u", msk, remote_token); + pr_debug("msk=%p remote_token=%u local_id=%d", msk, remote_token, + local_id); subflow->remote_token = remote_token; - subflow->local_id = loc->id; + subflow->local_id = local_id; subflow->request_join = 1; subflow->request_bkup = 1; mptcp_info2sockaddr(remote, &addr); @@ -1032,6 +1137,12 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) struct socket *sf; int err; + /* un-accepted 
server sockets can reach here - on bad configuration + * bail early to avoid greater trouble later + */ + if (unlikely(!sk->sk_socket)) + return -EINVAL; + err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP, &sf); if (err) @@ -1118,14 +1229,26 @@ static void subflow_state_change(struct sock *sk) __subflow_state_change(sk); + if (subflow_simultaneous_connect(sk)) { + mptcp_do_fallback(sk); + mptcp_rcv_space_init(mptcp_sk(parent), sk); + pr_fallback(mptcp_sk(parent)); + subflow->conn_finished = 1; + if (inet_sk_state_load(parent) == TCP_SYN_SENT) { + inet_sk_state_store(parent, TCP_ESTABLISHED); + parent->sk_state_change(parent); + } + } + /* as recvmsg() does not acquire the subflow socket for ssk selection * a fin packet carrying a DSS can be unnoticed if we don't trigger * the data available machinery here. */ - if (subflow->mp_capable && mptcp_subflow_data_available(sk)) + if (mptcp_subflow_data_available(sk)) mptcp_data_ready(parent, sk); - if (!(parent->sk_shutdown & RCV_SHUTDOWN) && + if (__mptcp_check_fallback(mptcp_sk(parent)) && + !(parent->sk_shutdown & RCV_SHUTDOWN) && !subflow->rx_eof && subflow_is_done(sk)) { subflow->rx_eof = 1; mptcp_subflow_eof(parent); @@ -1255,10 +1378,10 @@ static int subflow_ops_init(struct request_sock_ops *subflow_ops) return 0; } -void mptcp_subflow_init(void) +void __init mptcp_subflow_init(void) { - subflow_request_sock_ops = tcp_request_sock_ops; - if (subflow_ops_init(&subflow_request_sock_ops) != 0) + mptcp_subflow_request_sock_ops = tcp_request_sock_ops; + if (subflow_ops_init(&mptcp_subflow_request_sock_ops) != 0) panic("MPTCP: failed to init subflow request sock ops\n"); subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; @@ -1268,7 +1391,6 @@ void mptcp_subflow_init(void) subflow_specific.conn_request = subflow_v4_conn_request; subflow_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_specific.sk_rx_dst_set = subflow_finish_connect; - subflow_specific.rebuild_header = subflow_rebuild_header; #if IS_ENABLED(CONFIG_MPTCP_IPV6) subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; @@ -1278,7 +1400,6 @@ void mptcp_subflow_init(void) subflow_v6_specific.conn_request = subflow_v6_conn_request; subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect; - subflow_v6_specific.rebuild_header = subflow_rebuild_header; subflow_v6m_specific = subflow_v6_specific; subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit; diff --git a/net/mptcp/syncookies.c b/net/mptcp/syncookies.c new file mode 100644 index 000000000000..abe0fd099746 --- /dev/null +++ b/net/mptcp/syncookies.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/skbuff.h> + +#include "protocol.h" + +/* Syncookies do not work for JOIN requests. + * + * Unlike MP_CAPABLE, where the ACK cookie contains the needed MPTCP + * options to reconstruct the initial syn state, MP_JOIN does not contain + * the token to obtain the mptcp socket nor the server-generated nonce + * that was used in the cookie SYN/ACK response. + * + * Keep a small best effort state table to store the syn/synack data, + * indexed by skb hash. + * + * An MP_JOIN SYN packet handled by syn cookies is only stored if the 32bit + * token matches a known mptcp connection that can still accept more subflows. + * + * There is no timeout handling -- state is only re-constructed + * when the TCP ACK passes the cookie validation check.
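+ *
+ * Slot selection below reduces to the following (a sketch of
+ * mptcp_join_entry_hash(), defined later in this file):
+ *
+ *	i = (skb_get_hash(skb) ^ net_hash_mix(net)) % COOKIE_JOIN_SLOTS;
+ *
+ * Two SYNs hashing to the same slot simply overwrite each other's
+ * entry, which is what makes the table best effort.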
+ */ + +struct join_entry { + u32 token; + u32 remote_nonce; + u32 local_nonce; + u8 join_id; + u8 local_id; + u8 backup; + u8 valid; +}; + +#define COOKIE_JOIN_SLOTS 1024 + +static struct join_entry join_entries[COOKIE_JOIN_SLOTS] __cacheline_aligned_in_smp; +static spinlock_t join_entry_locks[COOKIE_JOIN_SLOTS] __cacheline_aligned_in_smp; + +static u32 mptcp_join_entry_hash(struct sk_buff *skb, struct net *net) +{ + u32 i = skb_get_hash(skb) ^ net_hash_mix(net); + + return i % ARRAY_SIZE(join_entries); +} + +static void mptcp_join_store_state(struct join_entry *entry, + const struct mptcp_subflow_request_sock *subflow_req) +{ + entry->token = subflow_req->token; + entry->remote_nonce = subflow_req->remote_nonce; + entry->local_nonce = subflow_req->local_nonce; + entry->backup = subflow_req->backup; + entry->join_id = subflow_req->remote_id; + entry->local_id = subflow_req->local_id; + entry->valid = 1; +} + +void subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req, + struct sk_buff *skb) +{ + struct net *net = read_pnet(&subflow_req->sk.req.ireq_net); + u32 i = mptcp_join_entry_hash(skb, net); + + /* No use in waiting if other cpu is already using this slot -- + * would overwrite the data that got stored. + */ + spin_lock_bh(&join_entry_locks[i]); + mptcp_join_store_state(&join_entries[i], subflow_req); + spin_unlock_bh(&join_entry_locks[i]); +} + +/* Called for a cookie-ack with MP_JOIN option present. + * Look up the saved state based on skb hash & check token matches msk + * in same netns. + * + * Caller will check msk can still accept another subflow. The hmac + * present in the cookie ACK mptcp option space will be checked later. + */ +bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req, + struct sk_buff *skb) +{ + struct net *net = read_pnet(&subflow_req->sk.req.ireq_net); + u32 i = mptcp_join_entry_hash(skb, net); + struct mptcp_sock *msk; + struct join_entry *e; + + e = &join_entries[i]; + + spin_lock_bh(&join_entry_locks[i]); + + if (e->valid == 0) { + spin_unlock_bh(&join_entry_locks[i]); + return false; + } + + e->valid = 0; + + msk = mptcp_token_get_sock(e->token); + if (!msk) { + spin_unlock_bh(&join_entry_locks[i]); + return false; + } + + /* If this fails, the token got re-used in the mean time by another + * mptcp socket in a different netns, i.e. entry is outdated. 
+ */ + if (!net_eq(sock_net((struct sock *)msk), net)) + goto err_put; + + subflow_req->remote_nonce = e->remote_nonce; + subflow_req->local_nonce = e->local_nonce; + subflow_req->backup = e->backup; + subflow_req->remote_id = e->join_id; + subflow_req->token = e->token; + subflow_req->msk = msk; + spin_unlock_bh(&join_entry_locks[i]); + return true; + +err_put: + spin_unlock_bh(&join_entry_locks[i]); + sock_put((struct sock *)msk); + return false; +} + +void __init mptcp_join_cookie_init(void) +{ + int i; + + for (i = 0; i < COOKIE_JOIN_SLOTS; i++) + spin_lock_init(&join_entry_locks[i]); +} diff --git a/net/mptcp/token.c b/net/mptcp/token.c index 33352dd99d4d..8b47c4bb1c6b 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -24,7 +24,7 @@ #include <linux/kernel.h> #include <linux/module.h> -#include <linux/radix-tree.h> +#include <linux/memblock.h> #include <linux/ip.h> #include <linux/tcp.h> #include <net/sock.h> @@ -33,10 +33,67 @@ #include <net/mptcp.h> #include "protocol.h" -static RADIX_TREE(token_tree, GFP_ATOMIC); -static RADIX_TREE(token_req_tree, GFP_ATOMIC); -static DEFINE_SPINLOCK(token_tree_lock); -static int token_used __read_mostly; +#define TOKEN_MAX_RETRIES 4 +#define TOKEN_MAX_CHAIN_LEN 4 + +struct token_bucket { + spinlock_t lock; + int chain_len; + struct hlist_nulls_head req_chain; + struct hlist_nulls_head msk_chain; +}; + +static struct token_bucket *token_hash __read_mostly; +static unsigned int token_mask __read_mostly; + +static struct token_bucket *token_bucket(u32 token) +{ + return &token_hash[token & token_mask]; +} + +/* called with bucket lock held */ +static struct mptcp_subflow_request_sock * +__token_lookup_req(struct token_bucket *t, u32 token) +{ + struct mptcp_subflow_request_sock *req; + struct hlist_nulls_node *pos; + + hlist_nulls_for_each_entry_rcu(req, pos, &t->req_chain, token_node) + if (req->token == token) + return req; + return NULL; +} + +/* called with bucket lock held */ +static struct mptcp_sock * +__token_lookup_msk(struct token_bucket *t, u32 token) +{ + struct hlist_nulls_node *pos; + struct sock *sk; + + sk_nulls_for_each_rcu(sk, pos, &t->msk_chain) + if (mptcp_sk(sk)->token == token) + return mptcp_sk(sk); + return NULL; +} + +static bool __token_bucket_busy(struct token_bucket *t, u32 token) +{ + return !token || t->chain_len >= TOKEN_MAX_CHAIN_LEN || + __token_lookup_req(t, token) || __token_lookup_msk(t, token); +} + +static void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) +{ + /* we might consider a faster version that computes the key as a + * hash of some information available in the MPTCP socket. Use + * random data at the moment, as it's probably the safest option + * in case multiple sockets are opened in different namespaces at + * the same time. 
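+ *
+ * A colliding token is handled by the caller: mptcp_token_new_connect()
+ * below retries with a fresh key, up to TOKEN_MAX_RETRIES times, when
+ * the target bucket is already busy.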
+ */ + get_random_bytes(key, sizeof(u64)); + mptcp_crypto_key_sha(*key, token, idsn); +} /** * mptcp_token_new_request - create new key/idsn/token for subflow_request @@ -52,30 +109,28 @@ static int token_used __read_mostly; int mptcp_token_new_request(struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); - int err; - - while (1) { - u32 token; - - mptcp_crypto_key_gen_sha(&subflow_req->local_key, - &subflow_req->token, - &subflow_req->idsn); - pr_debug("req=%p local_key=%llu, token=%u, idsn=%llu\n", - req, subflow_req->local_key, subflow_req->token, - subflow_req->idsn); - - token = subflow_req->token; - spin_lock_bh(&token_tree_lock); - if (!radix_tree_lookup(&token_req_tree, token) && - !radix_tree_lookup(&token_tree, token)) - break; - spin_unlock_bh(&token_tree_lock); + struct token_bucket *bucket; + u32 token; + + mptcp_crypto_key_sha(subflow_req->local_key, + &subflow_req->token, + &subflow_req->idsn); + pr_debug("req=%p local_key=%llu, token=%u, idsn=%llu\n", + req, subflow_req->local_key, subflow_req->token, + subflow_req->idsn); + + token = subflow_req->token; + bucket = token_bucket(token); + spin_lock_bh(&bucket->lock); + if (__token_bucket_busy(bucket, token)) { + spin_unlock_bh(&bucket->lock); + return -EBUSY; } - err = radix_tree_insert(&token_req_tree, - subflow_req->token, &token_used); - spin_unlock_bh(&token_tree_lock); - return err; + hlist_nulls_add_head_rcu(&subflow_req->token_node, &bucket->req_chain); + bucket->chain_len++; + spin_unlock_bh(&bucket->lock); + return 0; } /** @@ -97,48 +152,82 @@ int mptcp_token_new_request(struct request_sock *req) int mptcp_token_new_connect(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - struct sock *mptcp_sock = subflow->conn; - int err; - - while (1) { - u32 token; + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + int retries = TOKEN_MAX_RETRIES; + struct token_bucket *bucket; - mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token, - &subflow->idsn); + pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n", + sk, subflow->local_key, subflow->token, subflow->idsn); - pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n", - sk, subflow->local_key, subflow->token, subflow->idsn); +again: + mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token, + &subflow->idsn); - token = subflow->token; - spin_lock_bh(&token_tree_lock); - if (!radix_tree_lookup(&token_req_tree, token) && - !radix_tree_lookup(&token_tree, token)) - break; - spin_unlock_bh(&token_tree_lock); + bucket = token_bucket(subflow->token); + spin_lock_bh(&bucket->lock); + if (__token_bucket_busy(bucket, subflow->token)) { + spin_unlock_bh(&bucket->lock); + if (!--retries) + return -EBUSY; + goto again; } - err = radix_tree_insert(&token_tree, subflow->token, mptcp_sock); - spin_unlock_bh(&token_tree_lock); - return err; + WRITE_ONCE(msk->token, subflow->token); + __sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain); + bucket->chain_len++; + spin_unlock_bh(&bucket->lock); + return 0; } /** - * mptcp_token_new_accept - insert token for later processing - * @token: the token to insert to the tree - * @conn: the just cloned socket linked to the new connection + * mptcp_token_accept - replace a req sk with full sock in token hash + * @req: the request socket to be removed + * @msk: the just cloned socket linked to the new connection * * Called when a SYN packet creates a new logical connection, i.e. * is not a join request. 
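+ *
+ * The expected calling pattern, also exercised by the mptcp-token KUnit
+ * suite added later in this series, is (sketch):
+ *
+ *	msk->token = req->token;
+ *	mptcp_token_accept(req, msk);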
*/ -int mptcp_token_new_accept(u32 token, struct sock *conn) +void mptcp_token_accept(struct mptcp_subflow_request_sock *req, + struct mptcp_sock *msk) { - int err; + struct mptcp_subflow_request_sock *pos; + struct token_bucket *bucket; - spin_lock_bh(&token_tree_lock); - err = radix_tree_insert(&token_tree, token, conn); - spin_unlock_bh(&token_tree_lock); + bucket = token_bucket(req->token); + spin_lock_bh(&bucket->lock); - return err; + /* pedantic lookup check for the moved token */ + pos = __token_lookup_req(bucket, req->token); + if (!WARN_ON_ONCE(pos != req)) + hlist_nulls_del_init_rcu(&req->token_node); + __sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain); + spin_unlock_bh(&bucket->lock); +} + +bool mptcp_token_exists(u32 token) +{ + struct hlist_nulls_node *pos; + struct token_bucket *bucket; + struct mptcp_sock *msk; + struct sock *sk; + + rcu_read_lock(); + bucket = token_bucket(token); + +again: + sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) { + msk = mptcp_sk(sk); + if (READ_ONCE(msk->token) == token) + goto found; + } + if (get_nulls_value(pos) != (token & token_mask)) + goto again; + + rcu_read_unlock(); + return false; +found: + rcu_read_unlock(); + return true; } /** @@ -152,45 +241,171 @@ int mptcp_token_new_accept(u32 token, struct sock *conn) */ struct mptcp_sock *mptcp_token_get_sock(u32 token) { - struct sock *conn; - - spin_lock_bh(&token_tree_lock); - conn = radix_tree_lookup(&token_tree, token); - if (conn) { - /* token still reserved? */ - if (conn == (struct sock *)&token_used) - conn = NULL; - else - sock_hold(conn); + struct hlist_nulls_node *pos; + struct token_bucket *bucket; + struct mptcp_sock *msk; + struct sock *sk; + + rcu_read_lock(); + bucket = token_bucket(token); + +again: + sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) { + msk = mptcp_sk(sk); + if (READ_ONCE(msk->token) != token) + continue; + if (!refcount_inc_not_zero(&sk->sk_refcnt)) + goto not_found; + if (READ_ONCE(msk->token) != token) { + sock_put(sk); + goto again; + } + goto found; + } + if (get_nulls_value(pos) != (token & token_mask)) + goto again; + +not_found: + msk = NULL; + +found: + rcu_read_unlock(); + return msk; +} +EXPORT_SYMBOL_GPL(mptcp_token_get_sock); + +/** + * mptcp_token_iter_next - iterate over the token container from given pos + * @net: namespace to be iterated + * @s_slot: start slot number + * @s_num: start number inside the given slot + * + * This function returns the first mptcp connection structure found inside the + * token container starting from the specified position, or NULL. + * + * On successful iteration, the iterator is moved to the next position and the + * caller acquires a reference to the returned socket.
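+ *
+ * A minimal caller loop then looks like this (illustrative sketch):
+ *
+ *	long s_slot = 0, s_num = 0;
+ *	struct mptcp_sock *msk;
+ *
+ *	while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num))) {
+ *		... use msk under the acquired reference ...
+ *		sock_put((struct sock *)msk);
+ *	}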
+ */ +struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot, + long *s_num) +{ + struct mptcp_sock *ret = NULL; + struct hlist_nulls_node *pos; + int slot, num; + + for (slot = *s_slot; slot <= token_mask; *s_num = 0, slot++) { + struct token_bucket *bucket = &token_hash[slot]; + struct sock *sk; + + num = 0; + + if (hlist_nulls_empty(&bucket->msk_chain)) + continue; + + rcu_read_lock(); + sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) { + ++num; + if (!net_eq(sock_net(sk), net)) + continue; + + if (num <= *s_num) + continue; + + if (!refcount_inc_not_zero(&sk->sk_refcnt)) + continue; + + if (!net_eq(sock_net(sk), net)) { + sock_put(sk); + continue; + } + + ret = mptcp_sk(sk); + rcu_read_unlock(); + goto out; + } + rcu_read_unlock(); } - spin_unlock_bh(&token_tree_lock); - return mptcp_sk(conn); +out: + *s_slot = slot; + *s_num = num; + return ret; } +EXPORT_SYMBOL_GPL(mptcp_token_iter_next); /** * mptcp_token_destroy_request - remove mptcp connection/token - * @token: token of mptcp connection to remove + * @req: mptcp request socket dropping the token * - * Remove not-yet-fully-established incoming connection identified - * by @token. + * Remove the token associated to @req. */ -void mptcp_token_destroy_request(u32 token) +void mptcp_token_destroy_request(struct request_sock *req) { - spin_lock_bh(&token_tree_lock); - radix_tree_delete(&token_req_tree, token); - spin_unlock_bh(&token_tree_lock); + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + struct mptcp_subflow_request_sock *pos; + struct token_bucket *bucket; + + if (hlist_nulls_unhashed(&subflow_req->token_node)) + return; + + bucket = token_bucket(subflow_req->token); + spin_lock_bh(&bucket->lock); + pos = __token_lookup_req(bucket, subflow_req->token); + if (!WARN_ON_ONCE(pos != subflow_req)) { + hlist_nulls_del_init_rcu(&pos->token_node); + bucket->chain_len--; + } + spin_unlock_bh(&bucket->lock); } /** * mptcp_token_destroy - remove mptcp connection/token - * @token: token of mptcp connection to remove + * @msk: mptcp connection dropping the token * - * Remove the connection identified by @token. 
+ * Remove the token associated to @msk */ -void mptcp_token_destroy(u32 token) +void mptcp_token_destroy(struct mptcp_sock *msk) { - spin_lock_bh(&token_tree_lock); - radix_tree_delete(&token_tree, token); - spin_unlock_bh(&token_tree_lock); + struct token_bucket *bucket; + struct mptcp_sock *pos; + + if (sk_unhashed((struct sock *)msk)) + return; + + bucket = token_bucket(msk->token); + spin_lock_bh(&bucket->lock); + pos = __token_lookup_msk(bucket, msk->token); + if (!WARN_ON_ONCE(pos != msk)) { + __sk_nulls_del_node_init_rcu((struct sock *)pos); + bucket->chain_len--; + } + spin_unlock_bh(&bucket->lock); } + +void __init mptcp_token_init(void) +{ + int i; + + token_hash = alloc_large_system_hash("MPTCP token", + sizeof(struct token_bucket), + 0, + 20,/* one slot per 1MB of memory */ + HASH_ZERO, + NULL, + &token_mask, + 0, + 64 * 1024); + for (i = 0; i < token_mask + 1; ++i) { + INIT_HLIST_NULLS_HEAD(&token_hash[i].req_chain, i); + INIT_HLIST_NULLS_HEAD(&token_hash[i].msk_chain, i); + spin_lock_init(&token_hash[i].lock); + } +} + +#if IS_MODULE(CONFIG_MPTCP_KUNIT_TESTS) +EXPORT_SYMBOL_GPL(mptcp_token_new_request); +EXPORT_SYMBOL_GPL(mptcp_token_new_connect); +EXPORT_SYMBOL_GPL(mptcp_token_accept); +EXPORT_SYMBOL_GPL(mptcp_token_destroy_request); +EXPORT_SYMBOL_GPL(mptcp_token_destroy); +#endif diff --git a/net/mptcp/token_test.c b/net/mptcp/token_test.c new file mode 100644 index 000000000000..e1bd6f0a0676 --- /dev/null +++ b/net/mptcp/token_test.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <kunit/test.h> + +#include "protocol.h" + +static struct mptcp_subflow_request_sock *build_req_sock(struct kunit *test) +{ + struct mptcp_subflow_request_sock *req; + + req = kunit_kzalloc(test, sizeof(struct mptcp_subflow_request_sock), + GFP_USER); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, req); + mptcp_token_init_request((struct request_sock *)req); + return req; +} + +static void mptcp_token_test_req_basic(struct kunit *test) +{ + struct mptcp_subflow_request_sock *req = build_req_sock(test); + struct mptcp_sock *null_msk = NULL; + + KUNIT_ASSERT_EQ(test, 0, + mptcp_token_new_request((struct request_sock *)req)); + KUNIT_EXPECT_NE(test, 0, (int)req->token); + KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(req->token)); + + /* cleanup */ + mptcp_token_destroy_request((struct request_sock *)req); +} + +static struct inet_connection_sock *build_icsk(struct kunit *test) +{ + struct inet_connection_sock *icsk; + + icsk = kunit_kzalloc(test, sizeof(struct inet_connection_sock), + GFP_USER); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, icsk); + return icsk; +} + +static struct mptcp_subflow_context *build_ctx(struct kunit *test) +{ + struct mptcp_subflow_context *ctx; + + ctx = kunit_kzalloc(test, sizeof(struct mptcp_subflow_context), + GFP_USER); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, ctx); + return ctx; +} + +static struct mptcp_sock *build_msk(struct kunit *test) +{ + struct mptcp_sock *msk; + + msk = kunit_kzalloc(test, sizeof(struct mptcp_sock), GFP_USER); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, msk); + refcount_set(&((struct sock *)msk)->sk_refcnt, 1); + return msk; +} + +static void mptcp_token_test_msk_basic(struct kunit *test) +{ + struct inet_connection_sock *icsk = build_icsk(test); + struct mptcp_subflow_context *ctx = build_ctx(test); + struct mptcp_sock *msk = build_msk(test); + struct mptcp_sock *null_msk = NULL; + struct sock *sk; + + rcu_assign_pointer(icsk->icsk_ulp_data, ctx); + ctx->conn = (struct sock *)msk; + sk = (struct sock *)msk; + + KUNIT_ASSERT_EQ(test, 0, + 
mptcp_token_new_connect((struct sock *)icsk)); + KUNIT_EXPECT_NE(test, 0, (int)ctx->token); + KUNIT_EXPECT_EQ(test, ctx->token, msk->token); + KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(ctx->token)); + KUNIT_EXPECT_EQ(test, 2, (int)refcount_read(&sk->sk_refcnt)); + + mptcp_token_destroy(msk); + KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(ctx->token)); +} + +static void mptcp_token_test_accept(struct kunit *test) +{ + struct mptcp_subflow_request_sock *req = build_req_sock(test); + struct mptcp_sock *msk = build_msk(test); + + KUNIT_ASSERT_EQ(test, 0, + mptcp_token_new_request((struct request_sock *)req)); + msk->token = req->token; + mptcp_token_accept(req, msk); + KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(msk->token)); + + /* this is now a no-op */ + mptcp_token_destroy_request((struct request_sock *)req); + KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(msk->token)); + + /* cleanup */ + mptcp_token_destroy(msk); +} + +static void mptcp_token_test_destroyed(struct kunit *test) +{ + struct mptcp_subflow_request_sock *req = build_req_sock(test); + struct mptcp_sock *msk = build_msk(test); + struct mptcp_sock *null_msk = NULL; + struct sock *sk; + + sk = (struct sock *)msk; + + KUNIT_ASSERT_EQ(test, 0, + mptcp_token_new_request((struct request_sock *)req)); + msk->token = req->token; + mptcp_token_accept(req, msk); + + /* simulate race on removal */ + refcount_set(&sk->sk_refcnt, 0); + KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(msk->token)); + + /* cleanup */ + mptcp_token_destroy(msk); +} + +static struct kunit_case mptcp_token_test_cases[] = { + KUNIT_CASE(mptcp_token_test_req_basic), + KUNIT_CASE(mptcp_token_test_msk_basic), + KUNIT_CASE(mptcp_token_test_accept), + KUNIT_CASE(mptcp_token_test_destroyed), + {} +}; + +static struct kunit_suite mptcp_token_suite = { + .name = "mptcp-token", + .test_cases = mptcp_token_test_cases, +}; + +kunit_test_suite(mptcp_token_suite); + +MODULE_LICENSE("GPL"); diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index a94bb59793f0..5b1f4ec66dd9 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -471,7 +471,7 @@ static int ncsi_rsp_handler_sma(struct ncsi_request *nr) memcpy(&ncf->addrs[index], cmd->mac, ETH_ALEN); } else { clear_bit(cmd->index - 1, bitmap); - memset(&ncf->addrs[index], 0, ETH_ALEN); + eth_zero_addr(&ncf->addrs[index]); } spin_unlock_irqrestore(&nc->lock, flags); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 0ffe2b8723c4..25313c29d799 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -447,7 +447,7 @@ config NF_TABLES replace the existing {ip,ip6,arp,eb}_tables infrastructure. It provides a pseudo-state machine with an extensible instruction-set (also known as expressions) that the userspace 'nft' utility - (http://www.netfilter.org/projects/nftables) uses to build the + (https://www.netfilter.org/projects/nftables) uses to build the rule-set. It also comes with the generic set infrastructure that allows you to construct mappings between matchings and actions for performance lookups. 
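The mptcp-token KUnit suite added above in net/mptcp/token_test.c is built when CONFIG_MPTCP_KUNIT_TESTS is enabled (built-in or, per the IS_MODULE() exports in token.c, as a module). A plausible way to exercise it with the standard KUnit wrapper (the exact kunitconfig is not part of this series) is:

    # .kunitconfig (sketch)
    CONFIG_KUNIT=y
    CONFIG_NET=y
    CONFIG_INET=y
    CONFIG_MPTCP=y
    CONFIG_MPTCP_KUNIT_TESTS=y

    $ ./tools/testing/kunit/kunit.py run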
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 56621d6bfd29..920b7c4331f0 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1644,7 +1644,7 @@ dump_last: goto next_set; if (set->variant->uref) set->variant->uref(set, cb, true); - /* fall through */ + fallthrough; default: ret = set->variant->list(set, skb, cb); if (!cb->args[IPSET_CB_ARG0]) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 02f2f636798d..a90b8eac16ac 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -807,6 +807,31 @@ static void ip_vs_conn_rcu_free(struct rcu_head *head) kmem_cache_free(ip_vs_conn_cachep, cp); } +/* Try to delete connection while not holding reference */ +static void ip_vs_conn_del(struct ip_vs_conn *cp) +{ + if (del_timer(&cp->timer)) { + /* Drop cp->control chain too */ + if (cp->control) + cp->timeout = 0; + ip_vs_conn_expire(&cp->timer); + } +} + +/* Try to delete connection while holding reference */ +static void ip_vs_conn_del_put(struct ip_vs_conn *cp) +{ + if (del_timer(&cp->timer)) { + /* Drop cp->control chain too */ + if (cp->control) + cp->timeout = 0; + __ip_vs_conn_put(cp); + ip_vs_conn_expire(&cp->timer); + } else { + __ip_vs_conn_put(cp); + } +} + static void ip_vs_conn_expire(struct timer_list *t) { struct ip_vs_conn *cp = from_timer(cp, t, timer); @@ -827,14 +852,17 @@ static void ip_vs_conn_expire(struct timer_list *t) /* does anybody control me? */ if (ct) { + bool has_ref = !cp->timeout && __ip_vs_conn_get(ct); + ip_vs_control_del(cp); /* Drop CTL or non-assured TPL if not used anymore */ - if (!cp->timeout && !atomic_read(&ct->n_control) && + if (has_ref && !atomic_read(&ct->n_control) && (!(ct->flags & IP_VS_CONN_F_TEMPLATE) || !(ct->state & IP_VS_CTPL_S_ASSURED))) { IP_VS_DBG(4, "drop controlling connection\n"); - ct->timeout = 0; - ip_vs_conn_expire_now(ct); + ip_vs_conn_del_put(ct); + } else if (has_ref) { + __ip_vs_conn_put(ct); } } @@ -1317,8 +1345,7 @@ try_drop: drop: IP_VS_DBG(4, "drop connection\n"); - cp->timeout = 0; - ip_vs_conn_expire_now(cp); + ip_vs_conn_del(cp); } cond_resched_rcu(); } @@ -1341,19 +1368,15 @@ flush_again: hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { if (cp->ipvs != ipvs) continue; - /* As timers are expired in LIFO order, restart - * the timer of controlling connection first, so - * that it is expired after us. 
- */ + if (atomic_read(&cp->n_control)) + continue; cp_c = cp->control; - /* cp->control is valid only with reference to cp */ - if (cp_c && __ip_vs_conn_get(cp)) { + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_del(cp); + if (cp_c && !atomic_read(&cp_c->n_control)) { IP_VS_DBG(4, "del controlling connection\n"); - ip_vs_conn_expire_now(cp_c); - __ip_vs_conn_put(cp); + ip_vs_conn_del(cp_c); } - IP_VS_DBG(4, "del connection\n"); - ip_vs_conn_expire_now(cp); } cond_resched_rcu(); } @@ -1366,6 +1389,45 @@ flush_again: goto flush_again; } } + +#ifdef CONFIG_SYSCTL +void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) +{ + int idx; + struct ip_vs_conn *cp, *cp_c; + struct ip_vs_dest *dest; + + rcu_read_lock(); + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { + if (cp->ipvs != ipvs) + continue; + + dest = cp->dest; + if (!dest || (dest->flags & IP_VS_DEST_F_AVAILABLE)) + continue; + + if (atomic_read(&cp->n_control)) + continue; + + cp_c = cp->control; + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_del(cp); + if (cp_c && !atomic_read(&cp_c->n_control)) { + IP_VS_DBG(4, "del controlling connection\n"); + ip_vs_conn_del(cp_c); + } + } + cond_resched_rcu(); + + /* netns clean up started, abort delayed work */ + if (!ipvs->enable) + break; + } + rcu_read_unlock(); +} +#endif + /* * per netns init and exit */ diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index aa6a603a2425..e3668a6e54e4 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -694,16 +694,10 @@ static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) return ipvs->sysctl_nat_icmp_send; } -static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) -{ - return ipvs->sysctl_expire_nodest_conn; -} - #else static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; } static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return 0; } -static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; } #endif @@ -2066,14 +2060,14 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int conn_reuse_mode = sysctl_conn_reuse_mode(ipvs); if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) { - bool uses_ct = false, resched = false; + bool old_ct = false, resched = false; if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest && unlikely(!atomic_read(&cp->dest->weight))) { resched = true; - uses_ct = ip_vs_conn_uses_conntrack(cp, skb); + old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); } else if (is_new_conn_expected(cp, conn_reuse_mode)) { - uses_ct = ip_vs_conn_uses_conntrack(cp, skb); + old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); if (!atomic_read(&cp->n_control)) { resched = true; } else { @@ -2081,50 +2075,51 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int * that uses conntrack while it is still * referenced by controlled connection(s). 
*/ - resched = !uses_ct; + resched = !old_ct; } } if (resched) { + if (!old_ct) + cp->flags &= ~IP_VS_CONN_F_NFCT; if (!atomic_read(&cp->n_control)) ip_vs_conn_expire_now(cp); __ip_vs_conn_put(cp); - if (uses_ct) + if (old_ct) return NF_DROP; cp = NULL; } } - if (unlikely(!cp)) { - int v; - - if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph)) - return v; - } - - IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet"); - /* Check the server status */ - if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { + if (cp && cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { /* the destination server is not available */ + if (sysctl_expire_nodest_conn(ipvs)) { + bool old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); - __u32 flags = cp->flags; - - /* when timer already started, silently drop the packet.*/ - if (timer_pending(&cp->timer)) - __ip_vs_conn_put(cp); - else - ip_vs_conn_put(cp); + if (!old_ct) + cp->flags &= ~IP_VS_CONN_F_NFCT; - if (sysctl_expire_nodest_conn(ipvs) && - !(flags & IP_VS_CONN_F_ONE_PACKET)) { - /* try to expire the connection immediately */ ip_vs_conn_expire_now(cp); + __ip_vs_conn_put(cp); + if (old_ct) + return NF_DROP; + cp = NULL; + } else { + __ip_vs_conn_put(cp); + return NF_DROP; } + } - return NF_DROP; + if (unlikely(!cp)) { + int v; + + if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph)) + return v; } + IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet"); + ip_vs_in_stats(cp, skb); ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); if (cp->packet_xmit) @@ -2256,7 +2251,7 @@ ip_vs_forward_icmp_v6(void *priv, struct sk_buff *skb, #endif -static const struct nf_hook_ops ip_vs_ops[] = { +static const struct nf_hook_ops ip_vs_ops4[] = { /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_reply4, @@ -2302,7 +2297,10 @@ static const struct nf_hook_ops ip_vs_ops[] = { .hooknum = NF_INET_FORWARD, .priority = 100, }, +}; + #ifdef CONFIG_IP_VS_IPV6 +static const struct nf_hook_ops ip_vs_ops6[] = { /* After packet filtering, change source only for VS/NAT */ { .hook = ip_vs_reply6, @@ -2348,8 +2346,64 @@ static const struct nf_hook_ops ip_vs_ops[] = { .hooknum = NF_INET_FORWARD, .priority = 100, }, -#endif }; +#endif + +int ip_vs_register_hooks(struct netns_ipvs *ipvs, unsigned int af) +{ + const struct nf_hook_ops *ops; + unsigned int count; + unsigned int afmask; + int ret = 0; + + if (af == AF_INET6) { +#ifdef CONFIG_IP_VS_IPV6 + ops = ip_vs_ops6; + count = ARRAY_SIZE(ip_vs_ops6); + afmask = 2; +#else + return -EINVAL; +#endif + } else { + ops = ip_vs_ops4; + count = ARRAY_SIZE(ip_vs_ops4); + afmask = 1; + } + + if (!(ipvs->hooks_afmask & afmask)) { + ret = nf_register_net_hooks(ipvs->net, ops, count); + if (ret >= 0) + ipvs->hooks_afmask |= afmask; + } + return ret; +} + +void ip_vs_unregister_hooks(struct netns_ipvs *ipvs, unsigned int af) +{ + const struct nf_hook_ops *ops; + unsigned int count; + unsigned int afmask; + + if (af == AF_INET6) { +#ifdef CONFIG_IP_VS_IPV6 + ops = ip_vs_ops6; + count = ARRAY_SIZE(ip_vs_ops6); + afmask = 2; +#else + return; +#endif + } else { + ops = ip_vs_ops4; + count = ARRAY_SIZE(ip_vs_ops4); + afmask = 1; + } + + if (ipvs->hooks_afmask & afmask) { + nf_unregister_net_hooks(ipvs->net, ops, count); + ipvs->hooks_afmask &= ~afmask; + } +} + /* * Initialize IP Virtual Server netns mem. 
*/ @@ -2425,19 +2479,6 @@ static void __net_exit __ip_vs_cleanup_batch(struct list_head *net_list) } } -static int __net_init __ip_vs_dev_init(struct net *net) -{ - int ret; - - ret = nf_register_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); - if (ret < 0) - goto hook_fail; - return 0; - -hook_fail: - return ret; -} - static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list) { struct netns_ipvs *ipvs; @@ -2446,7 +2487,8 @@ static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list) EnterFunction(2); list_for_each_entry(net, net_list, exit_list) { ipvs = net_ipvs(net); - nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + ip_vs_unregister_hooks(ipvs, AF_INET); + ip_vs_unregister_hooks(ipvs, AF_INET6); ipvs->enable = 0; /* Disable packet reception */ smp_wmb(); ip_vs_sync_net_cleanup(ipvs); @@ -2462,7 +2504,6 @@ static struct pernet_operations ipvs_core_ops = { }; static struct pernet_operations ipvs_core_dev_ops = { - .init = __ip_vs_dev_init, .exit_batch = __ip_vs_dev_cleanup_batch, }; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 412656c34f20..678c5b14841c 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -210,6 +210,17 @@ static void update_defense_level(struct netns_ipvs *ipvs) local_bh_enable(); } +/* Handler for delayed work for expiring no + * destination connections + */ +static void expire_nodest_conn_handler(struct work_struct *work) +{ + struct netns_ipvs *ipvs; + + ipvs = container_of(work, struct netns_ipvs, + expire_nodest_conn_work.work); + ip_vs_expire_nodest_conn_flush(ipvs); +} /* * Timer for checking the defense @@ -224,7 +235,8 @@ static void defense_work_handler(struct work_struct *work) update_defense_level(ipvs); if (atomic_read(&ipvs->dropentry)) ip_vs_random_dropentry(ipvs); - schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD); + queue_delayed_work(system_long_wq, &ipvs->defense_work, + DEFENSE_TIMER_PERIOD); } #endif @@ -1163,6 +1175,12 @@ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest, list_add(&dest->t_list, &ipvs->dest_trash); dest->idle_start = 0; spin_unlock_bh(&ipvs->dest_trash_lock); + + /* Queue up delayed work to expire all no destination connections. + * No-op when CONFIG_SYSCTL is disabled. 
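+ *
+ * ip_vs_enqueue_expire_nodest_conns() is expected to be a thin wrapper
+ * along these lines (sketch; its definition is outside this file's
+ * hunks):
+ *
+ *	if (sysctl_expire_nodest_conn(ipvs))
+ *		queue_delayed_work(system_long_wq,
+ *				   &ipvs->expire_nodest_conn_work, 1);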
+ */ + if (!cleanup) + ip_vs_enqueue_expire_nodest_conns(ipvs); } @@ -1272,6 +1290,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_scheduler *sched = NULL; struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; + int ret_hooks = -1; /* increase the module use count */ if (!ip_vs_use_count_inc()) @@ -1313,6 +1332,14 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, } #endif + if ((u->af == AF_INET && !ipvs->num_services) || + (u->af == AF_INET6 && !ipvs->num_services6)) { + ret = ip_vs_register_hooks(ipvs, u->af); + if (ret < 0) + goto out_err; + ret_hooks = ret; + } + svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL); if (svc == NULL) { IP_VS_DBG(1, "%s(): no memory\n", __func__); @@ -1374,6 +1401,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) ipvs->num_services++; + else if (svc->af == AF_INET6) + ipvs->num_services6++; /* Hash the service into the service table */ ip_vs_svc_hash(svc); @@ -1385,6 +1414,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, out_err: + if (ret_hooks >= 0) + ip_vs_unregister_hooks(ipvs, u->af); if (svc != NULL) { ip_vs_unbind_scheduler(svc, sched); ip_vs_service_free(svc); @@ -1500,9 +1531,15 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) struct ip_vs_pe *old_pe; struct netns_ipvs *ipvs = svc->ipvs; - /* Count only IPv4 services for old get/setsockopt interface */ - if (svc->af == AF_INET) + if (svc->af == AF_INET) { ipvs->num_services--; + if (!ipvs->num_services) + ip_vs_unregister_hooks(ipvs, svc->af); + } else if (svc->af == AF_INET6) { + ipvs->num_services6--; + if (!ipvs->num_services6) + ip_vs_unregister_hooks(ipvs, svc->af); + } ip_vs_stop_estimator(svc->ipvs, &svc->stats); @@ -2414,7 +2451,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, } static int -do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len) { struct net *net = sock_net(sk); int ret; @@ -2438,7 +2475,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) return -EINVAL; } - if (copy_from_user(arg, user, len) != 0) + if (copy_from_sockptr(arg, ptr, len) != 0) return -EFAULT; /* Handle daemons since they have another lock */ @@ -4063,7 +4100,12 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) ipvs->sysctl_tbl = tbl; /* Schedule defense work */ INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); - schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD); + queue_delayed_work(system_long_wq, &ipvs->defense_work, + DEFENSE_TIMER_PERIOD); + + /* Init delayed work for expiring no dest conn */ + INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work, + expire_nodest_conn_handler); return 0; } @@ -4072,6 +4114,7 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { struct net *net = ipvs->net; + cancel_delayed_work_sync(&ipvs->expire_nodest_conn_work); cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f33d72c5b06e..e38b60fc183e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ 
-1006,7 +1006,7 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) * * @skb: skb that causes the clash * @h: tuplehash of the clashing entry already in table - * @hash_reply: hash slot for reply direction + * @reply_hash: hash slot for reply direction * * A conntrack entry can be inserted to the connection tracking table * if there is no existing entry with an identical tuple. @@ -1344,18 +1344,6 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct) return false; } -#define DAY (86400 * HZ) - -/* Set an arbitrary timeout large enough not to ever expire, this save - * us a check for the IPS_OFFLOAD_BIT from the packet path via - * nf_ct_is_expired(). - */ -static void nf_ct_offload_timeout(struct nf_conn *ct) -{ - if (nf_ct_expires(ct) < DAY / 2) - ct->timeout = nfct_time_stamp + DAY; -} - static void gc_worker(struct work_struct *work) { unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u); diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c index 573cb4481481..e697a824b001 100644 --- a/net/netfilter/nf_conntrack_h323_asn1.c +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -257,15 +257,15 @@ static unsigned int get_uint(struct bitstr *bs, int b) case 4: v |= *bs->cur++; v <<= 8; - /* fall through */ + fallthrough; case 3: v |= *bs->cur++; v <<= 8; - /* fall through */ + fallthrough; case 2: v |= *bs->cur++; v <<= 8; - /* fall through */ + fallthrough; case 1: v |= *bs->cur++; break; diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index a0560d175a7f..95f79980348c 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -610,7 +610,7 @@ void nf_ct_netns_put(struct net *net, uint8_t nfproto) switch (nfproto) { case NFPROTO_BRIDGE: nf_ct_netns_do_put(net, NFPROTO_BRIDGE); - /* fall through */ + fallthrough; case NFPROTO_INET: nf_ct_netns_do_put(net, NFPROTO_IPV4); nf_ct_netns_do_put(net, NFPROTO_IPV6); diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 1926fd56df56..6892e497781c 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -900,7 +900,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, return -NF_REPEAT; return NF_DROP; } - /* Fall through */ + fallthrough; case TCP_CONNTRACK_IGNORE: /* Ignored packets: * diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 6a26299cb064..a604f43e3e6b 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -60,7 +60,7 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, ntohs(tuple->src.u.tcp.port), ntohs(tuple->dst.u.tcp.port)); break; - case IPPROTO_UDPLITE: /* fallthrough */ + case IPPROTO_UDPLITE: case IPPROTO_UDP: seq_printf(s, "sport=%hu dport=%hu ", ntohs(tuple->src.u.udp.port), diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index b1eb5272b379..4f7a567c536e 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -243,6 +243,8 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) return err; } + nf_ct_offload_timeout(flow->ct); + if (nf_flowtable_hw_offload(flow_table)) { __set_bit(NF_FLOW_HW, &flow->flags); nf_flow_offload_add(flow_table, flow); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 5fff1e040168..2a6993fa40d7 100644 --- 
a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -964,7 +964,7 @@ static int nf_flow_table_indr_offload_cmd(struct flow_block_offload *bo, nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable, extack); - return flow_indr_dev_setup_offload(dev, TC_SETUP_FT, flowtable, bo, + return flow_indr_dev_setup_offload(dev, NULL, TC_SETUP_FT, flowtable, bo, nf_flow_table_indr_cleanup); } diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index bfc555fcbc72..ea923f8cf9c4 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -408,7 +408,7 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, static const unsigned int max_attempts = 128; switch (tuple->dst.protonum) { - case IPPROTO_ICMP: /* fallthrough */ + case IPPROTO_ICMP: case IPPROTO_ICMPV6: /* id is same for either direction... */ keyptr = &tuple->src.u.icmp.id; @@ -442,11 +442,11 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, } goto find_free_id; #endif - case IPPROTO_UDP: /* fallthrough */ - case IPPROTO_UDPLITE: /* fallthrough */ - case IPPROTO_TCP: /* fallthrough */ - case IPPROTO_SCTP: /* fallthrough */ - case IPPROTO_DCCP: /* fallthrough */ + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + case IPPROTO_TCP: + case IPPROTO_SCTP: + case IPPROTO_DCCP: if (maniptype == NF_NAT_MANIP_SRC) keyptr = &tuple->src.u.all; else diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c index 46cb3786e0ec..34afcd03b6f6 100644 --- a/net/netfilter/nf_sockopt.c +++ b/net/netfilter/nf_sockopt.c @@ -89,78 +89,32 @@ out: return ops; } -/* Call get/setsockopt() */ -static int nf_sockopt(struct sock *sk, u_int8_t pf, int val, - char __user *opt, int *len, int get) +int nf_setsockopt(struct sock *sk, u_int8_t pf, int val, sockptr_t opt, + unsigned int len) { struct nf_sockopt_ops *ops; int ret; - ops = nf_sockopt_find(sk, pf, val, get); + ops = nf_sockopt_find(sk, pf, val, 0); if (IS_ERR(ops)) return PTR_ERR(ops); - - if (get) - ret = ops->get(sk, val, opt, len); - else - ret = ops->set(sk, val, opt, *len); - + ret = ops->set(sk, val, opt, len); module_put(ops->owner); return ret; } - -int nf_setsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, - unsigned int len) -{ - return nf_sockopt(sk, pf, val, opt, &len, 0); -} EXPORT_SYMBOL(nf_setsockopt); int nf_getsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, int *len) { - return nf_sockopt(sk, pf, val, opt, len, 1); -} -EXPORT_SYMBOL(nf_getsockopt); - -#ifdef CONFIG_COMPAT -static int compat_nf_sockopt(struct sock *sk, u_int8_t pf, int val, - char __user *opt, int *len, int get) -{ struct nf_sockopt_ops *ops; int ret; - ops = nf_sockopt_find(sk, pf, val, get); + ops = nf_sockopt_find(sk, pf, val, 1); if (IS_ERR(ops)) return PTR_ERR(ops); - - if (get) { - if (ops->compat_get) - ret = ops->compat_get(sk, val, opt, len); - else - ret = ops->get(sk, val, opt, len); - } else { - if (ops->compat_set) - ret = ops->compat_set(sk, val, opt, *len); - else - ret = ops->set(sk, val, opt, *len); - } - + ret = ops->get(sk, val, opt, len); module_put(ops->owner); return ret; } - -int compat_nf_setsockopt(struct sock *sk, u_int8_t pf, - int val, char __user *opt, unsigned int len) -{ - return compat_nf_sockopt(sk, pf, val, opt, &len, 0); -} -EXPORT_SYMBOL(compat_nf_setsockopt); - -int compat_nf_getsockopt(struct sock *sk, u_int8_t pf, - int val, char __user *opt, int *len) -{ - return compat_nf_sockopt(sk, pf, val, opt, len, 1); -} 
-EXPORT_SYMBOL(compat_nf_getsockopt); -#endif +EXPORT_SYMBOL(nf_getsockopt); diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index ebcdc8e54476..9cca35d22927 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -704,8 +704,7 @@ ipv4_synproxy_hook(void *priv, struct sk_buff *skb, nf_ct_seqadj_init(ct, ctinfo, 0); synproxy->tsoff = 0; this_cpu_inc(snet->stats->conn_reopened); - - /* fall through */ + fallthrough; case TCP_CONNTRACK_SYN_SENT: if (!synproxy_parse_options(skb, thoff, th, &opts)) return NF_DROP; @@ -1128,8 +1127,7 @@ ipv6_synproxy_hook(void *priv, struct sk_buff *skb, nf_ct_seqadj_init(ct, ctinfo, 0); synproxy->tsoff = 0; this_cpu_inc(snet->stats->conn_reopened); - - /* fall through */ + fallthrough; case TCP_CONNTRACK_SYN_SENT: if (!synproxy_parse_options(skb, thoff, th, &opts)) return NF_DROP; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2b3862ea0505..d878e34e3354 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -269,9 +269,15 @@ static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) if (trans == NULL) return ERR_PTR(-ENOMEM); - if (msg_type == NFT_MSG_NEWCHAIN) + if (msg_type == NFT_MSG_NEWCHAIN) { nft_activate_next(ctx->net, ctx->chain); + if (ctx->nla[NFTA_CHAIN_ID]) { + nft_trans_chain_id(trans) = + ntohl(nla_get_be32(ctx->nla[NFTA_CHAIN_ID])); + } + } + list_add_tail(&trans->list, &ctx->net->nft.commit_list); return trans; } @@ -1049,6 +1055,9 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, chain)) continue; + if (nft_chain_is_bound(chain)) + continue; + ctx->chain = chain; err = nft_delrule_by_chain(ctx); @@ -1091,6 +1100,9 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, chain)) continue; + if (nft_chain_is_bound(chain)) + continue; + ctx->chain = chain; err = nft_delchain(ctx); @@ -1273,6 +1285,7 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { .len = NFT_MODULE_AUTOLOAD_LIMIT }, [NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED }, [NFTA_CHAIN_FLAGS] = { .type = NLA_U32 }, + [NFTA_CHAIN_ID] = { .type = NLA_U32 }, }; static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = { @@ -1405,13 +1418,12 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, lockdep_commit_lock_is_held(net)); if (nft_dump_stats(skb, stats)) goto nla_put_failure; - - if ((chain->flags & NFT_CHAIN_HW_OFFLOAD) && - nla_put_be32(skb, NFTA_CHAIN_FLAGS, - htonl(NFT_CHAIN_HW_OFFLOAD))) - goto nla_put_failure; } + if (chain->flags && + nla_put_be32(skb, NFTA_CHAIN_FLAGS, htonl(chain->flags))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use))) goto nla_put_failure; @@ -1625,7 +1637,7 @@ static void nf_tables_chain_free_chain_rules(struct nft_chain *chain) kvfree(chain->rules_next); } -static void nf_tables_chain_destroy(struct nft_ctx *ctx) +void nf_tables_chain_destroy(struct nft_ctx *ctx) { struct nft_chain *chain = ctx->chain; struct nft_hook *hook, *next; @@ -1907,7 +1919,7 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, nft_basechain_hook_init(&basechain->ops, family, hook, chain); } - chain->flags |= NFT_BASE_CHAIN | flags; + chain->flags |= NFT_CHAIN_BASE | flags; basechain->policy = NF_ACCEPT; if (chain->flags & NFT_CHAIN_HW_OFFLOAD && nft_chain_offload_priority(basechain) < 0) @@ -1918,6 +1930,22 @@ static int nft_basechain_init(struct 
nft_base_chain *basechain, u8 family, return 0; } +static int nft_chain_add(struct nft_table *table, struct nft_chain *chain) +{ + int err; + + err = rhltable_insert_key(&table->chains_ht, chain->name, + &chain->rhlhead, nft_chain_ht_params); + if (err) + return err; + + list_add_tail_rcu(&chain->list, &table->chains); + + return 0; +} + +static u64 chain_id; + static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, u8 policy, u32 flags) { @@ -1926,6 +1954,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, struct nft_base_chain *basechain; struct nft_stats __percpu *stats; struct net *net = ctx->net; + char name[NFT_NAME_MAXLEN]; struct nft_trans *trans; struct nft_chain *chain; struct nft_rule **rules; @@ -1937,6 +1966,9 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (nla[NFTA_CHAIN_HOOK]) { struct nft_chain_hook hook; + if (flags & NFT_CHAIN_BINDING) + return -EOPNOTSUPP; + err = nft_chain_parse_hook(net, nla, &hook, family, true); if (err < 0) return err; @@ -1966,16 +1998,33 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, return err; } } else { + if (flags & NFT_CHAIN_BASE) + return -EINVAL; + if (flags & NFT_CHAIN_HW_OFFLOAD) + return -EOPNOTSUPP; + chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (chain == NULL) return -ENOMEM; + + chain->flags = flags; } ctx->chain = chain; INIT_LIST_HEAD(&chain->rules); chain->handle = nf_tables_alloc_handle(table); chain->table = table; - chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL); + + if (nla[NFTA_CHAIN_NAME]) { + chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL); + } else { + if (!(flags & NFT_CHAIN_BINDING)) + return -EINVAL; + + snprintf(name, sizeof(name), "__chain%llu", ++chain_id); + chain->name = kstrdup(name, GFP_KERNEL); + } + if (!chain->name) { err = -ENOMEM; goto err1; @@ -1995,16 +2044,9 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (err < 0) goto err1; - err = rhltable_insert_key(&table->chains_ht, chain->name, - &chain->rhlhead, nft_chain_ht_params); - if (err) - goto err2; - trans = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN); if (IS_ERR(trans)) { err = PTR_ERR(trans); - rhltable_remove(&table->chains_ht, &chain->rhlhead, - nft_chain_ht_params); goto err2; } @@ -2012,8 +2054,13 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (nft_is_base_chain(chain)) nft_trans_chain_policy(trans) = policy; + err = nft_chain_add(table, chain); + if (err < 0) { + nft_trans_destroy(trans); + goto err2; + } + table->use++; - list_add_tail_rcu(&chain->list, &table->chains); return 0; err2: @@ -2061,7 +2108,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, if (nla[NFTA_CHAIN_HOOK]) { if (!nft_is_base_chain(chain)) - return -EBUSY; + return -EEXIST; err = nft_chain_parse_hook(ctx->net, nla, &hook, ctx->family, false); @@ -2071,21 +2118,21 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, basechain = nft_base_chain(chain); if (basechain->type != hook.type) { nft_chain_release_hook(&hook); - return -EBUSY; + return -EEXIST; } if (ctx->family == NFPROTO_NETDEV) { if (!nft_hook_list_equal(&basechain->hook_list, &hook.list)) { nft_chain_release_hook(&hook); - return -EBUSY; + return -EEXIST; } } else { ops = &basechain->ops; if (ops->hooknum != hook.num || ops->priority != hook.priority) { nft_chain_release_hook(&hook); - return -EBUSY; + return -EEXIST; } } nft_chain_release_hook(&hook); @@ -2157,6 +2204,22 
@@ err: return err; } +static struct nft_chain *nft_chain_lookup_byid(const struct net *net, + const struct nlattr *nla) +{ + u32 id = ntohl(nla_get_be32(nla)); + struct nft_trans *trans; + + list_for_each_entry(trans, &net->nft.commit_list, list) { + struct nft_chain *chain = trans->ctx.chain; + + if (trans->msg_type == NFT_MSG_NEWCHAIN && + id == nft_trans_chain_id(trans)) + return chain; + } + return ERR_PTR(-ENOENT); +} + static int nf_tables_newchain(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -2165,9 +2228,9 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; + struct nft_chain *chain = NULL; const struct nlattr *attr; struct nft_table *table; - struct nft_chain *chain; u8 policy = NF_ACCEPT; struct nft_ctx ctx; u64 handle = 0; @@ -2192,7 +2255,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, return PTR_ERR(chain); } attr = nla[NFTA_CHAIN_HANDLE]; - } else { + } else if (nla[NFTA_CHAIN_NAME]) { chain = nft_chain_lookup(net, table, attr, genmask); if (IS_ERR(chain)) { if (PTR_ERR(chain) != -ENOENT) { @@ -2201,6 +2264,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, } chain = NULL; } + } else if (!nla[NFTA_CHAIN_ID]) { + return -EINVAL; } if (nla[NFTA_CHAIN_POLICY]) { @@ -2231,6 +2296,9 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, else if (chain) flags = chain->flags; + if (flags & ~NFT_CHAIN_FLAGS) + return -EOPNOTSUPP; + nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); if (chain != NULL) { @@ -2241,7 +2309,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - flags |= chain->flags & NFT_BASE_CHAIN; + flags |= chain->flags & NFT_CHAIN_BASE; return nf_tables_updchain(&ctx, genmask, policy, flags); } @@ -2318,7 +2386,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, /** * nft_register_expr - register nf_tables expr type - * @ops: expr type + * @type: expr type * * Registers the expr type for use with nf_tables. Returns zero on * success or a negative errno code otherwise. @@ -2337,7 +2405,7 @@ EXPORT_SYMBOL_GPL(nft_register_expr); /** * nft_unregister_expr - unregister nf_tables expr type - * @ops: expr type + * @type: expr type * * Unregisters the expr type for use with nf_tables.
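+ *
+ * The two calls are normally paired from an expression module's
+ * init/exit paths (sketch, my_expr_type being a placeholder):
+ *
+ *	err = nft_register_expr(&my_expr_type);	(module init)
+ *	nft_unregister_expr(&my_expr_type);	(module exit)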
*/ @@ -2452,6 +2520,7 @@ nla_put_failure: struct nft_expr_info { const struct nft_expr_ops *ops; + const struct nlattr *attr; struct nlattr *tb[NFT_EXPR_MAXATTR + 1]; }; @@ -2499,7 +2568,9 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx, } else ops = type->ops; + info->attr = nla; info->ops = ops; + return 0; err1: @@ -2635,6 +2706,7 @@ static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { .len = NFT_USERDATA_MAXLEN }, [NFTA_RULE_ID] = { .type = NLA_U32 }, [NFTA_RULE_POSITION_ID] = { .type = NLA_U32 }, + [NFTA_RULE_CHAIN_ID] = { .type = NLA_U32 }, }; static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, @@ -2961,8 +3033,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, kfree(rule); } -static void nf_tables_rule_release(const struct nft_ctx *ctx, - struct nft_rule *rule) +void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule) { nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE); nf_tables_rule_destroy(ctx, rule); @@ -3053,10 +3124,24 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, return PTR_ERR(table); } - chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); - if (IS_ERR(chain)) { - NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); - return PTR_ERR(chain); + if (nla[NFTA_RULE_CHAIN]) { + chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], + genmask); + if (IS_ERR(chain)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); + return PTR_ERR(chain); + } + if (nft_chain_is_bound(chain)) + return -EOPNOTSUPP; + + } else if (nla[NFTA_RULE_CHAIN_ID]) { + chain = nft_chain_lookup_byid(net, nla[NFTA_RULE_CHAIN_ID]); + if (IS_ERR(chain)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN_ID]); + return PTR_ERR(chain); + } + } else { + return -EINVAL; } if (nla[NFTA_RULE_HANDLE]) { @@ -3155,8 +3240,10 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, expr = nft_expr_first(rule); for (i = 0; i < n; i++) { err = nf_tables_newexpr(&ctx, &info[i], expr); - if (err < 0) + if (err < 0) { + NL_SET_BAD_ATTR(extack, info[i].attr); goto err2; + } if (info[i].ops->validate) nft_validate_state_update(net, NFT_VALIDATE_NEED); @@ -3268,6 +3355,8 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); } + if (nft_chain_is_bound(chain)) + return -EOPNOTSUPP; } nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); @@ -4326,7 +4415,7 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, case NFT_TRANS_ABORT: case NFT_TRANS_RELEASE: set->use--; - /* fall through */ + fallthrough; default: nf_tables_unbind_set(ctx, set, binding, phase == NFT_TRANS_COMMIT); @@ -5220,10 +5309,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^ nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) || nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^ - nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) { - err = -EBUSY; + nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) goto err_element_clash; - } if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) && memcmp(nft_set_ext_data(ext), @@ -5231,7 +5318,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) && nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF) && *nft_set_ext_obj(ext) != *nft_set_ext_obj(ext2))) - err = -EBUSY; + goto err_element_clash; else if (!(nlmsg_flags & NLM_F_EXCL)) 
err = 0; } else if (err == -ENOTEMPTY) { @@ -5328,11 +5415,24 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, */ void nft_data_hold(const struct nft_data *data, enum nft_data_types type) { + struct nft_chain *chain; + struct nft_rule *rule; + if (type == NFT_DATA_VERDICT) { switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: - data->verdict.chain->use++; + chain = data->verdict.chain; + chain->use++; + + if (!nft_chain_is_bound(chain)) + break; + + chain->table->use++; + list_for_each_entry(rule, &chain->rules, list) + chain->use++; + + nft_chain_add(chain->table, chain); break; } } @@ -5545,7 +5645,7 @@ struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, /** * nft_register_obj- register nf_tables stateful object type - * @obj: object type + * @obj_type: object type * * Registers the object type for use with nf_tables. Returns zero on * success or a negative errno code otherwise. @@ -5564,7 +5664,7 @@ EXPORT_SYMBOL_GPL(nft_register_obj); /** * nft_unregister_obj - unregister nf_tables object type - * @obj: object type + * @obj_type: object type * * Unregisters the object type for use with nf_tables. */ @@ -6243,7 +6343,7 @@ void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, case NFT_TRANS_ABORT: case NFT_TRANS_RELEASE: flowtable->use--; - /* fall through */ + fallthrough; default: return; } @@ -6405,7 +6505,7 @@ static int nft_register_flowtable_net_hooks(struct net *net, list_for_each_entry(hook2, &ft->hook_list, list) { if (hook->ops.dev == hook2->ops.dev && hook->ops.pf == hook2->ops.pf) { - err = -EBUSY; + err = -EEXIST; goto err_unregister_net_hooks; } } @@ -7264,7 +7364,7 @@ static int nf_tables_validate(struct net *net) break; case NFT_VALIDATE_NEED: nft_validate_state_update(net, NFT_VALIDATE_DO); - /* fall through */ + fallthrough; case NFT_VALIDATE_DO: list_for_each_entry(table, &net->nft.tables, list) { if (nft_table_validate(net, table) < 0) @@ -7402,6 +7502,12 @@ static void nf_tables_trans_destroy_work(struct work_struct *w) } } +void nf_tables_trans_destroy_flush_work(void) +{ + flush_work(&trans_destroy_work); +} +EXPORT_SYMBOL_GPL(nf_tables_trans_destroy_flush_work); + static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain) { struct nft_rule *rule; @@ -7524,7 +7630,7 @@ static void nft_obj_del(struct nft_object *obj) list_del_rcu(&obj->list); } -static void nft_chain_del(struct nft_chain *chain) +void nft_chain_del(struct nft_chain *chain) { struct nft_table *table = chain->table; @@ -7584,9 +7690,9 @@ static void nf_tables_commit_release(struct net *net) spin_unlock(&nf_tables_destroy_list_lock); nf_tables_module_autoload_cleanup(net); - mutex_unlock(&net->nft.commit_mutex); - schedule_work(&trans_destroy_work); + + mutex_unlock(&net->nft.commit_mutex); } static int nf_tables_commit(struct net *net, struct sk_buff *skb) @@ -7875,6 +7981,10 @@ static int __nf_tables_abort(struct net *net, bool autoload) kfree(nft_trans_chain_name(trans)); nft_trans_destroy(trans); } else { + if (nft_chain_is_bound(trans->ctx.chain)) { + nft_trans_destroy(trans); + break; + } trans->ctx.table->use--; nft_chain_del(trans->ctx.chain); nf_tables_unregister_hook(trans->ctx.net, @@ -8304,6 +8414,7 @@ static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = { [NFTA_VERDICT_CODE] = { .type = NLA_U32 }, [NFTA_VERDICT_CHAIN] = { .type = NLA_STRING, .len = NFT_CHAIN_MAXNAMELEN - 1 }, + [NFTA_VERDICT_CHAIN_ID] = { .type = NLA_U32 }, }; static int nft_verdict_init(const struct nft_ctx *ctx, 
struct nft_data *data, @@ -8333,17 +8444,26 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, default: return -EINVAL; } - /* fall through */ + fallthrough; case NFT_CONTINUE: case NFT_BREAK: case NFT_RETURN: break; case NFT_JUMP: case NFT_GOTO: - if (!tb[NFTA_VERDICT_CHAIN]) + if (tb[NFTA_VERDICT_CHAIN]) { + chain = nft_chain_lookup(ctx->net, ctx->table, + tb[NFTA_VERDICT_CHAIN], + genmask); + } else if (tb[NFTA_VERDICT_CHAIN_ID]) { + chain = nft_chain_lookup_byid(ctx->net, + tb[NFTA_VERDICT_CHAIN_ID]); + if (IS_ERR(chain)) + return PTR_ERR(chain); + } else { return -EINVAL; - chain = nft_chain_lookup(ctx->net, ctx->table, - tb[NFTA_VERDICT_CHAIN], genmask); + } + if (IS_ERR(chain)) return PTR_ERR(chain); if (nft_is_base_chain(chain)) @@ -8361,10 +8481,23 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, static void nft_verdict_uninit(const struct nft_data *data) { + struct nft_chain *chain; + struct nft_rule *rule; + switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: - data->verdict.chain->use--; + chain = data->verdict.chain; + chain->use--; + + if (!nft_chain_is_bound(chain)) + break; + + chain->table->use--; + list_for_each_entry(rule, &chain->rules, list) + chain->use--; + + nft_chain_del(chain); break; } } diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 96c74c4c7176..587897a2498b 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -213,7 +213,7 @@ next_rule: jumpstack[stackptr].chain = chain; jumpstack[stackptr].rules = rules + 1; stackptr++; - /* fall through */ + fallthrough; case NFT_GOTO: nft_trace_packet(&info, chain, rule, NFT_TRACETYPE_RULE); diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index c7cf1cde46de..9ef37c1b7b3b 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -312,7 +312,7 @@ static int nft_indr_block_offload_cmd(struct nft_base_chain *basechain, nft_flow_block_offload_init(&bo, dev_net(dev), cmd, basechain, &extack); - err = flow_indr_dev_setup_offload(dev, TC_SETUP_BLOCK, basechain, &bo, + err = flow_indr_dev_setup_offload(dev, NULL, TC_SETUP_BLOCK, basechain, &bo, nft_indr_block_cleanup); if (err < 0) return err; diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 5827117f2635..5bfec829c12f 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * (C) 2011 Pablo Neira Ayuso <pablo@netfilter.org> - * (C) 2011 Intra2net AG <http://www.intra2net.com> + * (C) 2011 Intra2net AG <https://www.intra2net.com> */ #include <linux/init.h> #include <linux/module.h> diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index da915c224a82..89a381f7f945 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -451,7 +451,7 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, case IPPROTO_TCP: timeouts = nf_tcp_pernet(net)->timeouts; break; - case IPPROTO_UDP: /* fallthrough */ + case IPPROTO_UDP: case IPPROTO_UDPLITE: timeouts = nf_udp_pernet(net)->timeouts; break; diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 8a28c127effc..16f4d84599ac 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -43,7 +43,7 @@ void nft_cmp_eval(const struct nft_expr *expr, case NFT_CMP_LT: if (d == 0) goto mismatch; - /* fall through */ + 
fallthrough; case NFT_CMP_LTE: if (d > 0) goto mismatch; @@ -51,7 +51,7 @@ void nft_cmp_eval(const struct nft_expr *expr, case NFT_CMP_GT: if (d == 0) goto mismatch; - /* fall through */ + fallthrough; case NFT_CMP_GTE: if (d < 0) goto mismatch; diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index aa1a066cb74b..6428856ccbec 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -27,6 +27,8 @@ struct nft_xt_match_priv { void *info; }; +static refcount_t nft_compat_pending_destroy = REFCOUNT_INIT(1); + static int nft_compat_chain_validate_dependency(const struct nft_ctx *ctx, const char *tablename) { @@ -236,6 +238,15 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv); + /* xtables matches or targets can have side effects, e.g. + * creation/destruction of /proc files. + * The xt ->destroy functions are run asynchronously from + * work queue. If we have pending invocations we thus + * need to wait for those to finish. + */ + if (refcount_read(&nft_compat_pending_destroy) > 1) + nf_tables_trans_destroy_flush_work(); + ret = xt_check_target(&par, size, proto, inv); if (ret < 0) return ret; @@ -247,6 +258,13 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return 0; } +static void __nft_mt_tg_destroy(struct module *me, const struct nft_expr *expr) +{ + refcount_dec(&nft_compat_pending_destroy); + module_put(me); + kfree(expr->ops); +} + static void nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { @@ -262,8 +280,7 @@ nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) if (par.target->destroy != NULL) par.target->destroy(&par); - module_put(me); - kfree(expr->ops); + __nft_mt_tg_destroy(me, expr); } static int nft_extension_dump_info(struct sk_buff *skb, int attr, @@ -494,8 +511,7 @@ __nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr, if (par.match->destroy != NULL) par.match->destroy(&par); - module_put(me); - kfree(expr->ops); + __nft_mt_tg_destroy(me, expr); } static void @@ -700,6 +716,14 @@ static const struct nfnetlink_subsystem nfnl_compat_subsys = { static struct nft_expr_type nft_match_type; +static void nft_mt_tg_deactivate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + enum nft_trans_phase phase) +{ + if (phase == NFT_TRANS_COMMIT) + refcount_inc(&nft_compat_pending_destroy); +} + static const struct nft_expr_ops * nft_match_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -738,6 +762,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, ops->type = &nft_match_type; ops->eval = nft_match_eval; ops->init = nft_match_init; + ops->deactivate = nft_mt_tg_deactivate, ops->destroy = nft_match_destroy; ops->dump = nft_match_dump; ops->validate = nft_match_validate; @@ -828,6 +853,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, ops->size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize)); ops->init = nft_target_init; ops->destroy = nft_target_destroy; + ops->deactivate = nft_mt_tg_deactivate, ops->dump = nft_target_dump; ops->validate = nft_target_validate; ops->data = target; @@ -891,6 +917,8 @@ static void __exit nft_compat_module_exit(void) nfnetlink_subsys_unregister(&nfnl_compat_subsys); nft_unregister_expr(&nft_target_type); nft_unregister_expr(&nft_match_type); + + WARN_ON_ONCE(refcount_read(&nft_compat_pending_destroy) != 1); } MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFT_COMPAT); diff --git a/net/netfilter/nft_ct.c 
b/net/netfilter/nft_ct.c index 77258af1fce0..322bd674963e 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -129,7 +129,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr, return; } #endif - case NFT_CT_BYTES: /* fallthrough */ + case NFT_CT_BYTES: case NFT_CT_PKTS: { const struct nf_conn_acct *acct = nf_conn_acct_find(ct); u64 count = 0; @@ -1013,8 +1013,8 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, help6 = nf_conntrack_helper_try_module_get(name, family, priv->l4proto); break; - case NFPROTO_NETDEV: /* fallthrough */ - case NFPROTO_BRIDGE: /* same */ + case NFPROTO_NETDEV: + case NFPROTO_BRIDGE: case NFPROTO_INET: help4 = nf_conntrack_helper_try_module_get(name, NFPROTO_IPV4, priv->l4proto); diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index cfac0964f48d..4dfdaeaf09a5 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -32,7 +32,7 @@ int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, unsigned int hooks; switch (priv->result) { - case NFT_FIB_RESULT_OIF: /* fallthrough */ + case NFT_FIB_RESULT_OIF: case NFT_FIB_RESULT_OIFNAME: hooks = (1 << NF_INET_PRE_ROUTING); break; diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index c7f0ef73d939..c63eb3b17178 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -54,6 +54,23 @@ static int nft_immediate_init(const struct nft_ctx *ctx, if (err < 0) goto err1; + if (priv->dreg == NFT_REG_VERDICT) { + struct nft_chain *chain = priv->data.verdict.chain; + + switch (priv->data.verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + if (nft_chain_is_bound(chain)) { + err = -EBUSY; + goto err1; + } + chain->bound = true; + break; + default: + break; + } + } + return 0; err1: @@ -81,6 +98,39 @@ static void nft_immediate_deactivate(const struct nft_ctx *ctx, return nft_data_release(&priv->data, nft_dreg_to_type(priv->dreg)); } +static void nft_immediate_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + const struct nft_data *data = &priv->data; + struct nft_rule *rule, *n; + struct nft_ctx chain_ctx; + struct nft_chain *chain; + + if (priv->dreg != NFT_REG_VERDICT) + return; + + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + chain = data->verdict.chain; + + if (!nft_chain_is_bound(chain)) + break; + + chain_ctx = *ctx; + chain_ctx.chain = chain; + + list_for_each_entry_safe(rule, n, &chain->rules, list) + nf_tables_rule_release(&chain_ctx, rule); + + nf_tables_chain_destroy(&chain_ctx); + break; + default: + break; + } +} + static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); @@ -170,6 +220,7 @@ static const struct nft_expr_ops nft_imm_ops = { .init = nft_immediate_init, .activate = nft_immediate_activate, .deactivate = nft_immediate_deactivate, + .destroy = nft_immediate_destroy, .dump = nft_immediate_dump, .validate = nft_immediate_validate, .offload = nft_immediate_offload, diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 951b6e87ed5d..7bc6537f3ccb 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -253,7 +253,7 @@ static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest, return false; break; case NFT_META_IIFGROUP: - if (!nft_meta_store_ifgroup(dest, nft_out(pkt))) + if (!nft_meta_store_ifgroup(dest, nft_in(pkt))) return false; break; case 
NFT_META_OIFGROUP: diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index a7de3a58f553..ed7cb9f747f6 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -467,7 +467,7 @@ static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt, case IPPROTO_UDP: if (!nft_payload_udp_checksum(skb, pkt->xt.thoff)) return -1; - /* Fall through. */ + fallthrough; case IPPROTO_UDPLITE: *l4csum_offset = offsetof(struct udphdr, check); break; diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index 86eafbb0fdd0..61fb7e8afbf0 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -30,7 +30,8 @@ int nft_reject_validate(const struct nft_ctx *ctx, return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | - (1 << NF_INET_LOCAL_OUT)); + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_PRE_ROUTING)); } EXPORT_SYMBOL_GPL(nft_reject_validate); diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 8c04388296b0..9944523f5c2c 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -312,7 +312,7 @@ * Jay Ligatti, Josh Kuhn, and Chris Gage. * Proceedings of the IEEE International Conference on Computer * Communication Networks (ICCCN), August 2010. - * http://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf + * https://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf * * [Rottenstreich 2010] * Worst-Case TCAM Rule Expansion @@ -325,7 +325,7 @@ * Kirill Kogan, Sergey Nikolenko, Ori Rottenstreich, William Culhane, * and Patrick Eugster. * Proceedings of the 2014 ACM conference on SIGCOMM, August 2014. - * http://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf + * https://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf */ #include <linux/kernel.h> @@ -401,7 +401,7 @@ int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst, * nft_pipapo_lookup() - Lookup function * @net: Network namespace * @set: nftables API set representation - * @elem: nftables API element representation containing key data + * @key: nftables API element representation containing key data * @ext: nftables API extension pointer, filled with matching reference * * For more details, see DOC: Theory of Operation. @@ -1075,7 +1075,7 @@ out: * @m: Matching data, including mapping table * @map: Table of rule maps: array of first rule and amount of rules * in next field a given rule maps to, for each field - * @ext: For last field, nft_set_ext pointer matching rules map to + * @e: For last field, nft_set_ext pointer matching rules map to */ static void pipapo_map(struct nft_pipapo_match *m, union nft_pipapo_map_bucket map[NFT_PIPAPO_MAX_FIELDS], @@ -1099,7 +1099,7 @@ static void pipapo_map(struct nft_pipapo_match *m, /** * pipapo_realloc_scratch() - Reallocate scratch maps for partial match results * @clone: Copy of matching data with pending insertions and deletions - * @bsize_max Maximum bucket size, scratch maps cover two buckets + * @bsize_max: Maximum bucket size, scratch maps cover two buckets * * Return: 0 on success, -ENOMEM on failure. 
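
The nft_immediate_init()/nft_immediate_destroy() hunks earlier in this chunk are where a bound anonymous chain changes hands: the jump/goto verdict takes ownership (chain->bound = true), and when the immediate expression is destroyed it must release every rule in the chain before destroying the chain itself. A minimal sketch of that ownership pattern, with hypothetical list types rather than the kernel's structures:

    #include <stdlib.h>

    struct rule {
        struct rule *next;
    };

    struct chain {
        int bound;              /* set when an expression took ownership */
        struct rule *rules;
    };

    static void rule_release(struct rule *r) { free(r); }

    /* Destroy a verdict's target chain only if this expression owns it;
     * named (unbound) chains outlive the expression. */
    static void verdict_destroy(struct chain *c)
    {
        struct rule *r, *n;

        if (!c->bound)
            return;
        for (r = c->rules; r; r = n) {
            n = r->next;        /* safe iteration while freeing */
            rule_release(r);
        }
        free(c);
    }
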
*/ @@ -1249,8 +1249,6 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, if (err) return err; - this_cpu_write(nft_pipapo_scratch_index, false); - m->bsize_max = bsize_max; } else { put_cpu_ptr(m->scratch); @@ -1449,7 +1447,7 @@ static void pipapo_unmap(union nft_pipapo_map_bucket *mt, int rules, /** * pipapo_drop() - Delete entry from lookup and mapping tables, given rule map * @m: Matching data - * @rulemap Table of rule maps, arrays of first rule and amount of rules + * @rulemap: Table of rule maps, arrays of first rule and amount of rules * in next field a given entry maps to, for each field * * For each rule in lookup table buckets mapping to this set of rules, drop diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c index 51b454d8fa9c..cedf47ab3c6f 100644 --- a/net/netfilter/utils.c +++ b/net/netfilter/utils.c @@ -25,7 +25,7 @@ __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, skb->ip_summed = CHECKSUM_UNNECESSARY; break; } - /* fall through */ + fallthrough; case CHECKSUM_NONE: if (protocol != IPPROTO_TCP && protocol != IPPROTO_UDP) skb->csum = 0; @@ -51,7 +51,7 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, case CHECKSUM_COMPLETE: if (len == skb->len - dataoff) return nf_ip_checksum(skb, hook, dataoff, protocol); - /* fall through */ + fallthrough; case CHECKSUM_NONE: skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol, skb->len - dataoff, 0); @@ -79,7 +79,7 @@ __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, skb->ip_summed = CHECKSUM_UNNECESSARY; break; } - /* fall through */ + fallthrough; case CHECKSUM_NONE: skb->csum = ~csum_unfold( csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, @@ -106,7 +106,7 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, case CHECKSUM_COMPLETE: if (len == skb->len - dataoff) return nf_ip6_checksum(skb, hook, dataoff, protocol); - /* fall through */ + fallthrough; case CHECKSUM_NONE: hsum = skb_checksum(skb, 0, dataoff, 0); skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr, diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 9ad8f3ff66f5..af22dbe85e2c 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1028,34 +1028,34 @@ int xt_check_target(struct xt_tgchk_param *par, EXPORT_SYMBOL_GPL(xt_check_target); /** - * xt_copy_counters_from_user - copy counters and metadata from userspace + * xt_copy_counters - copy counters and metadata from a sockptr_t * - * @user: src pointer to userspace memory + * @arg: src sockptr * @len: alleged size of userspace memory * @info: where to store the xt_counters_info metadata - * @compat: true if we setsockopt call is done by 32bit task on 64bit kernel * * Copies counter meta data from @user and stores it in @info. * * vmallocs memory to hold the counters, then copies the counter data * from @user to the new memory and returns a pointer to it. * - * If @compat is true, @info gets converted automatically to the 64bit - * representation. + * If called from a compat syscall, @info gets converted automatically to the + * 64bit representation. * * The metadata associated with the counters is stored in @info. * * Return: returns pointer that caller has to test via IS_ERR(). * If IS_ERR is false, caller has to vfree the pointer. 
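
The xt_copy_counters() conversion above is part of the wider sockptr_t migration visible throughout this series: one tagged pointer type that copy helpers can read from either the user or the kernel address space, so the same setsockopt path serves both callers. A userspace analogue of the idea (illustrative only; both arms collapse to memcpy() here because userspace has a single address space, where the kernel's user arm would be copy_from_user()):

    #include <string.h>
    #include <stdbool.h>
    #include <stddef.h>

    typedef struct {
        union {
            void *kernel;
            const void *user;
        };
        bool is_kernel;
    } sockptr_like;

    /* Mirrors the copy_from_sockptr_offset() shape used above, where the
     * compat path skips sizeof(compat_tmp) bytes before the payload. */
    static int copy_from_sockptr_offset_like(void *dst, sockptr_like src,
                                             size_t offset, size_t len)
    {
        if (src.is_kernel)
            memcpy(dst, (const char *)src.kernel + offset, len);
        else
            memcpy(dst, (const char *)src.user + offset, len);
        return 0;
    }
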
*/ -void *xt_copy_counters_from_user(const void __user *user, unsigned int len, - struct xt_counters_info *info, bool compat) +void *xt_copy_counters(sockptr_t arg, unsigned int len, + struct xt_counters_info *info) { + size_t offset; void *mem; u64 size; #ifdef CONFIG_COMPAT - if (compat) { + if (in_compat_syscall()) { /* structures only differ in size due to alignment */ struct compat_xt_counters_info compat_tmp; @@ -1063,12 +1063,12 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len, return ERR_PTR(-EINVAL); len -= sizeof(compat_tmp); - if (copy_from_user(&compat_tmp, user, sizeof(compat_tmp)) != 0) + if (copy_from_sockptr(&compat_tmp, arg, sizeof(compat_tmp)) != 0) return ERR_PTR(-EFAULT); memcpy(info->name, compat_tmp.name, sizeof(info->name) - 1); info->num_counters = compat_tmp.num_counters; - user += sizeof(compat_tmp); + offset = sizeof(compat_tmp); } else #endif { @@ -1076,10 +1076,10 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len, return ERR_PTR(-EINVAL); len -= sizeof(*info); - if (copy_from_user(info, user, sizeof(*info)) != 0) + if (copy_from_sockptr(info, arg, sizeof(*info)) != 0) return ERR_PTR(-EFAULT); - user += sizeof(*info); + offset = sizeof(*info); } info->name[sizeof(info->name) - 1] = '\0'; @@ -1093,13 +1093,13 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len, if (!mem) return ERR_PTR(-ENOMEM); - if (copy_from_user(mem, user, len) == 0) + if (copy_from_sockptr_offset(mem, arg, offset, len) == 0) return mem; vfree(mem); return ERR_PTR(-EFAULT); } -EXPORT_SYMBOL_GPL(xt_copy_counters_from_user); +EXPORT_SYMBOL_GPL(xt_copy_counters); #ifdef CONFIG_COMPAT int xt_compat_target_offset(const struct xt_target *target) @@ -1572,7 +1572,7 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos, trav->curr = trav->curr->next; if (trav->curr != trav->head) break; - /* fall through */ + fallthrough; default: return NULL; } diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c index a5c8b653476a..76acecf3e757 100644 --- a/net/netfilter/xt_CONNSECMARK.c +++ b/net/netfilter/xt_CONNSECMARK.c @@ -6,7 +6,7 @@ * with the SECMARK target and state match. 
* * Based somewhat on CONNMARK: - * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com> + * Copyright (C) 2002,2004 MARA Systems AB <https://www.marasystems.com> * by Henrik Nordstrom <hno@marasystems.com> * * (C) 2006,2008 Red Hat, Inc., James Morris <jmorris@redhat.com> diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index eec2f3a88d73..e5ebc0810675 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -2,7 +2,7 @@ /* * xt_connmark - Netfilter module to operate on connection marks * - * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com> + * Copyright (C) 2002,2004 MARA Systems AB <https://www.marasystems.com> * by Henrik Nordstrom <hno@marasystems.com> * Copyright © CC Computer Consultants GmbH, 2007 - 2008 * Jan Engelhardt <jengelh@medozas.de> diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c index 5aab6df74e0f..a97c2259bbc8 100644 --- a/net/netfilter/xt_nfacct.c +++ b/net/netfilter/xt_nfacct.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * (C) 2011 Pablo Neira Ayuso <pablo@netfilter.org> - * (C) 2011 Intra2net AG <http://www.intra2net.com> + * (C) 2011 Intra2net AG <https://www.intra2net.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index 67cb98489415..6aa12d0f54e2 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -5,7 +5,7 @@ * based on ipt_time by Fabrice MARIE <fabrice@netfilter.org> * This is a module which is used for time matching * It is using some modified code from dietlibc (localtime() function) - * that you can find at http://www.fefe.de/dietlibc/ + * that you can find at https://www.fefe.de/dietlibc/ * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from gnu.org/gpl. 
*/ diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c index a1f2320ecc16..d07de2c0fbc7 100644 --- a/net/netlabel/netlabel_domainhash.c +++ b/net/netlabel/netlabel_domainhash.c @@ -92,7 +92,7 @@ static void netlbl_domhsh_free_entry(struct rcu_head *entry) /** * netlbl_domhsh_hash - Hashing function for the domain hash table - * @domain: the domain name to hash + * @key: the domain name to hash * * Description: * This is the hashing function for the domain hash table, it returns the diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 4f2c3b14ddbf..b5f30d7d30d0 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -60,6 +60,7 @@ #include <linux/genetlink.h> #include <linux/net_namespace.h> #include <linux/nospec.h> +#include <linux/btf_ids.h> #include <net/net_namespace.h> #include <net/netns/generic.h> @@ -1620,7 +1621,7 @@ static void netlink_update_socket_mc(struct netlink_sock *nlk, } static int netlink_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); @@ -1631,7 +1632,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, return -ENOPROTOOPT; if (optlen >= sizeof(int) && - get_user(val, (unsigned int __user *)optval)) + copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (optname) { @@ -2803,21 +2804,29 @@ static const struct rhashtable_params netlink_rhashtable_params = { }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) -static const struct bpf_iter_reg netlink_reg_info = { - .target = "netlink", +BTF_ID_LIST(btf_netlink_sock_id) +BTF_ID(struct, netlink_sock) + +static const struct bpf_iter_seq_info netlink_seq_info = { .seq_ops = &netlink_seq_ops, .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct nl_seq_iter), +}; + +static struct bpf_iter_reg netlink_reg_info = { + .target = "netlink", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__netlink, sk), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &netlink_seq_info, }; static int __init bpf_iter_register(void) { + netlink_reg_info.ctx_arg_info[0].btf_id = *btf_netlink_sock_id; return bpf_iter_reg_target(&netlink_reg_info); } #endif diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index f90ef6934b8f..6d16e1ab1a8a 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -294,7 +294,7 @@ void nr_destroy_socket(struct sock *sk) */ static int nr_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); @@ -306,7 +306,7 @@ static int nr_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(unsigned int)) return -EINVAL; - if (get_user(opt, (unsigned int __user *)optval)) + if (copy_from_sockptr(&opt, optval, sizeof(unsigned int))) return -EFAULT; switch (optname) { diff --git a/net/nfc/core.c b/net/nfc/core.c index c5f9c3ee82f8..eb377f87bcae 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -704,7 +704,6 @@ EXPORT_SYMBOL(nfc_tm_deactivated); * nfc_alloc_send_skb - allocate a skb for data exchange responses * * @size: size to allocate - * @gfp: gfp flags */ struct sk_buff *nfc_alloc_send_skb(struct nfc_dev *dev, struct sock *sk, unsigned int flags, unsigned int size, @@ -749,7 +748,7 @@ 
EXPORT_SYMBOL(nfc_alloc_recv_skb); * * @dev: The nfc device that found the targets * @targets: array of nfc targets found - * @ntargets: targets array size + * @n_targets: targets array size * * The device driver must call this function when one or many nfc targets * are found. After calling this function, the device driver must stop diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 28604414dec1..d257ed3b732a 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -218,7 +218,7 @@ error: } static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); @@ -241,7 +241,7 @@ static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname, break; } - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } @@ -263,7 +263,7 @@ static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname, break; } - if (get_user(opt, (u32 __user *) optval)) { + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } @@ -921,8 +921,6 @@ static const struct proto_ops llcp_rawsock_ops = { .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .sendmsg = sock_no_sendmsg, .recvmsg = llcp_sock_recvmsg, .mmap = sock_no_mmap, diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 78ea8c94dcba..741da8f81c2b 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -1182,7 +1182,7 @@ EXPORT_SYMBOL(nci_free_device); /** * nci_register_device - register a nci device in the nfc subsystem * - * @dev: The nci device to register + * @ndev: The nci device to register */ int nci_register_device(struct nci_dev *ndev) { @@ -1249,7 +1249,7 @@ EXPORT_SYMBOL(nci_register_device); /** * nci_unregister_device - unregister a nci device in the nfc subsystem * - * @dev: The nci device to unregister + * @ndev: The nci device to unregister */ void nci_unregister_device(struct nci_dev *ndev) { diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index ba5ffd3badd3..b2061b6746ea 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -276,8 +276,6 @@ static const struct proto_ops rawsock_ops = { .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .sendmsg = rawsock_sendmsg, .recvmsg = rawsock_recvmsg, .mmap = sock_no_mmap, @@ -296,8 +294,6 @@ static const struct proto_ops rawsock_raw_ops = { .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .sendmsg = sock_no_sendmsg, .recvmsg = rawsock_recvmsg, .mmap = sock_no_mmap, diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 4340f25fe390..98d393e70de3 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -276,10 +276,6 @@ void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) ovs_ct_update_key(skb, NULL, key, false, false); } -#define IN6_ADDR_INITIALIZER(ADDR) \ - { (ADDR).s6_addr32[0], (ADDR).s6_addr32[1], \ - (ADDR).s6_addr32[2], (ADDR).s6_addr32[3] } - int ovs_ct_put_key(const struct sw_flow_key *swkey, const struct sw_flow_key *output, struct sk_buff *skb) { @@ -301,24 +297,30 @@ int ovs_ct_put_key(const struct sw_flow_key 
*swkey, if (swkey->ct_orig_proto) { if (swkey->eth.type == htons(ETH_P_IP)) { - struct ovs_key_ct_tuple_ipv4 orig = { - output->ipv4.ct_orig.src, - output->ipv4.ct_orig.dst, - output->ct.orig_tp.src, - output->ct.orig_tp.dst, - output->ct_orig_proto, - }; + struct ovs_key_ct_tuple_ipv4 orig; + + memset(&orig, 0, sizeof(orig)); + orig.ipv4_src = output->ipv4.ct_orig.src; + orig.ipv4_dst = output->ipv4.ct_orig.dst; + orig.src_port = output->ct.orig_tp.src; + orig.dst_port = output->ct.orig_tp.dst; + orig.ipv4_proto = output->ct_orig_proto; + if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4, sizeof(orig), &orig)) return -EMSGSIZE; } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - struct ovs_key_ct_tuple_ipv6 orig = { - IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.src), - IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.dst), - output->ct.orig_tp.src, - output->ct.orig_tp.dst, - output->ct_orig_proto, - }; + struct ovs_key_ct_tuple_ipv6 orig; + + memset(&orig, 0, sizeof(orig)); + memcpy(orig.ipv6_src, output->ipv6.ct_orig.src.s6_addr32, + sizeof(orig.ipv6_src)); + memcpy(orig.ipv6_dst, output->ipv6.ct_orig.dst.s6_addr32, + sizeof(orig.ipv6_dst)); + orig.src_port = output->ct.orig_tp.src; + orig.dst_port = output->ct.orig_tp.dst; + orig.ipv6_proto = output->ct_orig_proto; + if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6, sizeof(orig), &orig)) return -EMSGSIZE; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 94b024534987..42f8cc70bb2c 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -130,6 +130,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, const struct dp_upcall_info *, uint32_t cutlen); +static void ovs_dp_masks_rebalance(struct work_struct *work); + /* Must be called with rcu_read_lock or ovs_mutex. */ const char *ovs_dp_name(const struct datapath *dp) { @@ -223,13 +225,14 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) struct dp_stats_percpu *stats; u64 *stats_counter; u32 n_mask_hit; + u32 n_cache_hit; int error; stats = this_cpu_ptr(dp->stats_percpu); /* Look up flow. 
*/ flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb), - &n_mask_hit); + &n_mask_hit, &n_cache_hit); if (unlikely(!flow)) { struct dp_upcall_info upcall; @@ -260,6 +263,7 @@ out: u64_stats_update_begin(&stats->syncp); (*stats_counter)++; stats->n_mask_hit += n_mask_hit; + stats->n_cache_hit += n_cache_hit; u64_stats_update_end(&stats->syncp); } @@ -697,6 +701,7 @@ static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats, stats->n_missed += local_stats.n_missed; stats->n_lost += local_stats.n_lost; mega_stats->n_mask_hit += local_stats.n_mask_hit; + mega_stats->n_cache_hit += local_stats.n_cache_hit; } } @@ -1493,6 +1498,7 @@ static size_t ovs_dp_cmd_msg_size(void) msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_stats)); msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_megaflow_stats)); msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */ + msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_MASKS_CACHE_SIZE */ return msgsize; } @@ -1530,6 +1536,10 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features)) goto nla_put_failure; + if (nla_put_u32(skb, OVS_DP_ATTR_MASKS_CACHE_SIZE, + ovs_flow_tbl_masks_cache_size(&dp->table))) + goto nla_put_failure; + genlmsg_end(skb, ovs_header); return 0; @@ -1594,6 +1604,16 @@ static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) #endif } + if (a[OVS_DP_ATTR_MASKS_CACHE_SIZE]) { + int err; + u32 cache_size; + + cache_size = nla_get_u32(a[OVS_DP_ATTR_MASKS_CACHE_SIZE]); + err = ovs_flow_tbl_masks_cache_resize(&dp->table, cache_size); + if (err) + return err; + } + dp->user_features = user_features; if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) @@ -1882,6 +1902,8 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 }, [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 }, + [OVS_DP_ATTR_MASKS_CACHE_SIZE] = NLA_POLICY_RANGE(NLA_U32, 0, + PCPU_MIN_UNIT_SIZE / sizeof(struct mask_cache_entry)), }; static const struct genl_ops dp_datapath_genl_ops[] = { @@ -2338,6 +2360,23 @@ out: return skb->len; } +static void ovs_dp_masks_rebalance(struct work_struct *work) +{ + struct ovs_net *ovs_net = container_of(work, struct ovs_net, + masks_rebalance.work); + struct datapath *dp; + + ovs_lock(); + + list_for_each_entry(dp, &ovs_net->dps, list_node) + ovs_flow_masks_rebalance(&dp->table); + + ovs_unlock(); + + schedule_delayed_work(&ovs_net->masks_rebalance, + msecs_to_jiffies(DP_MASKS_REBALANCE_INTERVAL)); +} + static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) }, @@ -2432,6 +2471,9 @@ static int __net_init ovs_init_net(struct net *net) INIT_LIST_HEAD(&ovs_net->dps); INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq); + INIT_DELAYED_WORK(&ovs_net->masks_rebalance, ovs_dp_masks_rebalance); + schedule_delayed_work(&ovs_net->masks_rebalance, + msecs_to_jiffies(DP_MASKS_REBALANCE_INTERVAL)); return ovs_ct_init(net); } @@ -2486,6 +2528,7 @@ static void __net_exit ovs_exit_net(struct net *dnet) ovs_unlock(); + cancel_delayed_work_sync(&ovs_net->masks_rebalance); cancel_work_sync(&ovs_net->dp_notify_work); } diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 2016dd107939..38f7d3e66ca6 100644 
--- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -20,8 +20,9 @@ #include "meter.h" #include "vport-internal_dev.h" -#define DP_MAX_PORTS USHRT_MAX -#define DP_VPORT_HASH_BUCKETS 1024 +#define DP_MAX_PORTS USHRT_MAX +#define DP_VPORT_HASH_BUCKETS 1024 +#define DP_MASKS_REBALANCE_INTERVAL 4000 /** * struct dp_stats_percpu - per-cpu packet processing statistics for a given @@ -37,12 +38,15 @@ * @n_mask_hit: Number of masks looked up for flow match. * @n_mask_hit / (@n_hit + @n_missed) will be the average masks looked * up per packet. + * @n_cache_hit: The number of received packets that had their mask found using + * the mask cache. */ struct dp_stats_percpu { u64 n_hit; u64 n_missed; u64 n_lost; u64 n_mask_hit; + u64 n_cache_hit; struct u64_stats_sync syncp; }; @@ -131,6 +135,7 @@ struct dp_upcall_info { struct ovs_net { struct list_head dps; struct work_struct dp_notify_work; + struct delayed_work masks_rebalance; #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) struct ovs_ct_limit_info *ct_limit_info; #endif diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 9d375e74b607..03942c30d83e 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -890,6 +890,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, if (static_branch_unlikely(&tc_recirc_sharing_support)) { tc_ext = skb_ext_find(skb, TC_SKB_EXT); key->recirc_id = tc_ext ? tc_ext->chain : 0; + OVS_CB(skb)->mru = tc_ext ? tc_ext->mru : 0; } else { key->recirc_id = 0; } diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 79252d4887ff..9d3e50c4d29f 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1763,11 +1763,11 @@ static void mask_set_nlattr(struct nlattr *attr, u8 val) * does not include any don't care bit. * @net: Used to determine per-namespace field support. * @match: receives the extracted flow match information. - * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute + * @nla_key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute * sequence. The fields should of the packet that triggered the creation * of this flow. - * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink - * attribute specifies the mask field of the wildcarded flow. + * @nla_mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* + * Netlink attribute specifies the mask field of the wildcarded flow. * @log: Boolean to allow kernel error logging. Normally true, but when * probing for feature compatibility this should be passed in as false to * suppress unnecessary error logging. 
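
The new OVS_DP_ATTR_MASKS_CACHE_SIZE handling above only accepts 0 (cache disabled) or a power of two that fits in a per-CPU allocation, so that lookups can mask the skb hash instead of dividing by the table size. A standalone sketch of that validation and the resulting index math, under the assumption that the constants mirror MC_HASH_SHIFT in flow_table.c:

    #include <stdint.h>
    #include <stdbool.h>
    #include <stddef.h>

    #define MC_HASH_SHIFT 8
    /* Each 32-bit skb hash yields this many independent probe slots. */
    #define MC_HASH_SEGS  ((sizeof(uint32_t) * 8) / MC_HASH_SHIFT)

    static bool is_pow2(uint32_t x) { return x && !(x & (x - 1)); }

    /* 0 disables the cache; otherwise size must be a power of two and
     * the per-CPU array must fit in max_bytes. */
    static bool cache_size_ok(uint32_t size, size_t max_bytes,
                              size_t entry_sz)
    {
        if (size != 0 && !is_pow2(size))
            return false;
        return (uint64_t)size * entry_sz <= max_bytes;
    }

    /* Power-of-two size makes the slot index a cheap mask, not a mod. */
    static uint32_t cache_index(uint32_t skb_hash, int seg, uint32_t size)
    {
        return (skb_hash >> (seg * MC_HASH_SHIFT)) & (size - 1);
    }
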
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 2398d7238300..8c12675cbb67 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -29,6 +29,7 @@ #include <linux/icmp.h> #include <linux/icmpv6.h> #include <linux/rculist.h> +#include <linux/sort.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ndisc.h> @@ -37,8 +38,8 @@ #define MASK_ARRAY_SIZE_MIN 16 #define REHASH_INTERVAL (10 * 60 * HZ) +#define MC_DEFAULT_HASH_ENTRIES 256 #define MC_HASH_SHIFT 8 -#define MC_HASH_ENTRIES (1u << MC_HASH_SHIFT) #define MC_HASH_SEGS ((sizeof(uint32_t) * 8) / MC_HASH_SHIFT) static struct kmem_cache *flow_cache; @@ -169,16 +170,70 @@ static struct table_instance *table_instance_alloc(int new_size) return ti; } +static void __mask_array_destroy(struct mask_array *ma) +{ + free_percpu(ma->masks_usage_cntr); + kfree(ma); +} + +static void mask_array_rcu_cb(struct rcu_head *rcu) +{ + struct mask_array *ma = container_of(rcu, struct mask_array, rcu); + + __mask_array_destroy(ma); +} + +static void tbl_mask_array_reset_counters(struct mask_array *ma) +{ + int i, cpu; + + /* As the per CPU counters are not atomic we can not go ahead and + * reset them from another CPU. To be able to still have an approximate + * zero based counter we store the value at reset, and subtract it + * later when processing. + */ + for (i = 0; i < ma->max; i++) { + ma->masks_usage_zero_cntr[i] = 0; + + for_each_possible_cpu(cpu) { + u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr, + cpu); + unsigned int start; + u64 counter; + + do { + start = u64_stats_fetch_begin_irq(&ma->syncp); + counter = usage_counters[i]; + } while (u64_stats_fetch_retry_irq(&ma->syncp, start)); + + ma->masks_usage_zero_cntr[i] += counter; + } + } +} + static struct mask_array *tbl_mask_array_alloc(int size) { struct mask_array *new; size = max(MASK_ARRAY_SIZE_MIN, size); new = kzalloc(sizeof(struct mask_array) + - sizeof(struct sw_flow_mask *) * size, GFP_KERNEL); + sizeof(struct sw_flow_mask *) * size + + sizeof(u64) * size, GFP_KERNEL); if (!new) return NULL; + new->masks_usage_zero_cntr = (u64 *)((u8 *)new + + sizeof(struct mask_array) + + sizeof(struct sw_flow_mask *) * + size); + + new->masks_usage_cntr = __alloc_percpu(sizeof(u64) * size, + __alignof__(u64)); + if (!new->masks_usage_cntr) { + kfree(new); + return NULL; + } + new->count = 0; new->max = size; @@ -202,10 +257,10 @@ static int tbl_mask_array_realloc(struct flow_table *tbl, int size) if (ovsl_dereference(old->masks[i])) new->masks[new->count++] = old->masks[i]; } + call_rcu(&old->rcu, mask_array_rcu_cb); } rcu_assign_pointer(tbl->mask_array, new); - kfree_rcu(old, rcu); return 0; } @@ -223,6 +278,11 @@ static int tbl_mask_array_add_mask(struct flow_table *tbl, return err; ma = ovsl_dereference(tbl->mask_array); + } else { + /* On every add or delete we need to reset the counters so + * every new mask gets a fair chance of being prioritized. + */ + tbl_mask_array_reset_counters(ma); } BUG_ON(ovsl_dereference(ma->masks[ma_count])); @@ -260,6 +320,9 @@ found: if (ma->max >= (MASK_ARRAY_SIZE_MIN * 2) && ma_count <= (ma->max / 3)) tbl_mask_array_realloc(tbl, ma->max / 2); + else + tbl_mask_array_reset_counters(ma); + } /* Remove 'mask' from the mask list, if it is not needed any more. 
*/ @@ -278,15 +341,79 @@ static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask) } } +static void __mask_cache_destroy(struct mask_cache *mc) +{ + free_percpu(mc->mask_cache); + kfree(mc); +} + +static void mask_cache_rcu_cb(struct rcu_head *rcu) +{ + struct mask_cache *mc = container_of(rcu, struct mask_cache, rcu); + + __mask_cache_destroy(mc); +} + +static struct mask_cache *tbl_mask_cache_alloc(u32 size) +{ + struct mask_cache_entry __percpu *cache = NULL; + struct mask_cache *new; + + /* Only allow size to be 0, or a power of 2, and does not exceed + * percpu allocation size. + */ + if ((!is_power_of_2(size) && size != 0) || + (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE) + return NULL; + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + new->cache_size = size; + if (new->cache_size > 0) { + cache = __alloc_percpu(array_size(sizeof(struct mask_cache_entry), + new->cache_size), + __alignof__(struct mask_cache_entry)); + if (!cache) { + kfree(new); + return NULL; + } + } + + new->mask_cache = cache; + return new; +} +int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size) +{ + struct mask_cache *mc = rcu_dereference(table->mask_cache); + struct mask_cache *new; + + if (size == mc->cache_size) + return 0; + + if ((!is_power_of_2(size) && size != 0) || + (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE) + return -EINVAL; + + new = tbl_mask_cache_alloc(size); + if (!new) + return -ENOMEM; + + rcu_assign_pointer(table->mask_cache, new); + call_rcu(&mc->rcu, mask_cache_rcu_cb); + + return 0; +} + int ovs_flow_tbl_init(struct flow_table *table) { struct table_instance *ti, *ufid_ti; + struct mask_cache *mc; struct mask_array *ma; - table->mask_cache = __alloc_percpu(sizeof(struct mask_cache_entry) * - MC_HASH_ENTRIES, - __alignof__(struct mask_cache_entry)); - if (!table->mask_cache) + mc = tbl_mask_cache_alloc(MC_DEFAULT_HASH_ENTRIES); + if (!mc) return -ENOMEM; ma = tbl_mask_array_alloc(MASK_ARRAY_SIZE_MIN); @@ -304,6 +431,7 @@ int ovs_flow_tbl_init(struct flow_table *table) rcu_assign_pointer(table->ti, ti); rcu_assign_pointer(table->ufid_ti, ufid_ti); rcu_assign_pointer(table->mask_array, ma); + rcu_assign_pointer(table->mask_cache, mc); table->last_rehash = jiffies; table->count = 0; table->ufid_count = 0; @@ -312,9 +440,9 @@ int ovs_flow_tbl_init(struct flow_table *table) free_ti: __table_instance_destroy(ti); free_mask_array: - kfree(ma); + __mask_array_destroy(ma); free_mask_cache: - free_percpu(table->mask_cache); + __mask_cache_destroy(mc); return -ENOMEM; } @@ -390,9 +518,11 @@ void ovs_flow_tbl_destroy(struct flow_table *table) { struct table_instance *ti = rcu_dereference_raw(table->ti); struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti); + struct mask_cache *mc = rcu_dereference_raw(table->mask_cache); + struct mask_array *ma = rcu_dereference_raw(table->mask_array); - free_percpu(table->mask_cache); - kfree_rcu(rcu_dereference_raw(table->mask_array), rcu); + call_rcu(&mc->rcu, mask_cache_rcu_cb); + call_rcu(&ma->rcu, mask_array_rcu_cb); table_instance_destroy(table, ti, ufid_ti, false); } @@ -604,8 +734,10 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, struct mask_array *ma, const struct sw_flow_key *key, u32 *n_mask_hit, + u32 *n_cache_hit, u32 *index) { + u64 *usage_counters = this_cpu_ptr(ma->masks_usage_cntr); struct sw_flow *flow; struct sw_flow_mask *mask; int i; @@ -614,8 +746,13 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, mask 
= rcu_dereference_ovsl(ma->masks[*index]); if (mask) { flow = masked_flow_lookup(ti, key, mask, n_mask_hit); - if (flow) + if (flow) { + u64_stats_update_begin(&ma->syncp); + usage_counters[*index]++; + u64_stats_update_end(&ma->syncp); + (*n_cache_hit)++; return flow; + } } } @@ -631,6 +768,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, flow = masked_flow_lookup(ti, key, mask, n_mask_hit); if (flow) { /* Found */ *index = i; + u64_stats_update_begin(&ma->syncp); + usage_counters[*index]++; + u64_stats_update_end(&ma->syncp); return flow; } } @@ -648,8 +788,10 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, const struct sw_flow_key *key, u32 skb_hash, - u32 *n_mask_hit) + u32 *n_mask_hit, + u32 *n_cache_hit) { + struct mask_cache *mc = rcu_dereference(tbl->mask_cache); struct mask_array *ma = rcu_dereference(tbl->mask_array); struct table_instance *ti = rcu_dereference(tbl->ti); struct mask_cache_entry *entries, *ce; @@ -658,10 +800,13 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, int seg; *n_mask_hit = 0; - if (unlikely(!skb_hash)) { + *n_cache_hit = 0; + if (unlikely(!skb_hash || mc->cache_size == 0)) { u32 mask_index = 0; + u32 cache = 0; - return flow_lookup(tbl, ti, ma, key, n_mask_hit, &mask_index); + return flow_lookup(tbl, ti, ma, key, n_mask_hit, &cache, + &mask_index); } /* Pre and post recirulation flows usually have the same skb_hash @@ -672,17 +817,17 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, ce = NULL; hash = skb_hash; - entries = this_cpu_ptr(tbl->mask_cache); + entries = this_cpu_ptr(mc->mask_cache); /* Find the cache entry 'ce' to operate on. */ for (seg = 0; seg < MC_HASH_SEGS; seg++) { - int index = hash & (MC_HASH_ENTRIES - 1); + int index = hash & (mc->cache_size - 1); struct mask_cache_entry *e; e = &entries[index]; if (e->skb_hash == skb_hash) { flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, - &e->mask_index); + n_cache_hit, &e->mask_index); if (!flow) e->skb_hash = 0; return flow; @@ -695,10 +840,12 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, } /* Cache miss, do full lookup. 
*/ - flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &ce->mask_index); + flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, n_cache_hit, + &ce->mask_index); if (flow) ce->skb_hash = skb_hash; + *n_cache_hit = 0; return flow; } @@ -708,9 +855,10 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array); u32 __always_unused n_mask_hit; + u32 __always_unused n_cache_hit; u32 index = 0; - return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &index); + return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index); } struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, @@ -787,6 +935,13 @@ int ovs_flow_tbl_num_masks(const struct flow_table *table) return READ_ONCE(ma->count); } +u32 ovs_flow_tbl_masks_cache_size(const struct flow_table *table) +{ + struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache); + + return READ_ONCE(mc->cache_size); +} + static struct table_instance *table_instance_expand(struct table_instance *ti, bool ufid) { @@ -934,6 +1089,98 @@ int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, return 0; } +static int compare_mask_and_count(const void *a, const void *b) +{ + const struct mask_count *mc_a = a; + const struct mask_count *mc_b = b; + + return (s64)mc_b->counter - (s64)mc_a->counter; +} + +/* Must be called with OVS mutex held. */ +void ovs_flow_masks_rebalance(struct flow_table *table) +{ + struct mask_array *ma = rcu_dereference_ovsl(table->mask_array); + struct mask_count *masks_and_count; + struct mask_array *new; + int masks_entries = 0; + int i; + + /* Build array of all current entries with use counters. */ + masks_and_count = kmalloc_array(ma->max, sizeof(*masks_and_count), + GFP_KERNEL); + if (!masks_and_count) + return; + + for (i = 0; i < ma->max; i++) { + struct sw_flow_mask *mask; + unsigned int start; + int cpu; + + mask = rcu_dereference_ovsl(ma->masks[i]); + if (unlikely(!mask)) + break; + + masks_and_count[i].index = i; + masks_and_count[i].counter = 0; + + for_each_possible_cpu(cpu) { + u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr, + cpu); + u64 counter; + + do { + start = u64_stats_fetch_begin_irq(&ma->syncp); + counter = usage_counters[i]; + } while (u64_stats_fetch_retry_irq(&ma->syncp, start)); + + masks_and_count[i].counter += counter; + } + + /* Subtract the zero count value. */ + masks_and_count[i].counter -= ma->masks_usage_zero_cntr[i]; + + /* Rather than calling tbl_mask_array_reset_counters() + * below when no change is needed, do it inline here. + */ + ma->masks_usage_zero_cntr[i] += masks_and_count[i].counter; + } + + if (i == 0) + goto free_mask_entries; + + /* Sort the entries */ + masks_entries = i; + sort(masks_and_count, masks_entries, sizeof(*masks_and_count), + compare_mask_and_count, NULL); + + /* If the order is the same, nothing to do... */ + for (i = 0; i < masks_entries; i++) { + if (i != masks_and_count[i].index) + break; + } + if (i == masks_entries) + goto free_mask_entries; + + /* Rebuilt the new list in order of usage. */ + new = tbl_mask_array_alloc(ma->max); + if (!new) + goto free_mask_entries; + + for (i = 0; i < masks_entries; i++) { + int index = masks_and_count[i].index; + + if (ovsl_dereference(ma->masks[index])) + new->masks[new->count++] = ma->masks[index]; + } + + rcu_assign_pointer(table->mask_array, new); + call_rcu(&ma->rcu, mask_array_rcu_cb); + +free_mask_entries: + kfree(masks_and_count); +} + /* Initializes the flow module. 
* Returns zero if successful or a negative error code. */ int ovs_flow_init(void) diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 8a5cea6ae111..74ce48fecba9 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -27,9 +27,23 @@ struct mask_cache_entry { u32 mask_index; }; +struct mask_cache { + struct rcu_head rcu; + u32 cache_size; /* Must be ^2 value. */ + struct mask_cache_entry __percpu *mask_cache; +}; + +struct mask_count { + int index; + u64 counter; +}; + struct mask_array { struct rcu_head rcu; int count, max; + u64 __percpu *masks_usage_cntr; + u64 *masks_usage_zero_cntr; + struct u64_stats_sync syncp; struct sw_flow_mask __rcu *masks[]; }; @@ -45,7 +59,7 @@ struct table_instance { struct flow_table { struct table_instance __rcu *ti; struct table_instance __rcu *ufid_ti; - struct mask_cache_entry __percpu *mask_cache; + struct mask_cache __rcu *mask_cache; struct mask_array __rcu *mask_array; unsigned long last_rehash; unsigned int count; @@ -69,12 +83,15 @@ int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, const struct sw_flow_mask *mask); void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow); int ovs_flow_tbl_num_masks(const struct flow_table *table); +u32 ovs_flow_tbl_masks_cache_size(const struct flow_table *table); +int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size); struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table, u32 *bucket, u32 *idx); struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *, const struct sw_flow_key *, u32 skb_hash, - u32 *n_mask_hit); + u32 *n_mask_hit, + u32 *n_cache_hit); struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, const struct sw_flow_key *); struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, @@ -86,4 +103,7 @@ bool ovs_flow_cmp(const struct sw_flow *, const struct sw_flow_match *); void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, bool full, const struct sw_flow_mask *mask); + +void ovs_flow_masks_rebalance(struct flow_table *table); + #endif /* flow_table.h */ diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 47febb4504f0..0d44c5c013fa 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -87,6 +87,7 @@ EXPORT_SYMBOL_GPL(ovs_vport_ops_unregister); /** * ovs_vport_locate - find a port that has already been created * + * @net: network namespace * @name: name of port to find * * Must be called with ovs or RCU read lock. @@ -418,7 +419,7 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) * * @vport: vport that received the packet * @skb: skb that was received - * @tun_key: tunnel (if any) that carried packet + * @tun_info: tunnel (if any) that carried packet * * Must be called with rcu_read_lock. The packet cannot be shared and * skb->data should point to the Ethernet header. 
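
ovs_flow_masks_rebalance() above periodically reorders the mask array by the per-CPU usage counters so the hottest masks are probed first, and skips the rebuild entirely when sorting does not change the order. A compact userspace model of that decision step (descending sort as in compare_mask_and_count(); qsort() stands in for the kernel's sort(), and the explicit comparisons avoid the signed-subtraction overflow a naive comparator would risk):

    #include <stdlib.h>
    #include <stdint.h>
    #include <stdbool.h>

    struct mask_count {
        int index;          /* position in the current mask array */
        uint64_t counter;   /* hits since the last counter reset */
    };

    static int cmp_desc(const void *a, const void *b)
    {
        const struct mask_count *ma = a, *mb = b;

        if (mb->counter > ma->counter) return 1;
        if (mb->counter < ma->counter) return -1;
        return 0;
    }

    /* Returns true if sorting changed the probe order, i.e. building
     * a reordered mask array is actually worth the RCU churn. */
    static bool rebalance_order_changed(struct mask_count *mc, int n)
    {
        int i;

        qsort(mc, n, sizeof(*mc), cmp_desc);
        for (i = 0; i < n; i++)
            if (mc[i].index != i)
                return true;
        return false;
    }
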
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 29bd405adbbd..0b8160d1a6e0 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -593,6 +593,7 @@ static void init_prb_bdqc(struct packet_sock *po, req_u->req3.tp_block_size); p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov); p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; + rwlock_init(&p1->blk_fill_in_prog_lock); p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv); prb_init_ft_ops(p1, req_u); @@ -659,10 +660,9 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t) * */ if (BLOCK_NUM_PKTS(pbd)) { - while (atomic_read(&pkc->blk_fill_in_prog)) { - /* Waiting for skb_copy_bits to finish... */ - cpu_relax(); - } + /* Waiting for skb_copy_bits to finish... */ + write_lock(&pkc->blk_fill_in_prog_lock); + write_unlock(&pkc->blk_fill_in_prog_lock); } if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) { @@ -921,10 +921,9 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc, * the timer-handler already handled this case. */ if (!(status & TP_STATUS_BLK_TMO)) { - while (atomic_read(&pkc->blk_fill_in_prog)) { - /* Waiting for skb_copy_bits to finish... */ - cpu_relax(); - } + /* Waiting for skb_copy_bits to finish... */ + write_lock(&pkc->blk_fill_in_prog_lock); + write_unlock(&pkc->blk_fill_in_prog_lock); } prb_close_block(pkc, pbd, po, status); return; @@ -944,7 +943,8 @@ static int prb_queue_frozen(struct tpacket_kbdq_core *pkc) static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); - atomic_dec(&pkc->blk_fill_in_prog); + + read_unlock(&pkc->blk_fill_in_prog_lock); } static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc, @@ -998,7 +998,7 @@ static void prb_fill_curr_block(char *curr, pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len); BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len); BLOCK_NUM_PKTS(pbd) += 1; - atomic_inc(&pkc->blk_fill_in_prog); + read_lock(&pkc->blk_fill_in_prog_lock); prb_run_all_ft_ops(pkc, ppd); } @@ -1536,7 +1536,7 @@ static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new) } } -static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data, +static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data, unsigned int len) { struct bpf_prog *new; @@ -1545,10 +1545,10 @@ static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data, if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) return -EPERM; - if (len != sizeof(fprog)) - return -EINVAL; - if (copy_from_user(&fprog, data, len)) - return -EFAULT; + + ret = copy_bpf_fprog_from_user(&fprog, data, len); + if (ret) + return ret; ret = bpf_prog_create_from_user(&new, &fprog, NULL, false); if (ret) @@ -1558,7 +1558,7 @@ static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data, return 0; } -static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data, +static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data, unsigned int len) { struct bpf_prog *new; @@ -1568,7 +1568,7 @@ static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data, return -EPERM; if (len != sizeof(fd)) return -EINVAL; - if (copy_from_user(&fd, data, len)) + if (copy_from_sockptr(&fd, data, len)) return -EFAULT; new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); @@ -1579,7 +1579,7 @@ static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data, return 0; } -static int fanout_set_data(struct packet_sock *po, char __user *data, 
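The af_packet hunks above replace the atomic_t blk_fill_in_prog busy-wait with blk_fill_in_prog_lock: fillers hold the lock shared around the copy, and the block-retire paths take and immediately release it exclusively, which by construction cannot happen until every in-flight filler has unlocked. A minimal pthreads sketch of that "lock-flush" idiom (names are illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t fill_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Fast path: hold the lock shared while copying into the block. */
static void fill_one_frame(void)
{
	pthread_rwlock_rdlock(&fill_lock);
	/* ... copy packet data into the ring block ... */
	pthread_rwlock_unlock(&fill_lock);
}

/* Retire path: acquiring and releasing the lock exclusively is a barrier
 * that waits for every in-flight filler, replacing a cpu_relax() spin. */
static void wait_for_fillers(void)
{
	pthread_rwlock_wrlock(&fill_lock);
	pthread_rwlock_unlock(&fill_lock);
	printf("no filler holds the block now\n");
}

int main(void)
{
	fill_one_frame();
	wait_for_fillers();
	return 0;
}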
+static int fanout_set_data(struct packet_sock *po, sockptr_t data, unsigned int len) { switch (po->fanout->type) { @@ -3652,7 +3652,8 @@ static void packet_flush_mclist(struct sock *sk) } static int -packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) +packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, + unsigned int optlen) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); @@ -3672,7 +3673,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv return -EINVAL; if (len > sizeof(mreq)) len = sizeof(mreq); - if (copy_from_user(&mreq, optval, len)) + if (copy_from_sockptr(&mreq, optval, len)) return -EFAULT; if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address))) return -EINVAL; @@ -3703,7 +3704,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen < len) { ret = -EINVAL; } else { - if (copy_from_user(&req_u.req, optval, len)) + if (copy_from_sockptr(&req_u.req, optval, len)) ret = -EFAULT; else ret = packet_set_ring(sk, &req_u, 0, @@ -3718,7 +3719,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; pkt_sk(sk)->copy_thresh = val; @@ -3730,7 +3731,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (val) { case TPACKET_V1: @@ -3756,7 +3757,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (val > INT_MAX) return -EINVAL; @@ -3776,7 +3777,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); @@ -3795,7 +3796,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen < sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); @@ -3809,7 +3810,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen < sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); @@ -3825,7 +3826,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv return -EINVAL; if (optlen < sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); @@ -3844,7 +3845,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; po->tp_tstamp = val; @@ -3856,7 +3857,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != 
sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; return fanout_add(sk, val & 0xffff, val >> 16); @@ -3874,7 +3875,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (val < 0 || val > 1) return -EINVAL; @@ -3888,7 +3889,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); @@ -3907,7 +3908,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; po->xmit = val ? packet_direct_xmit : dev_queue_xmit; @@ -4040,28 +4041,6 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, return 0; } - -#ifdef CONFIG_COMPAT -static int compat_packet_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) -{ - struct packet_sock *po = pkt_sk(sock->sk); - - if (level != SOL_PACKET) - return -ENOPROTOOPT; - - if (optname == PACKET_FANOUT_DATA && - po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) { - optval = (char __user *)get_compat_bpf_fprog(optval); - if (!optval) - return -EFAULT; - optlen = sizeof(struct sock_fprog); - } - - return packet_setsockopt(sock, level, optname, optval, optlen); -} -#endif - static int packet_notifier(struct notifier_block *this, unsigned long msg, void *ptr) { @@ -4293,7 +4272,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, struct packet_ring_buffer *rb; struct sk_buff_head *rb_queue; __be16 num; - int err = -EINVAL; + int err; /* Added to avoid minimal code churn */ struct tpacket_req *req = &req_u->req; @@ -4525,8 +4504,6 @@ static const struct proto_ops packet_ops_spkt = { .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .sendmsg = packet_sendmsg_spkt, .recvmsg = packet_recvmsg, .mmap = sock_no_mmap, @@ -4549,9 +4526,6 @@ static const struct proto_ops packet_ops = { .shutdown = sock_no_shutdown, .setsockopt = packet_setsockopt, .getsockopt = packet_getsockopt, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_packet_setsockopt, -#endif .sendmsg = packet_sendmsg, .recvmsg = packet_recvmsg, .mmap = packet_mmap, diff --git a/net/packet/internal.h b/net/packet/internal.h index 907f4cd2a718..fd41ecb7f605 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -39,7 +39,7 @@ struct tpacket_kbdq_core { char *nxt_offset; struct sk_buff *skb; - atomic_t blk_fill_in_prog; + rwlock_t blk_fill_in_prog_lock; /* Default is set to 8ms */ #define DEFAULT_PRB_RETIRE_TOV (8) diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 4577e43cb777..e47d09aca4af 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -975,7 +975,7 @@ static int pep_init(struct sock *sk) } static int pep_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct pep_sock *pn = pep_sk(sk); int val = 0, err = 0; @@ -983,7 +983,7 @@ static 
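Nearly every setsockopt conversion in this section follows the same pattern: the "char __user *optval" parameter becomes a sockptr_t, and copy_from_user()/get_user() become copy_from_sockptr(), so one handler can serve both user-space and in-kernel callers. A much-simplified userspace model of the idea (the real type lives in include/linux/sockptr.h and is not this struct):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Simplified model of the kernel's sockptr_t: a pointer tagged with its
 * origin so one copy helper can serve both kernel and user callers. */
typedef struct {
	void *ptr;
	bool is_kernel;
} sockptr_t;

static int copy_from_sockptr(void *dst, sockptr_t src, size_t size)
{
	/* The real helper uses copy_from_user() for user pointers;
	 * here both arms are a plain memcpy for illustration. */
	memcpy(dst, src.ptr, size);
	return 0;
}

static int demo_setsockopt(sockptr_t optval, unsigned int optlen)
{
	int val;

	if (optlen < sizeof(int))
		return -1;			/* -EINVAL in the kernel */
	if (copy_from_sockptr(&val, optval, sizeof(val)))
		return -2;			/* -EFAULT */
	printf("option value %d (%s pointer)\n", val,
	       optval.is_kernel ? "kernel" : "user");
	return 0;
}

int main(void)
{
	int v = 42;
	sockptr_t p = { .ptr = &v, .is_kernel = true };

	return demo_setsockopt(p, sizeof(v));
}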
int pep_setsockopt(struct sock *sk, int level, int optname, if (level != SOL_PNPIPE) return -ENOPROTOOPT; if (optlen >= sizeof(int)) { - if (get_user(val, (int __user *) optval)) + if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; } diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 76d499f6af9a..2599235d592e 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -439,12 +439,6 @@ const struct proto_ops phonet_dgram_ops = { .ioctl = pn_socket_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, -#ifdef CONFIG_COMPAT - .compat_setsockopt = sock_no_setsockopt, - .compat_getsockopt = sock_no_getsockopt, -#endif .sendmsg = pn_socket_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, @@ -466,10 +460,6 @@ const struct proto_ops phonet_stream_ops = { .shutdown = sock_no_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, -#endif .sendmsg = pn_socket_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 300a104b9a0f..b4c0db0b7d31 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -1209,8 +1209,6 @@ static const struct proto_ops qrtr_proto_ops = { .gettstamp = sock_gettstamp, .poll = datagram_poll, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .release = qrtr_release, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 1a5bf3fa4578..b239120dd9ca 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -290,8 +290,7 @@ static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return 0; } -static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, - int len) +static int rds_cancel_sent_to(struct rds_sock *rs, sockptr_t optval, int len) { struct sockaddr_in6 sin6; struct sockaddr_in sin; @@ -308,14 +307,15 @@ static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, goto out; } else if (len < sizeof(struct sockaddr_in6)) { /* Assume IPv4 */ - if (copy_from_user(&sin, optval, sizeof(struct sockaddr_in))) { + if (copy_from_sockptr(&sin, optval, + sizeof(struct sockaddr_in))) { ret = -EFAULT; goto out; } ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr); sin6.sin6_port = sin.sin_port; } else { - if (copy_from_user(&sin6, optval, + if (copy_from_sockptr(&sin6, optval, sizeof(struct sockaddr_in6))) { ret = -EFAULT; goto out; @@ -327,21 +327,20 @@ out: return ret; } -static int rds_set_bool_option(unsigned char *optvar, char __user *optval, +static int rds_set_bool_option(unsigned char *optvar, sockptr_t optval, int optlen) { int value; if (optlen < sizeof(int)) return -EINVAL; - if (get_user(value, (int __user *) optval)) + if (copy_from_sockptr(&value, optval, sizeof(int))) return -EFAULT; *optvar = !!value; return 0; } -static int rds_cong_monitor(struct rds_sock *rs, char __user *optval, - int optlen) +static int rds_cong_monitor(struct rds_sock *rs, sockptr_t optval, int optlen) { int ret; @@ -358,8 +357,7 @@ static int rds_cong_monitor(struct rds_sock *rs, char __user *optval, return ret; } -static int rds_set_transport(struct rds_sock *rs, char __user *optval, - int optlen) +static int rds_set_transport(struct rds_sock *rs, sockptr_t optval, int optlen) { int t_type; @@ -369,7 +367,7 
@@ static int rds_set_transport(struct rds_sock *rs, char __user *optval, if (optlen != sizeof(int)) return -EINVAL; - if (copy_from_user(&t_type, (int __user *)optval, sizeof(t_type))) + if (copy_from_sockptr(&t_type, optval, sizeof(t_type))) return -EFAULT; if (t_type < 0 || t_type >= RDS_TRANS_COUNT) @@ -380,7 +378,7 @@ static int rds_set_transport(struct rds_sock *rs, char __user *optval, return rs->rs_transport ? 0 : -ENOPROTOOPT; } -static int rds_enable_recvtstamp(struct sock *sk, char __user *optval, +static int rds_enable_recvtstamp(struct sock *sk, sockptr_t optval, int optlen, int optname) { int val, valbool; @@ -388,7 +386,7 @@ static int rds_enable_recvtstamp(struct sock *sk, char __user *optval, if (optlen != sizeof(int)) return -EFAULT; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; valbool = val ? 1 : 0; @@ -404,7 +402,7 @@ static int rds_enable_recvtstamp(struct sock *sk, char __user *optval, return 0; } -static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval, +static int rds_recv_track_latency(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_rx_trace_so trace; @@ -413,7 +411,7 @@ static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval, if (optlen != sizeof(struct rds_rx_trace_so)) return -EFAULT; - if (copy_from_user(&trace, optval, sizeof(trace))) + if (copy_from_sockptr(&trace, optval, sizeof(trace))) return -EFAULT; if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX) @@ -432,7 +430,7 @@ static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval, } static int rds_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct rds_sock *rs = rds_sk_to_rs(sock->sk); int ret; diff --git a/net/rds/rdma.c b/net/rds/rdma.c index a7ae11846cd7..ccdd304eae0a 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -353,21 +353,20 @@ out: return ret; } -int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen) +int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_get_mr_args args; if (optlen != sizeof(struct rds_get_mr_args)) return -EINVAL; - if (copy_from_user(&args, (struct rds_get_mr_args __user *)optval, - sizeof(struct rds_get_mr_args))) + if (copy_from_sockptr(&args, optval, sizeof(struct rds_get_mr_args))) return -EFAULT; return __rds_rdma_map(rs, &args, NULL, NULL, NULL); } -int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen) +int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_get_mr_for_dest_args args; struct rds_get_mr_args new_args; @@ -375,7 +374,7 @@ int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen) if (optlen != sizeof(struct rds_get_mr_for_dest_args)) return -EINVAL; - if (copy_from_user(&args, (struct rds_get_mr_for_dest_args __user *)optval, + if (copy_from_sockptr(&args, optval, sizeof(struct rds_get_mr_for_dest_args))) return -EFAULT; @@ -394,7 +393,7 @@ int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen) /* * Free the MR indicated by the given R_Key */ -int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen) +int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_free_mr_args args; struct rds_mr *mr; @@ -403,8 +402,7 @@ int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen) if (optlen != sizeof(struct rds_free_mr_args)) return -EINVAL; - if 
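rds_cancel_sent_to() above treats any buffer shorter than a sockaddr_in6 as IPv4 and widens it with ipv6_addr_set_v4mapped(), which simply places the 32-bit address behind the ::ffff:0:0/96 prefix. A userspace analogue, where set_v4mapped() is a hand-rolled stand-in rather than the kernel helper:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Embed an IPv4 address (network byte order) in the v4-mapped range. */
static void set_v4mapped(uint32_t v4_be, struct in6_addr *v6)
{
	memset(v6, 0, sizeof(*v6));
	v6->s6_addr[10] = 0xff;
	v6->s6_addr[11] = 0xff;
	memcpy(&v6->s6_addr[12], &v4_be, 4);
}

int main(void)
{
	struct in_addr v4;
	struct in6_addr v6;
	char buf[INET6_ADDRSTRLEN];

	inet_pton(AF_INET, "192.0.2.1", &v4);
	set_v4mapped(v4.s_addr, &v6);
	inet_ntop(AF_INET6, &v6, buf, sizeof(buf));
	printf("%s\n", buf);		/* ::ffff:192.0.2.1 */
	return 0;
}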
(copy_from_user(&args, (struct rds_free_mr_args __user *)optval, - sizeof(struct rds_free_mr_args))) + if (copy_from_sockptr(&args, optval, sizeof(struct rds_free_mr_args))) return -EFAULT; /* Special case - a null cookie means flush all unused MRs */ diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h index bfafd4a6d827..ca4c3a667091 100644 --- a/net/rds/rdma_transport.h +++ b/net/rds/rdma_transport.h @@ -13,7 +13,7 @@ /* Below reject reason is for legacy interoperability issue with non-linux * RDS endpoints where older version incompatibility is conveyed via value 1. - * For future version(s), proper encoded reject reason should be be used. + * For future version(s), proper encoded reject reason should be used. */ #define RDS_RDMA_REJ_INCOMPAT 1 diff --git a/net/rds/rds.h b/net/rds/rds.h index 106e862996b9..d35d1fc39807 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -924,9 +924,9 @@ int rds_send_pong(struct rds_conn_path *cp, __be16 dport); /* rdma.c */ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); -int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen); -int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen); -int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen); +int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen); +int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int optlen); +int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen); void rds_rdma_drop_keys(struct rds_sock *rs); int rds_rdma_extra_size(struct rds_rdma_args *args, struct rds_iov_vector *iov); diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index ce85656ac9c1..cf7d974e0f61 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -365,7 +365,7 @@ void rose_destroy_socket(struct sock *sk) */ static int rose_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); @@ -377,7 +377,7 @@ static int rose_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(opt, (int __user *)optval)) + if (copy_from_sockptr(&opt, optval, sizeof(int))) return -EFAULT; switch (optname) { diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 394189b81849..e6725a6de015 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -267,7 +267,7 @@ static int rxrpc_listen(struct socket *sock, int backlog) * @gfp: The allocation constraints * @notify_rx: Where to send notifications instead of socket queue * @upgrade: Request service upgrade for call - * @intr: The call is interruptible + * @interruptibility: The call is interruptible, or can be canceled. * @debug_id: The debug ID for tracing to be assigned to the call * * Allow a kernel service to begin a call on the nominated socket. 
This just @@ -588,7 +588,7 @@ EXPORT_SYMBOL(rxrpc_sock_set_min_security_level); * set RxRPC socket options */ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); unsigned int min_sec_level; @@ -639,8 +639,8 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, ret = -EISCONN; if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; - ret = get_user(min_sec_level, - (unsigned int __user *) optval); + ret = copy_from_sockptr(&min_sec_level, optval, + sizeof(unsigned int)); if (ret < 0) goto error; ret = -EINVAL; @@ -658,7 +658,7 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, if (rx->sk.sk_state != RXRPC_SERVER_BOUND2) goto error; ret = -EFAULT; - if (copy_from_user(service_upgrade, optval, + if (copy_from_sockptr(service_upgrade, optval, sizeof(service_upgrade)) != 0) goto error; ret = -EINVAL; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 9a2139ebd67d..6d29a3603a3e 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -909,8 +909,8 @@ extern const struct rxrpc_security rxrpc_no_security; extern struct key_type key_type_rxrpc; extern struct key_type key_type_rxrpc_s; -int rxrpc_request_key(struct rxrpc_sock *, char __user *, int); -int rxrpc_server_keyring(struct rxrpc_sock *, char __user *, int); +int rxrpc_request_key(struct rxrpc_sock *, sockptr_t , int); +int rxrpc_server_keyring(struct rxrpc_sock *, sockptr_t, int); int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time64_t, u32); diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 0c98313dd7a8..94c3df392651 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -896,7 +896,7 @@ static void rxrpc_describe(const struct key *key, struct seq_file *m) /* * grab the security key for a socket */ -int rxrpc_request_key(struct rxrpc_sock *rx, char __user *optval, int optlen) +int rxrpc_request_key(struct rxrpc_sock *rx, sockptr_t optval, int optlen) { struct key *key; char *description; @@ -906,7 +906,7 @@ int rxrpc_request_key(struct rxrpc_sock *rx, char __user *optval, int optlen) if (optlen <= 0 || optlen > PAGE_SIZE - 1) return -EINVAL; - description = memdup_user_nul(optval, optlen); + description = memdup_sockptr_nul(optval, optlen); if (IS_ERR(description)) return PTR_ERR(description); @@ -926,8 +926,7 @@ int rxrpc_request_key(struct rxrpc_sock *rx, char __user *optval, int optlen) /* * grab the security keyring for a server socket */ -int rxrpc_server_keyring(struct rxrpc_sock *rx, char __user *optval, - int optlen) +int rxrpc_server_keyring(struct rxrpc_sock *rx, sockptr_t optval, int optlen) { struct key *key; char *description; @@ -937,7 +936,7 @@ int rxrpc_server_keyring(struct rxrpc_sock *rx, char __user *optval, if (optlen <= 0 || optlen > PAGE_SIZE - 1) return -EINVAL; - description = memdup_user_nul(optval, optlen); + description = memdup_sockptr_nul(optval, optlen); if (IS_ERR(description)) return PTR_ERR(description); diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 84badf00647e..a3b37d88800e 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -468,6 +468,9 @@ choice config DEFAULT_FQ_CODEL bool "Fair Queue Controlled Delay" if NET_SCH_FQ_CODEL + config DEFAULT_FQ_PIE + bool "Flow Queue Proportional Integral controller Enhanced" if NET_SCH_FQ_PIE + config DEFAULT_SFQ bool "Stochastic Fair Queue" if NET_SCH_SFQ @@ -480,6 +483,7 @@ config DEFAULT_NET_SCH default 
"pfifo_fast" if DEFAULT_PFIFO_FAST default "fq" if DEFAULT_FQ default "fq_codel" if DEFAULT_FQ_CODEL + default "fq_pie" if DEFAULT_FQ_PIE default "sfq" if DEFAULT_SFQ default "pfifo_fast" endif diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 8ac7eb0a8309..063d8aaf2900 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1059,14 +1059,13 @@ err: return err; } -void tcf_action_update_stats(struct tc_action *a, u64 bytes, u32 packets, - bool drop, bool hw) +void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets, + u64 drops, bool hw) { if (a->cpu_bstats) { _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); - if (drop) - this_cpu_ptr(a->cpu_qstats)->drops += packets; + this_cpu_ptr(a->cpu_qstats)->drops += drops; if (hw) _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), @@ -1075,8 +1074,7 @@ void tcf_action_update_stats(struct tc_action *a, u64 bytes, u32 packets, } _bstats_update(&a->tcfa_bstats, bytes, packets); - if (drop) - a->tcfa_qstats.drops += packets; + a->tcfa_qstats.drops += drops; if (hw) _bstats_update(&a->tcfa_bstats_hw, bytes, packets); } @@ -1475,7 +1473,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_ROOT_MAX + 1]; - u32 portid = skb ? NETLINK_CB(skb).portid : 0; + u32 portid = NETLINK_CB(skb).portid; int ret = 0, ovr = 0; if ((n->nlmsg_type != RTM_GETACTION) && diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index c60674cf25c4..f5826e457679 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -598,7 +598,8 @@ again: if (!tcf_csum_ipv6(skb, update_flags)) goto drop; break; - case cpu_to_be16(ETH_P_8021AD): /* fall through */ + case cpu_to_be16(ETH_P_8021AD): + fallthrough; case cpu_to_be16(ETH_P_8021Q): if (skb_vlan_tag_present(skb) && !orig_vlan_tag_present) { protocol = skb->protocol; diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 6ed1652d1e26..e6ad42b11835 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -706,8 +706,10 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb, if (err && err != -EINPROGRESS) goto out_free; - if (!err) + if (!err) { *defrag = true; + cb.mru = IPCB(skb)->frag_max_size; + } } else { /* NFPROTO_IPV6 */ #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; @@ -717,8 +719,10 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb, if (err && err != -EINPROGRESS) goto out_free; - if (!err) + if (!err) { *defrag = true; + cb.mru = IP6CB(skb)->frag_max_size; + } #else err = -EOPNOTSUPP; goto out_free; @@ -792,7 +796,7 @@ static int ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, } } /* Non-ICMP, fall thru to initialize if needed. */ - /* fall through */ + fallthrough; case IP_CT_NEW: /* Seen it before? This can happen for loopback, retrans, * or local packets. 
@@ -1464,12 +1468,12 @@ static int tcf_ct_search(struct net *net, struct tc_action **a, u32 index) return tcf_idr_search(tn, a, index); } -static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets, - u64 lastuse, bool hw) +static void tcf_stats_update(struct tc_action *a, u64 bytes, u64 packets, + u64 drops, u64 lastuse, bool hw) { struct tcf_ct *c = to_ct(a); - tcf_action_update_stats(a, bytes, packets, false, hw); + tcf_action_update_stats(a, bytes, packets, drops, hw); c->tcf_tm.lastuse = max_t(u64, c->tcf_tm.lastuse, lastuse); } diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 416065772719..410e3bbfb9ca 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -171,14 +171,15 @@ static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a, return action; } -static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets, - u64 lastuse, bool hw) +static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u64 packets, + u64 drops, u64 lastuse, bool hw) { struct tcf_gact *gact = to_gact(a); int action = READ_ONCE(gact->tcf_action); struct tcf_t *tm = &gact->tcf_tm; - tcf_action_update_stats(a, bytes, packets, action == TC_ACT_SHOT, hw); + tcf_action_update_stats(a, bytes, packets, + action == TC_ACT_SHOT ? packets : drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index 323ae7f6315d..1fb8d428d2c1 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -578,13 +578,13 @@ static int tcf_gate_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static void tcf_gate_stats_update(struct tc_action *a, u64 bytes, u32 packets, - u64 lastuse, bool hw) +static void tcf_gate_stats_update(struct tc_action *a, u64 bytes, u64 packets, + u64 drops, u64 lastuse, bool hw) { struct tcf_gate *gact = to_gate(a); struct tcf_t *tm = &gact->tcf_tm; - tcf_action_update_stats(a, bytes, packets, false, hw); + tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 83dd82fc9f40..b2705318993b 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -312,13 +312,13 @@ out: return retval; } -static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets, - u64 lastuse, bool hw) +static void tcf_stats_update(struct tc_action *a, u64 bytes, u64 packets, + u64 drops, u64 lastuse, bool hw) { struct tcf_mirred *m = to_mirred(a); struct tcf_t *tm = &m->tcf_tm; - tcf_action_update_stats(a, bytes, packets, false, hw); + tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index d41d6200d9de..c158bfed86d5 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -409,13 +409,13 @@ done: return p->tcf_action; } -static void tcf_pedit_stats_update(struct tc_action *a, u64 bytes, u32 packets, - u64 lastuse, bool hw) +static void tcf_pedit_stats_update(struct tc_action *a, u64 bytes, u64 packets, + u64 drops, u64 lastuse, bool hw) { struct tcf_pedit *d = to_pedit(a); struct tcf_t *tm = &d->tcf_tm; - tcf_action_update_stats(a, bytes, packets, false, hw); + tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } @@ -436,8 +436,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, return -ENOBUFS; 
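The tcf_*_stats_update() hunks here widen the packet count to u64 and replace the old "bool drop" flag with an explicit u64 drops argument; gact alone preserves its previous behaviour by reporting every packet as dropped when the configured action is TC_ACT_SHOT. A small sketch of the resulting accounting (the struct and names are illustrative):

#include <stdint.h>
#include <stdio.h>

#define TC_ACT_SHOT 2

struct stats {
	uint64_t bytes;
	uint64_t packets;
	uint64_t drops;
};

/* Shape of tcf_action_update_stats() after this series: drops is a
 * count supplied by the caller, no longer derived from a bool flag. */
static void update_stats(struct stats *s, uint64_t bytes, uint64_t packets,
			 uint64_t drops)
{
	s->bytes += bytes;
	s->packets += packets;
	s->drops += drops;
}

int main(void)
{
	struct stats s = { 0 };
	int action = TC_ACT_SHOT;
	uint64_t hw_drops = 0;

	/* The gact hunk above: a SHOT action counts every packet as
	 * dropped, matching the old bool-based behaviour. */
	update_stats(&s, 1500, 10, action == TC_ACT_SHOT ? 10 : hw_drops);
	printf("packets %llu drops %llu\n",
	       (unsigned long long)s.packets, (unsigned long long)s.drops);
	return 0;
}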
spin_lock_bh(&p->tcf_lock); - memcpy(opt->keys, p->tcfp_keys, - p->tcfp_nkeys * sizeof(struct tc_pedit_key)); + memcpy(opt->keys, p->tcfp_keys, flex_array_size(opt, keys, p->tcfp_nkeys)); opt->index = p->tcf_index; opt->nkeys = p->tcfp_nkeys; opt->flags = p->tcfp_flags; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 8b7a0ac96c51..0b431d493768 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -288,13 +288,13 @@ static void tcf_police_cleanup(struct tc_action *a) } static void tcf_police_stats_update(struct tc_action *a, - u64 bytes, u32 packets, + u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_police *police = to_police(a); struct tcf_t *tm = &police->tcf_tm; - tcf_action_update_stats(a, bytes, packets, false, hw); + tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index b2b3faa57294..d0652386c6e2 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -74,12 +74,13 @@ err: } static void tcf_skbedit_stats_update(struct tc_action *a, u64 bytes, - u32 packets, u64 lastuse, bool hw) + u64 packets, u64 drops, + u64 lastuse, bool hw) { struct tcf_skbedit *d = to_skbedit(a); struct tcf_t *tm = &d->tcf_tm; - tcf_action_update_stats(a, bytes, packets, false, hw); + tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index c91d3958fcbb..a5ff9f68ab02 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -302,13 +302,13 @@ static int tcf_vlan_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops, extack); } -static void tcf_vlan_stats_update(struct tc_action *a, u64 bytes, u32 packets, - u64 lastuse, bool hw) +static void tcf_vlan_stats_update(struct tc_action *a, u64 bytes, u64 packets, + u64 drops, u64 lastuse, bool hw) { struct tcf_vlan *v = to_vlan(a); struct tcf_t *tm = &v->tcf_tm; - tcf_action_update_stats(a, bytes, packets, false, hw); + tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 4619cb3cb0a8..41a55c6cbeb8 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -621,7 +621,7 @@ static int tcf_block_setup(struct tcf_block *block, struct flow_block_offload *bo); static void tcf_block_offload_init(struct flow_block_offload *bo, - struct net_device *dev, + struct net_device *dev, struct Qdisc *sch, enum flow_block_command command, enum flow_block_binder_type binder_type, struct flow_block *flow_block, @@ -633,6 +633,7 @@ static void tcf_block_offload_init(struct flow_block_offload *bo, bo->block = flow_block; bo->block_shared = shared; bo->extack = extack; + bo->sch = sch; INIT_LIST_HEAD(&bo->cb_list); } @@ -643,10 +644,11 @@ static void tc_block_indr_cleanup(struct flow_block_cb *block_cb) { struct tcf_block *block = block_cb->indr.data; struct net_device *dev = block_cb->indr.dev; + struct Qdisc *sch = block_cb->indr.sch; struct netlink_ext_ack extack = {}; struct flow_block_offload bo; - tcf_block_offload_init(&bo, dev, FLOW_BLOCK_UNBIND, + tcf_block_offload_init(&bo, dev, sch, FLOW_BLOCK_UNBIND, block_cb->indr.binder_type, &block->flow_block, tcf_block_shared(block), &extack); @@ -665,14 +667,14 @@ static bool tcf_block_offload_in_use(struct tcf_block *block) } static int tcf_block_offload_cmd(struct tcf_block *block, - 
struct net_device *dev, + struct net_device *dev, struct Qdisc *sch, struct tcf_block_ext_info *ei, enum flow_block_command command, struct netlink_ext_ack *extack) { struct flow_block_offload bo = {}; - tcf_block_offload_init(&bo, dev, command, ei->binder_type, + tcf_block_offload_init(&bo, dev, sch, command, ei->binder_type, &block->flow_block, tcf_block_shared(block), extack); @@ -689,7 +691,7 @@ static int tcf_block_offload_cmd(struct tcf_block *block, return tcf_block_setup(block, &bo); } - flow_indr_dev_setup_offload(dev, TC_SETUP_BLOCK, block, &bo, + flow_indr_dev_setup_offload(dev, sch, TC_SETUP_BLOCK, block, &bo, tc_block_indr_cleanup); tcf_block_setup(block, &bo); @@ -716,7 +718,7 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, goto err_unlock; } - err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_BIND, extack); + err = tcf_block_offload_cmd(block, dev, q, ei, FLOW_BLOCK_BIND, extack); if (err == -EOPNOTSUPP) goto no_offload_dev_inc; if (err) @@ -743,7 +745,7 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, int err; down_write(&block->cb_lock); - err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_UNBIND, NULL); + err = tcf_block_offload_cmd(block, dev, q, ei, FLOW_BLOCK_UNBIND, NULL); if (err == -EOPNOTSUPP) goto no_offload_dev_dec; up_write(&block->cb_lock); @@ -1627,6 +1629,7 @@ int tcf_classify_ingress(struct sk_buff *skb, if (WARN_ON_ONCE(!ext)) return TC_ACT_SHOT; ext->chain = last_executed_chain; + ext->mru = qdisc_skb_cb(skb)->mru; } return ret; @@ -3659,9 +3662,11 @@ int tc_setup_flow_action(struct flow_action *flow_action, tcf_sample_get_group(entry, act); } else if (is_tcf_police(act)) { entry->id = FLOW_ACTION_POLICE; - entry->police.burst = tcf_police_tcfp_burst(act); + entry->police.burst = tcf_police_burst(act); entry->police.rate_bytes_ps = tcf_police_rate_bytes_ps(act); + entry->police.mtu = tcf_police_tcfp_mtu(act); + entry->police.index = act->tcfa_index; } else if (is_tcf_ct(act)) { entry->id = FLOW_ACTION_CT; entry->ct.action = tcf_ct_action(act); @@ -3745,6 +3750,119 @@ unsigned int tcf_exts_num_actions(struct tcf_exts *exts) } EXPORT_SYMBOL(tcf_exts_num_actions); +#ifdef CONFIG_NET_CLS_ACT +static int tcf_qevent_parse_block_index(struct nlattr *block_index_attr, + u32 *p_block_index, + struct netlink_ext_ack *extack) +{ + *p_block_index = nla_get_u32(block_index_attr); + if (!*p_block_index) { + NL_SET_ERR_MSG(extack, "Block number may not be zero"); + return -EINVAL; + } + + return 0; +} + +int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch, + enum flow_block_binder_type binder_type, + struct nlattr *block_index_attr, + struct netlink_ext_ack *extack) +{ + u32 block_index; + int err; + + if (!block_index_attr) + return 0; + + err = tcf_qevent_parse_block_index(block_index_attr, &block_index, extack); + if (err) + return err; + + if (!block_index) + return 0; + + qe->info.binder_type = binder_type; + qe->info.chain_head_change = tcf_chain_head_change_dflt; + qe->info.chain_head_change_priv = &qe->filter_chain; + qe->info.block_index = block_index; + + return tcf_block_get_ext(&qe->block, sch, &qe->info, extack); +} +EXPORT_SYMBOL(tcf_qevent_init); + +void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch) +{ + if (qe->info.block_index) + tcf_block_put_ext(qe->block, sch, &qe->info); +} +EXPORT_SYMBOL(tcf_qevent_destroy); + +int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr, + struct netlink_ext_ack *extack) +{ + u32 block_index; + 
int err; + + if (!block_index_attr) + return 0; + + err = tcf_qevent_parse_block_index(block_index_attr, &block_index, extack); + if (err) + return err; + + /* Bounce newly-configured block or change in block. */ + if (block_index != qe->info.block_index) { + NL_SET_ERR_MSG(extack, "Change of blocks is not supported"); + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL(tcf_qevent_validate_change); + +struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb, + struct sk_buff **to_free, int *ret) +{ + struct tcf_result cl_res; + struct tcf_proto *fl; + + if (!qe->info.block_index) + return skb; + + fl = rcu_dereference_bh(qe->filter_chain); + + switch (tcf_classify(skb, fl, &cl_res, false)) { + case TC_ACT_SHOT: + qdisc_qstats_drop(sch); + __qdisc_drop(skb, to_free); + *ret = __NET_XMIT_BYPASS; + return NULL; + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + __qdisc_drop(skb, to_free); + *ret = __NET_XMIT_STOLEN; + return NULL; + case TC_ACT_REDIRECT: + skb_do_redirect(skb); + *ret = __NET_XMIT_STOLEN; + return NULL; + } + + return skb; +} +EXPORT_SYMBOL(tcf_qevent_handle); + +int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe) +{ + if (!qe->info.block_index) + return 0; + return nla_put_u32(skb, attr_name, qe->info.block_index); +} +EXPORT_SYMBOL(tcf_qevent_dump); +#endif + static __net_init int tcf_net_init(struct net *net) { struct tcf_net *tn = net_generic(net, tcf_net_id); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index e30bd969fc48..a4f7ef1de7e7 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -64,6 +64,7 @@ struct fl_flow_key { }; } tp_range; struct flow_dissector_key_ct ct; + struct flow_dissector_key_hash hash; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. 
*/ struct fl_flow_mask_range { @@ -318,6 +319,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, skb_flow_dissect_ct(skb, &mask->dissector, &skb_key, fl_ct_info_to_flower_map, ARRAY_SIZE(fl_ct_info_to_flower_map)); + skb_flow_dissect_hash(skb, &mask->dissector, &skb_key); skb_flow_dissect(skb, &mask->dissector, &skb_key, 0); f = fl_mask_lookup(mask, &skb_key); @@ -491,6 +493,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f, tcf_exts_stats_update(&f->exts, cls_flower.stats.bytes, cls_flower.stats.pkts, + cls_flower.stats.drops, cls_flower.stats.lastused, cls_flower.stats.used_hw_stats, cls_flower.stats.used_hw_stats_valid); @@ -694,6 +697,9 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_CT_LABELS_MASK] = { .type = NLA_BINARY, .len = 128 / BITS_PER_BYTE }, [TCA_FLOWER_FLAGS] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_HASH] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_HASH_MASK] = { .type = NLA_U32 }, + }; static const struct nla_policy @@ -1625,6 +1631,10 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_ip(tb, true, &key->enc_ip, &mask->enc_ip); + fl_set_key_val(tb, &key->hash.hash, TCA_FLOWER_KEY_HASH, + &mask->hash.hash, TCA_FLOWER_KEY_HASH_MASK, + sizeof(key->hash.hash)); + if (tb[TCA_FLOWER_KEY_ENC_OPTS]) { ret = fl_set_enc_opt(tb, key, mask, extack); if (ret) @@ -1739,6 +1749,8 @@ static void fl_init_dissector(struct flow_dissector *dissector, FLOW_DISSECTOR_KEY_ENC_OPTS, enc_opts); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_CT, ct); + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_HASH, hash); skb_flow_dissector_init(dissector, keys, cnt); } @@ -2959,6 +2971,11 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags)) goto nla_put_failure; + if (fl_dump_key_val(skb, &key->hash.hash, TCA_FLOWER_KEY_HASH, + &mask->hash.hash, TCA_FLOWER_KEY_HASH_MASK, + sizeof(key->hash.hash))) + goto nla_put_failure; + return 0; nla_put_failure: diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 8d39dbcf1746..cafb84480bab 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -338,7 +338,8 @@ static void mall_stats_hw_filter(struct tcf_proto *tp, tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false, true); tcf_exts_stats_update(&head->exts, cls_mall.stats.bytes, - cls_mall.stats.pkts, cls_mall.stats.lastused, + cls_mall.stats.pkts, cls_mall.stats.drops, + cls_mall.stats.lastused, cls_mall.stats.used_hw_stats, cls_mall.stats.used_hw_stats_valid); } diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 61e95029c18f..78bec347b8b6 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -533,7 +533,7 @@ tcindex_change(struct net *net, struct sk_buff *in_skb, pr_debug("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p," "p %p,r %p,*arg %p\n", - tp, handle, tca, arg, opt, p, r, arg ? 
*arg : NULL); + tp, handle, tca, arg, opt, p, r, *arg); if (!opt) return 0; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index e15ff335953d..7b69ab1993ba 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -796,9 +796,7 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, struct tc_u32_sel *s = &n->sel; struct tc_u_knode *new; - new = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), - GFP_KERNEL); - + new = kzalloc(struct_size(new, sel.keys, s->nkeys), GFP_KERNEL); if (!new) return NULL; @@ -854,9 +852,6 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, u32 htid, flags = 0; size_t sel_size; int err; -#ifdef CONFIG_CLS_U32_PERF - size_t size; -#endif if (!opt) { if (handle) { @@ -1024,15 +1019,15 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, goto erridr; } - n = kzalloc(offsetof(typeof(*n), sel) + sel_size, GFP_KERNEL); + n = kzalloc(struct_size(n, sel.keys, s->nkeys), GFP_KERNEL); if (n == NULL) { err = -ENOBUFS; goto erridr; } #ifdef CONFIG_CLS_U32_PERF - size = sizeof(struct tc_u32_pcnt) + s->nkeys * sizeof(u64); - n->pf = __alloc_percpu(size, __alignof__(struct tc_u32_pcnt)); + n->pf = __alloc_percpu(struct_size(n->pf, kcnts, s->nkeys), + __alignof__(struct tc_u32_pcnt)); if (!n->pf) { err = -ENOBUFS; goto errfree; @@ -1296,8 +1291,7 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh, int cpu; #endif - if (nla_put(skb, TCA_U32_SEL, - sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key), + if (nla_put(skb, TCA_U32_SEL, struct_size(&n->sel, keys, n->sel.nkeys), &n->sel)) goto nla_put_failure; @@ -1347,9 +1341,7 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh, goto nla_put_failure; } #ifdef CONFIG_CLS_U32_PERF - gpf = kzalloc(sizeof(struct tc_u32_pcnt) + - n->sel.nkeys * sizeof(u64), - GFP_KERNEL); + gpf = kzalloc(struct_size(gpf, kcnts, n->sel.nkeys), GFP_KERNEL); if (!gpf) goto nla_put_failure; @@ -1363,9 +1355,7 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh, gpf->kcnts[i] += pf->kcnts[i]; } - if (nla_put_64bit(skb, TCA_U32_PCNT, - sizeof(struct tc_u32_pcnt) + - n->sel.nkeys * sizeof(u64), + if (nla_put_64bit(skb, TCA_U32_PCNT, struct_size(gpf, kcnts, n->sel.nkeys), gpf, TCA_U32_PAD)) { kfree(gpf); goto nla_put_failure; diff --git a/net/sched/em_canid.c b/net/sched/em_canid.c index b9a94fdf9397..5ea84decec19 100644 --- a/net/sched/em_canid.c +++ b/net/sched/em_canid.c @@ -40,6 +40,7 @@ struct canid_match { /** * em_canid_get_id() - Extracts Can ID out of the sk_buff structure. + * @skb: buffer to extract Can ID from */ static canid_t em_canid_get_id(struct sk_buff *skb) { diff --git a/net/sched/ematch.c b/net/sched/ematch.c index dd3b8c11a2e0..f885bea5b452 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -389,7 +389,6 @@ EXPORT_SYMBOL(tcf_em_tree_validate); /** * tcf_em_tree_destroy - destroy an ematch tree * - * @tp: classifier kind handle * @tree: ematch tree to be deleted * * This functions destroys an ematch tree previously created by @@ -425,7 +424,7 @@ EXPORT_SYMBOL(tcf_em_tree_destroy); * tcf_em_tree_dump - dump ematch tree into a rtnl message * * @skb: skb holding the rtnl message - * @t: ematch tree to be dumped + * @tree: ematch tree to be dumped * @tlv: TLV type to be used to encapsulate the tree * * This function dumps a ematch tree into a rtnl message. 
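The cls_u32 hunks above (like the act_pedit and taprio ones in this series) swap open-coded "sizeof(*n) + nkeys * sizeof(elem)" arithmetic for struct_size() and flex_array_size(), which compute the same value but saturate to SIZE_MAX when the multiplication would overflow, turning a potential heap overflow into a failed allocation. A simplified sketch of the flexible-array sizing; struct_size_demo() deliberately omits the kernel version's overflow saturation:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct tc_u32_key_demo { uint32_t mask, val; };

struct knode_demo {
	int refcnt;
	unsigned char nkeys;
	struct tc_u32_key_demo keys[];	/* flexible array member */
};

/* Simplified struct_size(): header plus count trailing elements. */
#define struct_size_demo(p, member, count) \
	(sizeof(*(p)) + sizeof((p)->member[0]) * (count))

int main(void)
{
	unsigned char nkeys = 4;
	struct knode_demo *n;

	n = calloc(1, struct_size_demo(n, keys, nkeys));
	if (!n)
		return 1;
	n->nkeys = nkeys;
	printf("allocated %zu bytes for %u keys\n",
	       struct_size_demo(n, keys, nkeys), n->nkeys);
	free(n);
	return 0;
}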
It is valid to diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 9a3449b56bd6..2a76a2f5ed88 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -267,7 +267,8 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) root->handle == handle) return root; - hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) { + hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle, + lockdep_rtnl_is_held()) { if (q->handle == handle) return q; } @@ -1093,8 +1094,7 @@ skip: int err; /* Only support running class lockless if parent is lockless */ - if (new && (new->flags & TCQ_F_NOLOCK) && - parent && !(parent->flags & TCQ_F_NOLOCK)) + if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK)) qdisc_clear_nolock(new); if (!cops || !cops->graft) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index c7b587897585..0618b63f87c4 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -312,8 +312,8 @@ static const u8 precedence[] = { }; static const u8 diffserv8[] = { - 2, 5, 1, 2, 4, 2, 2, 2, - 0, 2, 1, 2, 1, 2, 1, 2, + 2, 0, 1, 2, 4, 2, 2, 2, + 1, 2, 1, 2, 1, 2, 1, 2, 5, 2, 4, 2, 4, 2, 4, 2, 3, 2, 3, 2, 3, 2, 3, 2, 6, 2, 3, 2, 3, 2, 3, 2, @@ -323,7 +323,7 @@ static const u8 diffserv8[] = { }; static const u8 diffserv4[] = { - 0, 2, 0, 0, 2, 0, 0, 0, + 0, 1, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, @@ -334,7 +334,7 @@ static const u8 diffserv4[] = { }; static const u8 diffserv3[] = { - 0, 0, 0, 0, 2, 0, 0, 0, + 0, 1, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index ce4519358106..53d45e029c36 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -250,7 +250,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) case TC_ACT_STOLEN: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ + fallthrough; case TC_ACT_SHOT: return NULL; case TC_ACT_RECLASSIFY: diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 07a2b0b35495..dde564670ad8 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -324,7 +324,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, case TC_ACT_STOLEN: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ + fallthrough; case TC_ACT_SHOT: return NULL; } diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index a87e9159338c..c1e84d1eeaba 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -397,7 +397,7 @@ static struct ets_class *ets_classify(struct sk_buff *skb, struct Qdisc *sch, case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ + fallthrough; case TC_ACT_SHOT: return NULL; } diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 985d5208f563..bbd5f8753600 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -99,7 +99,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ + fallthrough; case TC_ACT_SHOT: return 0; } diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index 4d307f17e084..4dda15588cf4 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -102,7 +102,7 @@ static unsigned int fq_pie_classify(struct sk_buff *skb, struct Qdisc *sch, case 
TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ + fallthrough; case TC_ACT_SHOT: return 0; } diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 92ad4115e473..d1902fca9844 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1136,7 +1136,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) case TC_ACT_STOLEN: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ + fallthrough; case TC_ACT_SHOT: return NULL; } diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 6feab225b4ba..cd70dbcbd72f 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -239,7 +239,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, case TC_ACT_STOLEN: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ + fallthrough; case TC_ACT_SHOT: return NULL; } diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 1330ad224931..5c27b4270b90 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -43,7 +43,7 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ + fallthrough; case TC_ACT_SHOT: return NULL; } diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 647941702f9f..3eabb871a1d5 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -46,7 +46,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ + fallthrough; case TC_ACT_SHOT: return NULL; } diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 0b05ac7c848e..6335230a971e 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -699,7 +699,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, case TC_ACT_STOLEN: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ + fallthrough; case TC_ACT_SHOT: return NULL; } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 555a1b9e467f..deac82f3ad7b 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -46,6 +46,8 @@ struct red_sched_data { struct red_vars vars; struct red_stats stats; struct Qdisc *qdisc; + struct tcf_qevent qe_early_drop; + struct tcf_qevent qe_mark; }; #define TC_RED_SUPPORTED_FLAGS (TC_RED_HISTORIC_FLAGS | TC_RED_NODROP) @@ -92,6 +94,9 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (INET_ECN_set_ce(skb)) { q->stats.prob_mark++; + skb = tcf_qevent_handle(&q->qe_mark, sch, skb, to_free, &ret); + if (!skb) + return NET_XMIT_CN | ret; } else if (!red_use_nodrop(q)) { q->stats.prob_drop++; goto congestion_drop; @@ -109,6 +114,9 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (INET_ECN_set_ce(skb)) { q->stats.forced_mark++; + skb = tcf_qevent_handle(&q->qe_mark, sch, skb, to_free, &ret); + if (!skb) + return NET_XMIT_CN | ret; } else if (!red_use_nodrop(q)) { q->stats.forced_drop++; goto congestion_drop; @@ -129,6 +137,10 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, return ret; congestion_drop: + skb = tcf_qevent_handle(&q->qe_early_drop, sch, skb, to_free, &ret); + if (!skb) + return NET_XMIT_CN | ret; + qdisc_drop(skb, sch, to_free); return NET_XMIT_CN; } @@ -202,6 +214,8 @@ static void red_destroy(struct Qdisc *sch) { struct red_sched_data *q = qdisc_priv(sch); + 
tcf_qevent_destroy(&q->qe_mark, sch); + tcf_qevent_destroy(&q->qe_early_drop, sch); del_timer_sync(&q->adapt_timer); red_offload(sch, false); qdisc_put(q->qdisc); @@ -213,14 +227,15 @@ static const struct nla_policy red_policy[TCA_RED_MAX + 1] = { [TCA_RED_STAB] = { .len = RED_STAB_SIZE }, [TCA_RED_MAX_P] = { .type = NLA_U32 }, [TCA_RED_FLAGS] = NLA_POLICY_BITFIELD32(TC_RED_SUPPORTED_FLAGS), + [TCA_RED_EARLY_DROP_BLOCK] = { .type = NLA_U32 }, + [TCA_RED_MARK_BLOCK] = { .type = NLA_U32 }, }; -static int red_change(struct Qdisc *sch, struct nlattr *opt, - struct netlink_ext_ack *extack) +static int __red_change(struct Qdisc *sch, struct nlattr **tb, + struct netlink_ext_ack *extack) { struct Qdisc *old_child = NULL, *child = NULL; struct red_sched_data *q = qdisc_priv(sch); - struct nlattr *tb[TCA_RED_MAX + 1]; struct nla_bitfield32 flags_bf; struct tc_red_qopt *ctl; unsigned char userbits; @@ -228,14 +243,6 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, int err; u32 max_P; - if (opt == NULL) - return -EINVAL; - - err = nla_parse_nested_deprecated(tb, TCA_RED_MAX, opt, red_policy, - NULL); - if (err < 0) - return err; - if (tb[TCA_RED_PARMS] == NULL || tb[TCA_RED_STAB] == NULL) return -EINVAL; @@ -323,11 +330,74 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct red_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_RED_MAX + 1]; + int err; q->qdisc = &noop_qdisc; q->sch = sch; timer_setup(&q->adapt_timer, red_adaptative_timer, 0); - return red_change(sch, opt, extack); + + if (!opt) + return -EINVAL; + + err = nla_parse_nested_deprecated(tb, TCA_RED_MAX, opt, red_policy, + extack); + if (err < 0) + return err; + + err = __red_change(sch, tb, extack); + if (err) + return err; + + err = tcf_qevent_init(&q->qe_early_drop, sch, + FLOW_BLOCK_BINDER_TYPE_RED_EARLY_DROP, + tb[TCA_RED_EARLY_DROP_BLOCK], extack); + if (err) + goto err_early_drop_init; + + err = tcf_qevent_init(&q->qe_mark, sch, + FLOW_BLOCK_BINDER_TYPE_RED_MARK, + tb[TCA_RED_MARK_BLOCK], extack); + if (err) + goto err_mark_init; + + return 0; + +err_mark_init: + tcf_qevent_destroy(&q->qe_early_drop, sch); +err_early_drop_init: + del_timer_sync(&q->adapt_timer); + red_offload(sch, false); + qdisc_put(q->qdisc); + return err; +} + +static int red_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct red_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_RED_MAX + 1]; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested_deprecated(tb, TCA_RED_MAX, opt, red_policy, + extack); + if (err < 0) + return err; + + err = tcf_qevent_validate_change(&q->qe_early_drop, + tb[TCA_RED_EARLY_DROP_BLOCK], extack); + if (err) + return err; + + err = tcf_qevent_validate_change(&q->qe_mark, + tb[TCA_RED_MARK_BLOCK], extack); + if (err) + return err; + + return __red_change(sch, tb, extack); } static int red_dump_offload_stats(struct Qdisc *sch) @@ -371,7 +441,9 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) || nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P) || nla_put_bitfield32(skb, TCA_RED_FLAGS, - q->flags, TC_RED_SUPPORTED_FLAGS)) + q->flags, TC_RED_SUPPORTED_FLAGS) || + tcf_qevent_dump(skb, TCA_RED_MARK_BLOCK, &q->qe_mark) || + tcf_qevent_dump(skb, TCA_RED_EARLY_DROP_BLOCK, &q->qe_early_drop)) goto nla_put_failure; return nla_nest_end(skb, opts); diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 4074c50ac3d7..da047a37a3bf 100644 --- 
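red_init() above wires up the two new qevent blocks after __red_change() and unwinds with the kernel's usual goto ladder: each error label undoes only the steps that already succeeded, in reverse order. A generic userspace sketch of that shape; malloc() stands in for the init steps, and the demo frees on success only so it does not leak, whereas the real code keeps the resources until red_destroy():

#include <stdio.h>
#include <stdlib.h>

static int init_three_parts(void)
{
	void *a, *b, *c;

	a = malloc(16);
	if (!a)
		goto err_a;
	b = malloc(16);
	if (!b)
		goto err_b;
	c = malloc(16);
	if (!c)
		goto err_c;

	free(c);			/* demo cleanup only */
	free(b);
	free(a);
	return 0;

err_c:
	free(b);
err_b:
	free(a);
err_a:
	return -1;
}

int main(void)
{
	printf("init: %d\n", init_three_parts());
	return 0;
}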
a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -265,7 +265,7 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl, case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ + fallthrough; case TC_ACT_SHOT: return false; } diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 15f400dcb400..bca2be57d9fc 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -186,7 +186,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ + fallthrough; case TC_ACT_SHOT: return 0; } diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index b1eb12d33b9a..e981992634dd 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1108,11 +1108,10 @@ static void setup_txtime(struct taprio_sched *q, static struct tc_taprio_qopt_offload *taprio_offload_alloc(int num_entries) { - size_t size = sizeof(struct tc_taprio_sched_entry) * num_entries + - sizeof(struct __tc_taprio_qopt_offload); struct __tc_taprio_qopt_offload *__offload; - __offload = kzalloc(size, GFP_KERNEL); + __offload = kzalloc(struct_size(__offload, offload.entries, num_entries), + GFP_KERNEL); if (!__offload) return NULL; diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index ccfa0ab3e7f4..aea2a982984d 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -1033,8 +1033,6 @@ static const struct proto_ops inet6_seqpacket_ops = { .mmap = sock_no_mmap, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, #endif }; @@ -1089,10 +1087,6 @@ static struct sctp_af sctp_af_inet6 = { .net_header_len = sizeof(struct ipv6hdr), .sockaddr_len = sizeof(struct sockaddr_in6), .ip_options_len = sctp_v6_ip_options_len, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_ipv6_setsockopt, - .compat_getsockopt = compat_ipv6_getsockopt, -#endif }; static struct sctp_pf sctp_pf_inet6 = { diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index cde29f3c7fb3..d19db22262fd 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1036,10 +1036,6 @@ static const struct proto_ops inet_seqpacket_ops = { .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, -#endif }; /* Registration with AF_INET family. */ @@ -1093,10 +1089,6 @@ static struct sctp_af sctp_af_inet = { .net_header_len = sizeof(struct iphdr), .sockaddr_len = sizeof(struct sockaddr_in), .ip_options_len = sctp_v4_ip_options_len, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_ip_setsockopt, - .compat_getsockopt = compat_ip_getsockopt, -#endif }; struct sctp_pf *sctp_get_pf_specific(sa_family_t family) @@ -1375,15 +1367,15 @@ static struct pernet_operations sctp_ctrlsock_ops = { /* Initialize the universe into something sensible. 
*/ static __init int sctp_init(void) { - int i; - int status = -EINVAL; - unsigned long goal; - unsigned long limit; unsigned long nr_pages = totalram_pages(); + unsigned long limit; + unsigned long goal; + int max_entry_order; + int num_entries; int max_share; + int status; int order; - int num_entries; - int max_entry_order; + int i; sock_skb_cb_check_size(sizeof(struct sctp_ulpevent)); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d57e1a002ffc..ec1fba1fbe71 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -979,9 +979,8 @@ int sctp_asconf_mgmt(struct sctp_sock *sp, struct sctp_sockaddr_entry *addrw) * * Returns 0 if ok, <0 errno code on error. */ -static int sctp_setsockopt_bindx_kernel(struct sock *sk, - struct sockaddr *addrs, int addrs_size, - int op) +static int sctp_setsockopt_bindx(struct sock *sk, struct sockaddr *addrs, + int addrs_size, int op) { int err; int addrcnt = 0; @@ -991,7 +990,7 @@ static int sctp_setsockopt_bindx_kernel(struct sock *sk, struct sctp_af *af; pr_debug("%s: sk:%p addrs:%p addrs_size:%d opt:%d\n", - __func__, sk, addrs, addrs_size, op); + __func__, sk, addr_buf, addrs_size, op); if (unlikely(addrs_size <= 0)) return -EINVAL; @@ -1037,29 +1036,13 @@ static int sctp_setsockopt_bindx_kernel(struct sock *sk, } } -static int sctp_setsockopt_bindx(struct sock *sk, - struct sockaddr __user *addrs, - int addrs_size, int op) -{ - struct sockaddr *kaddrs; - int err; - - kaddrs = memdup_user(addrs, addrs_size); - if (IS_ERR(kaddrs)) - return PTR_ERR(kaddrs); - err = sctp_setsockopt_bindx_kernel(sk, kaddrs, addrs_size, op); - kfree(kaddrs); - return err; -} - static int sctp_bind_add(struct sock *sk, struct sockaddr *addrs, int addrlen) { int err; lock_sock(sk); - err = sctp_setsockopt_bindx_kernel(sk, addrs, addrlen, - SCTP_BINDX_ADD_ADDR); + err = sctp_setsockopt_bindx(sk, addrs, addrlen, SCTP_BINDX_ADD_ADDR); release_sock(sk); return err; } @@ -1303,36 +1286,29 @@ out_free: * it. * * sk The sk of the socket - * addrs The pointer to the addresses in user land + * addrs The pointer to the addresses * addrssize Size of the addrs buffer * * Returns >=0 if ok, <0 errno code on error. */ -static int __sctp_setsockopt_connectx(struct sock *sk, - struct sockaddr __user *addrs, - int addrs_size, - sctp_assoc_t *assoc_id) +static int __sctp_setsockopt_connectx(struct sock *sk, struct sockaddr *kaddrs, + int addrs_size, sctp_assoc_t *assoc_id) { - struct sockaddr *kaddrs; int err = 0, flags = 0; pr_debug("%s: sk:%p addrs:%p addrs_size:%d\n", - __func__, sk, addrs, addrs_size); + __func__, sk, kaddrs, addrs_size); /* make sure the 1st addr's sa_family is accessible later */ if (unlikely(addrs_size < sizeof(sa_family_t))) return -EINVAL; - kaddrs = memdup_user(addrs, addrs_size); - if (IS_ERR(kaddrs)) - return PTR_ERR(kaddrs); - /* Allow security module to validate connectx addresses. */ err = security_sctp_bind_connect(sk, SCTP_SOCKOPT_CONNECTX, (struct sockaddr *)kaddrs, addrs_size); if (err) - goto out_free; + return err; /* in-kernel sockets don't generally have a file allocated to them * if all they do is call sock_create_kern(). @@ -1340,12 +1316,7 @@ static int __sctp_setsockopt_connectx(struct sock *sk, if (sk->sk_socket->file) flags = sk->sk_socket->file->f_flags; - err = __sctp_connect(sk, kaddrs, addrs_size, flags, assoc_id); - -out_free: - kfree(kaddrs); - - return err; + return __sctp_connect(sk, kaddrs, addrs_size, flags, assoc_id); } /* @@ -1353,10 +1324,10 @@ out_free: * to the option that doesn't provide association id. 
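Editorial note on the bindx/connectx rework above: the memdup_user() copy moves out of the low-level helpers and into the one path that still receives a __user pointer, so in-kernel callers such as sctp_bind_add() can pass kernel memory directly. A sketch of that copy-once boundary pattern; do_opt_user()/do_opt_kernel() are hypothetical names, not SCTP functions:

    #include <linux/err.h>
    #include <linux/slab.h>
    #include <net/sock.h>

    /* Hypothetical consumer that only ever sees kernel memory. */
    static int do_opt_kernel(struct sock *sk, void *kbuf, unsigned int optlen);

    static int do_opt_user(struct sock *sk, void __user *optval,
                           unsigned int optlen)
    {
            void *kbuf;
            int err;

            /* One kmalloc + copy_from_user at the syscall boundary... */
            kbuf = memdup_user(optval, optlen);
            if (IS_ERR(kbuf))
                    return PTR_ERR(kbuf);

            /* ...then everything below works on kernel pointers only. */
            err = do_opt_kernel(sk, kbuf, optlen);
            kfree(kbuf);
            return err;
    }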
*/ static int sctp_setsockopt_connectx_old(struct sock *sk, - struct sockaddr __user *addrs, + struct sockaddr *kaddrs, int addrs_size) { - return __sctp_setsockopt_connectx(sk, addrs, addrs_size, NULL); + return __sctp_setsockopt_connectx(sk, kaddrs, addrs_size, NULL); } /* @@ -1366,13 +1337,13 @@ static int sctp_setsockopt_connectx_old(struct sock *sk, * always positive. */ static int sctp_setsockopt_connectx(struct sock *sk, - struct sockaddr __user *addrs, + struct sockaddr *kaddrs, int addrs_size) { sctp_assoc_t assoc_id = 0; int err = 0; - err = __sctp_setsockopt_connectx(sk, addrs, addrs_size, &assoc_id); + err = __sctp_setsockopt_connectx(sk, kaddrs, addrs_size, &assoc_id); if (err) return err; @@ -1402,6 +1373,7 @@ static int sctp_getsockopt_connectx3(struct sock *sk, int len, { struct sctp_getaddrs_old param; sctp_assoc_t assoc_id = 0; + struct sockaddr *kaddrs; int err = 0; #ifdef CONFIG_COMPAT @@ -1425,9 +1397,12 @@ static int sctp_getsockopt_connectx3(struct sock *sk, int len, return -EFAULT; } - err = __sctp_setsockopt_connectx(sk, (struct sockaddr __user *) param.addrs, param.addr_num, &assoc_id); + kaddrs = memdup_user(param.addrs, param.addr_num); + if (IS_ERR(kaddrs)) + return PTR_ERR(kaddrs); + + err = __sctp_setsockopt_connectx(sk, kaddrs, param.addr_num, &assoc_id); + kfree(kaddrs); if (err == 0 || err == -EINPROGRESS) { if (copy_to_user(optval, &assoc_id, sizeof(assoc_id))) return -EFAULT; @@ -2209,28 +2184,18 @@ out: * exceeds the current PMTU size, the message will NOT be sent and * instead an error will be indicated to the user. */ -static int sctp_setsockopt_disable_fragments(struct sock *sk, - char __user *optval, +static int sctp_setsockopt_disable_fragments(struct sock *sk, int *val, unsigned int optlen) { - int val; - if (optlen < sizeof(int)) return -EINVAL; - - if (get_user(val, (int __user *)optval)) - return -EFAULT; - - sctp_sk(sk)->disable_fragments = (val == 0) ? 0 : 1; - + sctp_sk(sk)->disable_fragments = (*val == 0) ? 0 : 1; return 0; } -static int sctp_setsockopt_events(struct sock *sk, char __user *optval, +static int sctp_setsockopt_events(struct sock *sk, __u8 *sn_type, unsigned int optlen) { - struct sctp_event_subscribe subscribe; - __u8 *sn_type = (__u8 *)&subscribe; struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; int i; @@ -2238,9 +2203,6 @@ static int sctp_setsockopt_events(struct sock *sk, char __user *optval, if (optlen > sizeof(struct sctp_event_subscribe)) return -EINVAL; - if (copy_from_user(&subscribe, optval, optlen)) - return -EFAULT; - for (i = 0; i < optlen; i++) sctp_ulpevent_type_set(&sp->subscribe, SCTP_SN_TYPE_BASE + i, sn_type[i]); @@ -2280,7 +2242,7 @@ static int sctp_setsockopt_events(struct sock *sk, char __user *optval, * integer defining the number of seconds of idle time before an * association is closed.
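Editorial note: the autoclose option documented above is driven from user space with a plain int of seconds. A minimal sketch, assuming lksctp-tools' <netinet/sctp.h>; set_autoclose() is an illustrative name:

    #include <netinet/sctp.h>
    #include <sys/socket.h>

    /* One-to-many style sockets only (EOPNOTSUPP otherwise); 0 disables
     * autoclose, and the kernel clamps the value to the
     * net.sctp.max_autoclose sysctl, as the hunk below shows. */
    static int set_autoclose(int fd, int seconds)
    {
            return setsockopt(fd, IPPROTO_SCTP, SCTP_AUTOCLOSE,
                              &seconds, sizeof(seconds));
    }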
*/ -static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, +static int sctp_setsockopt_autoclose(struct sock *sk, u32 *optval, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); @@ -2291,9 +2253,8 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, return -EOPNOTSUPP; if (optlen != sizeof(int)) return -EINVAL; - if (copy_from_user(&sp->autoclose, optval, optlen)) - return -EFAULT; + sp->autoclose = *optval; if (sp->autoclose > net->sctp.max_autoclose) sp->autoclose = net->sctp.max_autoclose; @@ -2628,48 +2589,42 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, } static int sctp_setsockopt_peer_addr_params(struct sock *sk, - char __user *optval, + struct sctp_paddrparams *params, unsigned int optlen) { - struct sctp_paddrparams params; struct sctp_transport *trans = NULL; struct sctp_association *asoc = NULL; struct sctp_sock *sp = sctp_sk(sk); int error; int hb_change, pmtud_change, sackdelay_change; - if (optlen == sizeof(params)) { - if (copy_from_user(¶ms, optval, optlen)) - return -EFAULT; - } else if (optlen == ALIGN(offsetof(struct sctp_paddrparams, + if (optlen == ALIGN(offsetof(struct sctp_paddrparams, spp_ipv6_flowlabel), 4)) { - if (copy_from_user(¶ms, optval, optlen)) - return -EFAULT; - if (params.spp_flags & (SPP_DSCP | SPP_IPV6_FLOWLABEL)) + if (params->spp_flags & (SPP_DSCP | SPP_IPV6_FLOWLABEL)) return -EINVAL; - } else { + } else if (optlen != sizeof(*params)) { return -EINVAL; } /* Validate flags and value parameters. */ - hb_change = params.spp_flags & SPP_HB; - pmtud_change = params.spp_flags & SPP_PMTUD; - sackdelay_change = params.spp_flags & SPP_SACKDELAY; + hb_change = params->spp_flags & SPP_HB; + pmtud_change = params->spp_flags & SPP_PMTUD; + sackdelay_change = params->spp_flags & SPP_SACKDELAY; if (hb_change == SPP_HB || pmtud_change == SPP_PMTUD || sackdelay_change == SPP_SACKDELAY || - params.spp_sackdelay > 500 || - (params.spp_pathmtu && - params.spp_pathmtu < SCTP_DEFAULT_MINSEGMENT)) + params->spp_sackdelay > 500 || + (params->spp_pathmtu && + params->spp_pathmtu < SCTP_DEFAULT_MINSEGMENT)) return -EINVAL; /* If an address other than INADDR_ANY is specified, and * no transport is found, then the request is invalid. */ - if (!sctp_is_any(sk, (union sctp_addr *)¶ms.spp_address)) { - trans = sctp_addr_id2transport(sk, ¶ms.spp_address, - params.spp_assoc_id); + if (!sctp_is_any(sk, (union sctp_addr *)¶ms->spp_address)) { + trans = sctp_addr_id2transport(sk, ¶ms->spp_address, + params->spp_assoc_id); if (!trans) return -EINVAL; } @@ -2678,19 +2633,19 @@ static int sctp_setsockopt_peer_addr_params(struct sock *sk, * socket is a one to many style socket, and an association * was not found, then the id was invalid. */ - asoc = sctp_id2assoc(sk, params.spp_assoc_id); - if (!asoc && params.spp_assoc_id != SCTP_FUTURE_ASSOC && + asoc = sctp_id2assoc(sk, params->spp_assoc_id); + if (!asoc && params->spp_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; /* Heartbeat demand can only be sent on a transport or * association, but not a socket. */ - if (params.spp_flags & SPP_HB_DEMAND && !trans && !asoc) + if (params->spp_flags & SPP_HB_DEMAND && !trans && !asoc) return -EINVAL; /* Process parameters. 
*/ - error = sctp_apply_peer_addr_params(¶ms, trans, asoc, sp, + error = sctp_apply_peer_addr_params(params, trans, asoc, sp, hb_change, pmtud_change, sackdelay_change); @@ -2703,7 +2658,7 @@ static int sctp_setsockopt_peer_addr_params(struct sock *sk, if (!trans && asoc) { list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) { - sctp_apply_peer_addr_params(¶ms, trans, asoc, sp, + sctp_apply_peer_addr_params(params, trans, asoc, sp, hb_change, pmtud_change, sackdelay_change); } @@ -2794,83 +2749,86 @@ static void sctp_apply_asoc_delayed_ack(struct sctp_sack_info *params, * timer to expire. The default value for this is 2, setting this * value to 1 will disable the delayed sack algorithm. */ - -static int sctp_setsockopt_delayed_ack(struct sock *sk, - char __user *optval, unsigned int optlen) +static int __sctp_setsockopt_delayed_ack(struct sock *sk, + struct sctp_sack_info *params) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; - struct sctp_sack_info params; - - if (optlen == sizeof(struct sctp_sack_info)) { - if (copy_from_user(¶ms, optval, optlen)) - return -EFAULT; - - if (params.sack_delay == 0 && params.sack_freq == 0) - return 0; - } else if (optlen == sizeof(struct sctp_assoc_value)) { - pr_warn_ratelimited(DEPRECATED - "%s (pid %d) " - "Use of struct sctp_assoc_value in delayed_ack socket option.\n" - "Use struct sctp_sack_info instead\n", - current->comm, task_pid_nr(current)); - if (copy_from_user(¶ms, optval, optlen)) - return -EFAULT; - - if (params.sack_delay == 0) - params.sack_freq = 1; - else - params.sack_freq = 0; - } else - return -EINVAL; /* Validate value parameter. */ - if (params.sack_delay > 500) + if (params->sack_delay > 500) return -EINVAL; /* Get association, if sack_assoc_id != SCTP_FUTURE_ASSOC and the * socket is a one to many style socket, and an association * was not found, then the id was invalid. 
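Editorial note on the sctp_paddrparams conversion above: it keeps the dual-size layout, where the shorter legacy struct ends before spp_ipv6_flowlabel and may not carry the SPP_DSCP or SPP_IPV6_FLOWLABEL flags. From user space the full struct is used like this minimal sketch, assuming lksctp-tools' <netinet/sctp.h>; set_hb_interval() is an illustrative name:

    #include <string.h>
    #include <netinet/sctp.h>
    #include <sys/socket.h>

    static int set_hb_interval(int fd, sctp_assoc_t id, unsigned int msecs)
    {
            struct sctp_paddrparams p;

            memset(&p, 0, sizeof(p));   /* wildcard spp_address: whole assoc */
            p.spp_assoc_id = id;
            p.spp_flags = SPP_HB_ENABLE;
            p.spp_hbinterval = msecs;
            return setsockopt(fd, IPPROTO_SCTP, SCTP_PEER_ADDR_PARAMS,
                              &p, sizeof(p));
    }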
*/ - asoc = sctp_id2assoc(sk, params.sack_assoc_id); - if (!asoc && params.sack_assoc_id > SCTP_ALL_ASSOC && + asoc = sctp_id2assoc(sk, params->sack_assoc_id); + if (!asoc && params->sack_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { - sctp_apply_asoc_delayed_ack(¶ms, asoc); + sctp_apply_asoc_delayed_ack(params, asoc); return 0; } if (sctp_style(sk, TCP)) - params.sack_assoc_id = SCTP_FUTURE_ASSOC; + params->sack_assoc_id = SCTP_FUTURE_ASSOC; - if (params.sack_assoc_id == SCTP_FUTURE_ASSOC || - params.sack_assoc_id == SCTP_ALL_ASSOC) { - if (params.sack_delay) { - sp->sackdelay = params.sack_delay; + if (params->sack_assoc_id == SCTP_FUTURE_ASSOC || + params->sack_assoc_id == SCTP_ALL_ASSOC) { + if (params->sack_delay) { + sp->sackdelay = params->sack_delay; sp->param_flags = sctp_spp_sackdelay_enable(sp->param_flags); } - if (params.sack_freq == 1) { + if (params->sack_freq == 1) { sp->param_flags = sctp_spp_sackdelay_disable(sp->param_flags); - } else if (params.sack_freq > 1) { - sp->sackfreq = params.sack_freq; + } else if (params->sack_freq > 1) { + sp->sackfreq = params->sack_freq; sp->param_flags = sctp_spp_sackdelay_enable(sp->param_flags); } } - if (params.sack_assoc_id == SCTP_CURRENT_ASSOC || - params.sack_assoc_id == SCTP_ALL_ASSOC) + if (params->sack_assoc_id == SCTP_CURRENT_ASSOC || + params->sack_assoc_id == SCTP_ALL_ASSOC) list_for_each_entry(asoc, &sp->ep->asocs, asocs) - sctp_apply_asoc_delayed_ack(¶ms, asoc); + sctp_apply_asoc_delayed_ack(params, asoc); return 0; } +static int sctp_setsockopt_delayed_ack(struct sock *sk, + struct sctp_sack_info *params, + unsigned int optlen) +{ + if (optlen == sizeof(struct sctp_assoc_value)) { + struct sctp_assoc_value *v = (struct sctp_assoc_value *)params; + struct sctp_sack_info p; + + pr_warn_ratelimited(DEPRECATED + "%s (pid %d) " + "Use of struct sctp_assoc_value in delayed_ack socket option.\n" + "Use struct sctp_sack_info instead\n", + current->comm, task_pid_nr(current)); + + p.sack_assoc_id = v->assoc_id; + p.sack_delay = v->assoc_value; + p.sack_freq = v->assoc_value ? 0 : 1; + return __sctp_setsockopt_delayed_ack(sk, &p); + } + + if (optlen != sizeof(struct sctp_sack_info)) + return -EINVAL; + if (params->sack_delay == 0 && params->sack_freq == 0) + return 0; + return __sctp_setsockopt_delayed_ack(sk, params); +} + /* 7.1.3 Initialization Parameters (SCTP_INITMSG) * * Applications can specify protocol parameters for the default association @@ -2882,24 +2840,22 @@ static int sctp_setsockopt_delayed_ack(struct sock *sk, * by the change). With TCP-style sockets, this option is inherited by * sockets derived from a listener socket. 
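Editorial note: the SCTP_INITMSG option described above is set from user space with struct sctp_initmsg; as the hunk that follows shows, zeroed fields deliberately leave the current defaults untouched. A minimal sketch, assuming lksctp-tools' <netinet/sctp.h>; set_initmsg() is an illustrative name:

    #include <string.h>
    #include <netinet/sctp.h>
    #include <sys/socket.h>

    static int set_initmsg(int fd, unsigned short ostreams,
                           unsigned short instreams)
    {
            struct sctp_initmsg im;

            memset(&im, 0, sizeof(im)); /* zero fields keep their defaults */
            im.sinit_num_ostreams = ostreams;
            im.sinit_max_instreams = instreams;
            return setsockopt(fd, IPPROTO_SCTP, SCTP_INITMSG,
                              &im, sizeof(im));
    }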
*/ -static int sctp_setsockopt_initmsg(struct sock *sk, char __user *optval, unsigned int optlen) +static int sctp_setsockopt_initmsg(struct sock *sk, struct sctp_initmsg *sinit, + unsigned int optlen) { - struct sctp_initmsg sinit; struct sctp_sock *sp = sctp_sk(sk); if (optlen != sizeof(struct sctp_initmsg)) return -EINVAL; - if (copy_from_user(&sinit, optval, optlen)) - return -EFAULT; - if (sinit.sinit_num_ostreams) - sp->initmsg.sinit_num_ostreams = sinit.sinit_num_ostreams; - if (sinit.sinit_max_instreams) - sp->initmsg.sinit_max_instreams = sinit.sinit_max_instreams; - if (sinit.sinit_max_attempts) - sp->initmsg.sinit_max_attempts = sinit.sinit_max_attempts; - if (sinit.sinit_max_init_timeo) - sp->initmsg.sinit_max_init_timeo = sinit.sinit_max_init_timeo; + if (sinit->sinit_num_ostreams) + sp->initmsg.sinit_num_ostreams = sinit->sinit_num_ostreams; + if (sinit->sinit_max_instreams) + sp->initmsg.sinit_max_instreams = sinit->sinit_max_instreams; + if (sinit->sinit_max_attempts) + sp->initmsg.sinit_max_attempts = sinit->sinit_max_attempts; + if (sinit->sinit_max_init_timeo) + sp->initmsg.sinit_max_init_timeo = sinit->sinit_max_init_timeo; return 0; } @@ -2919,57 +2875,54 @@ static int sctp_setsockopt_initmsg(struct sock *sk, char __user *optval, unsigne * to this call if the caller is using the UDP model. */ static int sctp_setsockopt_default_send_param(struct sock *sk, - char __user *optval, + struct sctp_sndrcvinfo *info, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; - struct sctp_sndrcvinfo info; - if (optlen != sizeof(info)) + if (optlen != sizeof(*info)) return -EINVAL; - if (copy_from_user(&info, optval, optlen)) - return -EFAULT; - if (info.sinfo_flags & + if (info->sinfo_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | SCTP_ABORT | SCTP_EOF)) return -EINVAL; - asoc = sctp_id2assoc(sk, info.sinfo_assoc_id); - if (!asoc && info.sinfo_assoc_id > SCTP_ALL_ASSOC && + asoc = sctp_id2assoc(sk, info->sinfo_assoc_id); + if (!asoc && info->sinfo_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { - asoc->default_stream = info.sinfo_stream; - asoc->default_flags = info.sinfo_flags; - asoc->default_ppid = info.sinfo_ppid; - asoc->default_context = info.sinfo_context; - asoc->default_timetolive = info.sinfo_timetolive; + asoc->default_stream = info->sinfo_stream; + asoc->default_flags = info->sinfo_flags; + asoc->default_ppid = info->sinfo_ppid; + asoc->default_context = info->sinfo_context; + asoc->default_timetolive = info->sinfo_timetolive; return 0; } if (sctp_style(sk, TCP)) - info.sinfo_assoc_id = SCTP_FUTURE_ASSOC; + info->sinfo_assoc_id = SCTP_FUTURE_ASSOC; - if (info.sinfo_assoc_id == SCTP_FUTURE_ASSOC || - info.sinfo_assoc_id == SCTP_ALL_ASSOC) { - sp->default_stream = info.sinfo_stream; - sp->default_flags = info.sinfo_flags; - sp->default_ppid = info.sinfo_ppid; - sp->default_context = info.sinfo_context; - sp->default_timetolive = info.sinfo_timetolive; + if (info->sinfo_assoc_id == SCTP_FUTURE_ASSOC || + info->sinfo_assoc_id == SCTP_ALL_ASSOC) { + sp->default_stream = info->sinfo_stream; + sp->default_flags = info->sinfo_flags; + sp->default_ppid = info->sinfo_ppid; + sp->default_context = info->sinfo_context; + sp->default_timetolive = info->sinfo_timetolive; } - if (info.sinfo_assoc_id == SCTP_CURRENT_ASSOC || - info.sinfo_assoc_id == SCTP_ALL_ASSOC) { + if (info->sinfo_assoc_id == SCTP_CURRENT_ASSOC || + info->sinfo_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &sp->ep->asocs, asocs) { - 
asoc->default_stream = info.sinfo_stream; - asoc->default_flags = info.sinfo_flags; - asoc->default_ppid = info.sinfo_ppid; - asoc->default_context = info.sinfo_context; - asoc->default_timetolive = info.sinfo_timetolive; + asoc->default_stream = info->sinfo_stream; + asoc->default_flags = info->sinfo_flags; + asoc->default_ppid = info->sinfo_ppid; + asoc->default_context = info->sinfo_context; + asoc->default_timetolive = info->sinfo_timetolive; } } @@ -2980,54 +2933,51 @@ static int sctp_setsockopt_default_send_param(struct sock *sk, * (SCTP_DEFAULT_SNDINFO) */ static int sctp_setsockopt_default_sndinfo(struct sock *sk, - char __user *optval, + struct sctp_sndinfo *info, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; - struct sctp_sndinfo info; - if (optlen != sizeof(info)) + if (optlen != sizeof(*info)) return -EINVAL; - if (copy_from_user(&info, optval, optlen)) - return -EFAULT; - if (info.snd_flags & + if (info->snd_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | SCTP_ABORT | SCTP_EOF)) return -EINVAL; - asoc = sctp_id2assoc(sk, info.snd_assoc_id); - if (!asoc && info.snd_assoc_id > SCTP_ALL_ASSOC && + asoc = sctp_id2assoc(sk, info->snd_assoc_id); + if (!asoc && info->snd_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { - asoc->default_stream = info.snd_sid; - asoc->default_flags = info.snd_flags; - asoc->default_ppid = info.snd_ppid; - asoc->default_context = info.snd_context; + asoc->default_stream = info->snd_sid; + asoc->default_flags = info->snd_flags; + asoc->default_ppid = info->snd_ppid; + asoc->default_context = info->snd_context; return 0; } if (sctp_style(sk, TCP)) - info.snd_assoc_id = SCTP_FUTURE_ASSOC; + info->snd_assoc_id = SCTP_FUTURE_ASSOC; - if (info.snd_assoc_id == SCTP_FUTURE_ASSOC || - info.snd_assoc_id == SCTP_ALL_ASSOC) { - sp->default_stream = info.snd_sid; - sp->default_flags = info.snd_flags; - sp->default_ppid = info.snd_ppid; - sp->default_context = info.snd_context; + if (info->snd_assoc_id == SCTP_FUTURE_ASSOC || + info->snd_assoc_id == SCTP_ALL_ASSOC) { + sp->default_stream = info->snd_sid; + sp->default_flags = info->snd_flags; + sp->default_ppid = info->snd_ppid; + sp->default_context = info->snd_context; } - if (info.snd_assoc_id == SCTP_CURRENT_ASSOC || - info.snd_assoc_id == SCTP_ALL_ASSOC) { + if (info->snd_assoc_id == SCTP_CURRENT_ASSOC || + info->snd_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &sp->ep->asocs, asocs) { - asoc->default_stream = info.snd_sid; - asoc->default_flags = info.snd_flags; - asoc->default_ppid = info.snd_ppid; - asoc->default_context = info.snd_context; + asoc->default_stream = info->snd_sid; + asoc->default_flags = info->snd_flags; + asoc->default_ppid = info->snd_ppid; + asoc->default_context = info->snd_context; } } @@ -3040,10 +2990,9 @@ static int sctp_setsockopt_default_sndinfo(struct sock *sk, * the association primary. The enclosed address must be one of the * association peer's addresses. 
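Editorial note on the SCTP_DEFAULT_SNDINFO conversion above: it applies the same pointer rewrite to the RFC 6458 replacement for sctp_sndrcvinfo. A minimal userspace sketch, assuming lksctp-tools' <netinet/sctp.h> and a UAPI recent enough to define SCTP_FUTURE_ASSOC; set_default_sndinfo() is an illustrative name:

    #include <string.h>
    #include <netinet/sctp.h>
    #include <sys/socket.h>

    static int set_default_sndinfo(int fd, unsigned short sid,
                                   unsigned int ppid)
    {
            struct sctp_sndinfo si;

            memset(&si, 0, sizeof(si));
            si.snd_assoc_id = SCTP_FUTURE_ASSOC; /* socket-wide default */
            si.snd_sid = sid;
            si.snd_ppid = ppid;  /* opaque to the stack; conventionally
                                  * network byte order */
            return setsockopt(fd, IPPROTO_SCTP, SCTP_DEFAULT_SNDINFO,
                              &si, sizeof(si));
    }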
*/ -static int sctp_setsockopt_primary_addr(struct sock *sk, char __user *optval, +static int sctp_setsockopt_primary_addr(struct sock *sk, struct sctp_prim *prim, unsigned int optlen) { - struct sctp_prim prim; struct sctp_transport *trans; struct sctp_af *af; int err; @@ -3051,21 +3000,18 @@ static int sctp_setsockopt_primary_addr(struct sock *sk, char __user *optval, if (optlen != sizeof(struct sctp_prim)) return -EINVAL; - if (copy_from_user(&prim, optval, sizeof(struct sctp_prim))) - return -EFAULT; - /* Allow security module to validate address but need address len. */ - af = sctp_get_af_specific(prim.ssp_addr.ss_family); + af = sctp_get_af_specific(prim->ssp_addr.ss_family); if (!af) return -EINVAL; err = security_sctp_bind_connect(sk, SCTP_PRIMARY_ADDR, - (struct sockaddr *)&prim.ssp_addr, + (struct sockaddr *)&prim->ssp_addr, af->sockaddr_len); if (err) return err; - trans = sctp_addr_id2transport(sk, &prim.ssp_addr, prim.ssp_assoc_id); + trans = sctp_addr_id2transport(sk, &prim->ssp_addr, prim->ssp_assoc_id); if (!trans) return -EINVAL; @@ -3082,17 +3028,12 @@ static int sctp_setsockopt_primary_addr(struct sock *sk, char __user *optval, * introduced, at the cost of more packets in the network. Expects an * integer boolean flag. */ -static int sctp_setsockopt_nodelay(struct sock *sk, char __user *optval, +static int sctp_setsockopt_nodelay(struct sock *sk, int *val, unsigned int optlen) { - int val; - if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) - return -EFAULT; - - sctp_sk(sk)->nodelay = (val == 0) ? 0 : 1; + sctp_sk(sk)->nodelay = (*val == 0) ? 0 : 1; return 0; } @@ -3108,9 +3049,10 @@ static int sctp_setsockopt_nodelay(struct sock *sk, char __user *optval, * be changed. * */ -static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user *optval, unsigned int optlen) +static int sctp_setsockopt_rtoinfo(struct sock *sk, + struct sctp_rtoinfo *rtoinfo, + unsigned int optlen) { - struct sctp_rtoinfo rtoinfo; struct sctp_association *asoc; unsigned long rto_min, rto_max; struct sctp_sock *sp = sctp_sk(sk); @@ -3118,18 +3060,15 @@ static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user *optval, unsigne if (optlen != sizeof (struct sctp_rtoinfo)) return -EINVAL; - if (copy_from_user(&rtoinfo, optval, optlen)) - return -EFAULT; - - asoc = sctp_id2assoc(sk, rtoinfo.srto_assoc_id); + asoc = sctp_id2assoc(sk, rtoinfo->srto_assoc_id); /* Set the values to the specific association */ - if (!asoc && rtoinfo.srto_assoc_id != SCTP_FUTURE_ASSOC && + if (!asoc && rtoinfo->srto_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; - rto_max = rtoinfo.srto_max; - rto_min = rtoinfo.srto_min; + rto_max = rtoinfo->srto_max; + rto_min = rtoinfo->srto_min; if (rto_max) rto_max = asoc ? msecs_to_jiffies(rto_max) : rto_max; @@ -3145,17 +3084,17 @@ static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user *optval, unsigne return -EINVAL; if (asoc) { - if (rtoinfo.srto_initial != 0) + if (rtoinfo->srto_initial != 0) asoc->rto_initial = - msecs_to_jiffies(rtoinfo.srto_initial); + msecs_to_jiffies(rtoinfo->srto_initial); asoc->rto_max = rto_max; asoc->rto_min = rto_min; } else { /* If there is no association or the association-id = 0 * set the values to the endpoint. 
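Editorial note on the SCTP_RTOINFO conversion above: values are in milliseconds, zero fields mean "leave unchanged", and out-of-range min/max combinations are rejected, as the surrounding hunk shows. A minimal userspace sketch, assuming lksctp-tools' <netinet/sctp.h>; set_rto() is an illustrative name:

    #include <string.h>
    #include <netinet/sctp.h>
    #include <sys/socket.h>

    static int set_rto(int fd, unsigned int init_ms, unsigned int min_ms,
                       unsigned int max_ms)
    {
            struct sctp_rtoinfo r;

            memset(&r, 0, sizeof(r));
            r.srto_assoc_id = SCTP_FUTURE_ASSOC;
            r.srto_initial = init_ms;   /* 0 == keep current value */
            r.srto_min = min_ms;
            r.srto_max = max_ms;
            return setsockopt(fd, IPPROTO_SCTP, SCTP_RTOINFO, &r, sizeof(r));
    }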
*/ - if (rtoinfo.srto_initial != 0) - sp->rtoinfo.srto_initial = rtoinfo.srto_initial; + if (rtoinfo->srto_initial != 0) + sp->rtoinfo.srto_initial = rtoinfo->srto_initial; sp->rtoinfo.srto_max = rto_max; sp->rtoinfo.srto_min = rto_min; } @@ -3174,26 +3113,25 @@ static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user *optval, unsigne * See [SCTP] for more information. * */ -static int sctp_setsockopt_associnfo(struct sock *sk, char __user *optval, unsigned int optlen) +static int sctp_setsockopt_associnfo(struct sock *sk, + struct sctp_assocparams *assocparams, + unsigned int optlen) { - struct sctp_assocparams assocparams; struct sctp_association *asoc; if (optlen != sizeof(struct sctp_assocparams)) return -EINVAL; - if (copy_from_user(&assocparams, optval, optlen)) - return -EFAULT; - asoc = sctp_id2assoc(sk, assocparams.sasoc_assoc_id); + asoc = sctp_id2assoc(sk, assocparams->sasoc_assoc_id); - if (!asoc && assocparams.sasoc_assoc_id != SCTP_FUTURE_ASSOC && + if (!asoc && assocparams->sasoc_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; /* Set the values to the specific association */ if (asoc) { - if (assocparams.sasoc_asocmaxrxt != 0) { + if (assocparams->sasoc_asocmaxrxt != 0) { __u32 path_sum = 0; int paths = 0; struct sctp_transport *peer_addr; @@ -3210,24 +3148,25 @@ static int sctp_setsockopt_associnfo(struct sock *sk, char __user *optval, unsig * then one path. */ if (paths > 1 && - assocparams.sasoc_asocmaxrxt > path_sum) + assocparams->sasoc_asocmaxrxt > path_sum) return -EINVAL; - asoc->max_retrans = assocparams.sasoc_asocmaxrxt; + asoc->max_retrans = assocparams->sasoc_asocmaxrxt; } - if (assocparams.sasoc_cookie_life != 0) - asoc->cookie_life = ms_to_ktime(assocparams.sasoc_cookie_life); + if (assocparams->sasoc_cookie_life != 0) + asoc->cookie_life = + ms_to_ktime(assocparams->sasoc_cookie_life); } else { /* Set the values to the endpoint */ struct sctp_sock *sp = sctp_sk(sk); - if (assocparams.sasoc_asocmaxrxt != 0) + if (assocparams->sasoc_asocmaxrxt != 0) sp->assocparams.sasoc_asocmaxrxt = - assocparams.sasoc_asocmaxrxt; - if (assocparams.sasoc_cookie_life != 0) + assocparams->sasoc_asocmaxrxt; + if (assocparams->sasoc_cookie_life != 0) sp->assocparams.sasoc_cookie_life = - assocparams.sasoc_cookie_life; + assocparams->sasoc_cookie_life; } return 0; } @@ -3242,16 +3181,14 @@ static int sctp_setsockopt_associnfo(struct sock *sk, char __user *optval, unsig * addresses and a user will receive both PF_INET6 and PF_INET type * addresses on the socket. */ -static int sctp_setsockopt_mappedv4(struct sock *sk, char __user *optval, unsigned int optlen) +static int sctp_setsockopt_mappedv4(struct sock *sk, int *val, + unsigned int optlen) { - int val; struct sctp_sock *sp = sctp_sk(sk); if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) - return -EFAULT; - if (val) + if (*val) sp->v4mapped = 1; else sp->v4mapped = 0; @@ -3286,11 +3223,13 @@ static int sctp_setsockopt_mappedv4(struct sock *sk, char __user *optval, unsign * changed (effecting future associations only). * assoc_value: This parameter specifies the maximum size in bytes. 
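Editorial note: the maxseg hunk below still accepts the deprecated bare int alongside struct sctp_assoc_value, warning on the legacy form; new code should use the struct. A minimal userspace sketch, assuming lksctp-tools' <netinet/sctp.h>; set_maxseg() is an illustrative name:

    #include <string.h>
    #include <netinet/sctp.h>
    #include <sys/socket.h>

    static int set_maxseg(int fd, unsigned int bytes)
    {
            struct sctp_assoc_value av;

            memset(&av, 0, sizeof(av));
            av.assoc_id = SCTP_FUTURE_ASSOC;
            av.assoc_value = bytes;  /* 0: fragmentation point follows PMTU,
                                      * per RFC 6458 */
            return setsockopt(fd, IPPROTO_SCTP, SCTP_MAXSEG,
                              &av, sizeof(av));
    }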
*/ -static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned int optlen) +static int sctp_setsockopt_maxseg(struct sock *sk, + struct sctp_assoc_value *params, + unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); - struct sctp_assoc_value params; struct sctp_association *asoc; + sctp_assoc_t assoc_id; int val; if (optlen == sizeof(int)) { @@ -3299,19 +3238,17 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned "Use of int in maxseg socket option.\n" "Use struct sctp_assoc_value instead\n", current->comm, task_pid_nr(current)); - if (copy_from_user(&val, optval, optlen)) - return -EFAULT; - params.assoc_id = SCTP_FUTURE_ASSOC; + assoc_id = SCTP_FUTURE_ASSOC; + val = *(int *)params; } else if (optlen == sizeof(struct sctp_assoc_value)) { - if (copy_from_user(¶ms, optval, optlen)) - return -EFAULT; - val = params.assoc_value; + assoc_id = params->assoc_id; + val = params->assoc_value; } else { return -EINVAL; } - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + asoc = sctp_id2assoc(sk, assoc_id); + if (!asoc && assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; @@ -3346,12 +3283,12 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned * locally bound addresses. The following structure is used to make a * set primary request: */ -static int sctp_setsockopt_peer_primary_addr(struct sock *sk, char __user *optval, +static int sctp_setsockopt_peer_primary_addr(struct sock *sk, + struct sctp_setpeerprim *prim, unsigned int optlen) { struct sctp_sock *sp; struct sctp_association *asoc = NULL; - struct sctp_setpeerprim prim; struct sctp_chunk *chunk; struct sctp_af *af; int err; @@ -3364,10 +3301,7 @@ static int sctp_setsockopt_peer_primary_addr(struct sock *sk, char __user *optva if (optlen != sizeof(struct sctp_setpeerprim)) return -EINVAL; - if (copy_from_user(&prim, optval, optlen)) - return -EFAULT; - - asoc = sctp_id2assoc(sk, prim.sspp_assoc_id); + asoc = sctp_id2assoc(sk, prim->sspp_assoc_id); if (!asoc) return -EINVAL; @@ -3380,26 +3314,26 @@ static int sctp_setsockopt_peer_primary_addr(struct sock *sk, char __user *optva if (!sctp_state(asoc, ESTABLISHED)) return -ENOTCONN; - af = sctp_get_af_specific(prim.sspp_addr.ss_family); + af = sctp_get_af_specific(prim->sspp_addr.ss_family); if (!af) return -EINVAL; - if (!af->addr_valid((union sctp_addr *)&prim.sspp_addr, sp, NULL)) + if (!af->addr_valid((union sctp_addr *)&prim->sspp_addr, sp, NULL)) return -EADDRNOTAVAIL; - if (!sctp_assoc_lookup_laddr(asoc, (union sctp_addr *)&prim.sspp_addr)) + if (!sctp_assoc_lookup_laddr(asoc, (union sctp_addr *)&prim->sspp_addr)) return -EADDRNOTAVAIL; /* Allow security module to validate address. 
*/ err = security_sctp_bind_connect(sk, SCTP_SET_PEER_PRIMARY_ADDR, - (struct sockaddr *)&prim.sspp_addr, + (struct sockaddr *)&prim->sspp_addr, af->sockaddr_len); if (err) return err; /* Create an ASCONF chunk with SET_PRIMARY parameter */ chunk = sctp_make_asconf_set_prim(asoc, - (union sctp_addr *)&prim.sspp_addr); + (union sctp_addr *)&prim->sspp_addr); if (!chunk) return -ENOMEM; @@ -3410,17 +3344,14 @@ static int sctp_setsockopt_peer_primary_addr(struct sock *sk, char __user *optva return err; } -static int sctp_setsockopt_adaptation_layer(struct sock *sk, char __user *optval, +static int sctp_setsockopt_adaptation_layer(struct sock *sk, + struct sctp_setadaptation *adapt, unsigned int optlen) { - struct sctp_setadaptation adaptation; - if (optlen != sizeof(struct sctp_setadaptation)) return -EINVAL; - if (copy_from_user(&adaptation, optval, optlen)) - return -EFAULT; - sctp_sk(sk)->adaptation_ind = adaptation.ssb_adaptation_ind; + sctp_sk(sk)->adaptation_ind = adapt->ssb_adaptation_ind; return 0; } @@ -3439,40 +3370,38 @@ static int sctp_setsockopt_adaptation_layer(struct sock *sk, char __user *optval * received messages from the peer and does not effect the value that is * saved with outbound messages. */ -static int sctp_setsockopt_context(struct sock *sk, char __user *optval, +static int sctp_setsockopt_context(struct sock *sk, + struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); - struct sctp_assoc_value params; struct sctp_association *asoc; if (optlen != sizeof(struct sctp_assoc_value)) return -EINVAL; - if (copy_from_user(¶ms, optval, optlen)) - return -EFAULT; - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id > SCTP_ALL_ASSOC && + asoc = sctp_id2assoc(sk, params->assoc_id); + if (!asoc && params->assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { - asoc->default_rcv_context = params.assoc_value; + asoc->default_rcv_context = params->assoc_value; return 0; } if (sctp_style(sk, TCP)) - params.assoc_id = SCTP_FUTURE_ASSOC; + params->assoc_id = SCTP_FUTURE_ASSOC; - if (params.assoc_id == SCTP_FUTURE_ASSOC || - params.assoc_id == SCTP_ALL_ASSOC) - sp->default_rcv_context = params.assoc_value; + if (params->assoc_id == SCTP_FUTURE_ASSOC || + params->assoc_id == SCTP_ALL_ASSOC) + sp->default_rcv_context = params->assoc_value; - if (params.assoc_id == SCTP_CURRENT_ASSOC || - params.assoc_id == SCTP_ALL_ASSOC) + if (params->assoc_id == SCTP_CURRENT_ASSOC || + params->assoc_id == SCTP_ALL_ASSOC) list_for_each_entry(asoc, &sp->ep->asocs, asocs) - asoc->default_rcv_context = params.assoc_value; + asoc->default_rcv_context = params->assoc_value; return 0; } @@ -3501,18 +3430,13 @@ static int sctp_setsockopt_context(struct sock *sk, char __user *optval, * application using the one to many model may become confused and act * incorrectly. */ -static int sctp_setsockopt_fragment_interleave(struct sock *sk, - char __user *optval, +static int sctp_setsockopt_fragment_interleave(struct sock *sk, int *val, unsigned int optlen) { - int val; - if (optlen != sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) - return -EFAULT; - sctp_sk(sk)->frag_interleave = !!val; + sctp_sk(sk)->frag_interleave = !!*val; if (!sctp_sk(sk)->frag_interleave) sctp_sk(sk)->ep->intl_enable = 0; @@ -3537,24 +3461,19 @@ static int sctp_setsockopt_fragment_interleave(struct sock *sk, * call as long as the user provided buffer is large enough to hold the * message. 
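Editorial note: as the comment above says and the hunk below enforces, the partial delivery point may not exceed half the receive buffer. A minimal userspace sketch, assuming lksctp-tools' <netinet/sctp.h>; set_pd_point() is an illustrative name:

    #include <netinet/sctp.h>
    #include <sys/socket.h>

    static int set_pd_point(int fd, unsigned int bytes)
    {
            /* Rejected with EINVAL if bytes > SO_RCVBUF / 2. */
            return setsockopt(fd, IPPROTO_SCTP, SCTP_PARTIAL_DELIVERY_POINT,
                              &bytes, sizeof(bytes));
    }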
*/ -static int sctp_setsockopt_partial_delivery_point(struct sock *sk, - char __user *optval, +static int sctp_setsockopt_partial_delivery_point(struct sock *sk, u32 *val, unsigned int optlen) { - u32 val; - if (optlen != sizeof(u32)) return -EINVAL; - if (get_user(val, (int __user *)optval)) - return -EFAULT; /* Note: We double the receive buffer from what the user sets * it to be, also initial rwnd is based on rcvbuf/2. */ - if (val > (sk->sk_rcvbuf >> 1)) + if (*val > (sk->sk_rcvbuf >> 1)) return -EINVAL; - sctp_sk(sk)->pd_point = val; + sctp_sk(sk)->pd_point = *val; return 0; /* is this the right error code? */ } @@ -3571,12 +3490,13 @@ static int sctp_setsockopt_partial_delivery_point(struct sock *sk, * future associations inheriting the socket value. */ static int sctp_setsockopt_maxburst(struct sock *sk, - char __user *optval, + struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); - struct sctp_assoc_value params; struct sctp_association *asoc; + sctp_assoc_t assoc_id; + u32 assoc_value; if (optlen == sizeof(int)) { pr_warn_ratelimited(DEPRECATED @@ -3584,37 +3504,33 @@ static int sctp_setsockopt_maxburst(struct sock *sk, "Use of int in max_burst socket option deprecated.\n" "Use struct sctp_assoc_value instead\n", current->comm, task_pid_nr(current)); - if (copy_from_user(¶ms.assoc_value, optval, optlen)) - return -EFAULT; - params.assoc_id = SCTP_FUTURE_ASSOC; + assoc_id = SCTP_FUTURE_ASSOC; + assoc_value = *((int *)params); } else if (optlen == sizeof(struct sctp_assoc_value)) { - if (copy_from_user(¶ms, optval, optlen)) - return -EFAULT; + assoc_id = params->assoc_id; + assoc_value = params->assoc_value; } else return -EINVAL; - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id > SCTP_ALL_ASSOC && - sctp_style(sk, UDP)) + asoc = sctp_id2assoc(sk, assoc_id); + if (!asoc && assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { - asoc->max_burst = params.assoc_value; + asoc->max_burst = assoc_value; return 0; } if (sctp_style(sk, TCP)) - params.assoc_id = SCTP_FUTURE_ASSOC; + assoc_id = SCTP_FUTURE_ASSOC; - if (params.assoc_id == SCTP_FUTURE_ASSOC || - params.assoc_id == SCTP_ALL_ASSOC) - sp->max_burst = params.assoc_value; + if (assoc_id == SCTP_FUTURE_ASSOC || assoc_id == SCTP_ALL_ASSOC) + sp->max_burst = assoc_value; - if (params.assoc_id == SCTP_CURRENT_ASSOC || - params.assoc_id == SCTP_ALL_ASSOC) + if (assoc_id == SCTP_CURRENT_ASSOC || assoc_id == SCTP_ALL_ASSOC) list_for_each_entry(asoc, &sp->ep->asocs, asocs) - asoc->max_burst = params.assoc_value; + asoc->max_burst = assoc_value; return 0; } @@ -3627,21 +3543,18 @@ static int sctp_setsockopt_maxburst(struct sock *sk, * will only effect future associations on the socket. 
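Editorial note: the auth-chunk setter below returns EACCES while endpoint authentication is disabled and, as its switch shows, refuses chunk types that may not be listed for authentication (INIT, INIT-ACK, SHUTDOWN-COMPLETE). A minimal userspace sketch, assuming lksctp-tools' <netinet/sctp.h>; auth_chunk() is an illustrative name:

    #include <netinet/sctp.h>
    #include <sys/socket.h>

    static int auth_chunk(int fd, unsigned char chunk_type)
    {
            struct sctp_authchunk ac = { .sauth_chunk = chunk_type };

            /* Fails with EACCES unless endpoint auth is enabled. */
            return setsockopt(fd, IPPROTO_SCTP, SCTP_AUTH_CHUNK,
                              &ac, sizeof(ac));
    }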
*/ static int sctp_setsockopt_auth_chunk(struct sock *sk, - char __user *optval, + struct sctp_authchunk *val, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; - struct sctp_authchunk val; if (!ep->auth_enable) return -EACCES; if (optlen != sizeof(struct sctp_authchunk)) return -EINVAL; - if (copy_from_user(&val, optval, optlen)) - return -EFAULT; - switch (val.sauth_chunk) { + switch (val->sauth_chunk) { case SCTP_CID_INIT: case SCTP_CID_INIT_ACK: case SCTP_CID_SHUTDOWN_COMPLETE: @@ -3650,7 +3563,7 @@ static int sctp_setsockopt_auth_chunk(struct sock *sk, } /* add this chunk id to the endpoint */ - return sctp_auth_ep_add_chunkid(ep, val.sauth_chunk); + return sctp_auth_ep_add_chunkid(ep, val->sauth_chunk); } /* @@ -3660,13 +3573,11 @@ static int sctp_setsockopt_auth_chunk(struct sock *sk, * endpoint requires the peer to use. */ static int sctp_setsockopt_hmac_ident(struct sock *sk, - char __user *optval, + struct sctp_hmacalgo *hmacs, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; - struct sctp_hmacalgo *hmacs; u32 idents; - int err; if (!ep->auth_enable) return -EACCES; @@ -3676,21 +3587,12 @@ static int sctp_setsockopt_hmac_ident(struct sock *sk, optlen = min_t(unsigned int, optlen, sizeof(struct sctp_hmacalgo) + SCTP_AUTH_NUM_HMACS * sizeof(u16)); - hmacs = memdup_user(optval, optlen); - if (IS_ERR(hmacs)) - return PTR_ERR(hmacs); - idents = hmacs->shmac_num_idents; if (idents == 0 || idents > SCTP_AUTH_NUM_HMACS || - (idents * sizeof(u16)) > (optlen - sizeof(struct sctp_hmacalgo))) { - err = -EINVAL; - goto out; - } + (idents * sizeof(u16)) > (optlen - sizeof(struct sctp_hmacalgo))) + return -EINVAL; - err = sctp_auth_ep_set_hmacs(ep, hmacs); -out: - kfree(hmacs); - return err; + return sctp_auth_ep_set_hmacs(ep, hmacs); } /* @@ -3700,11 +3602,10 @@ out: * association shared key. */ static int sctp_setsockopt_auth_key(struct sock *sk, - char __user *optval, + struct sctp_authkey *authkey, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; - struct sctp_authkey *authkey; struct sctp_association *asoc; int ret = -EINVAL; @@ -3715,10 +3616,6 @@ static int sctp_setsockopt_auth_key(struct sock *sk, */ optlen = min_t(unsigned int, optlen, USHRT_MAX + sizeof(*authkey)); - authkey = memdup_user(optval, optlen); - if (IS_ERR(authkey)) - return PTR_ERR(authkey); - if (authkey->sca_keylength > optlen - sizeof(*authkey)) goto out; @@ -3755,7 +3652,7 @@ static int sctp_setsockopt_auth_key(struct sock *sk, } out: - kzfree(authkey); + memzero_explicit(authkey, optlen); return ret; } @@ -3766,42 +3663,39 @@ out: * the association shared key. 
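Editorial note: the three key-management options converted above and below (SCTP_AUTH_ACTIVE_KEY, SCTP_AUTH_DELETE_KEY, SCTP_AUTH_DEACTIVATE_KEY) share struct sctp_authkeyid and the same assoc-id fan-out logic. A minimal userspace sketch for the active-key case, assuming lksctp-tools' <netinet/sctp.h>; set_active_key() is an illustrative name:

    #include <string.h>
    #include <netinet/sctp.h>
    #include <sys/socket.h>

    static int set_active_key(int fd, sctp_assoc_t id, unsigned short keynum)
    {
            struct sctp_authkeyid k;

            memset(&k, 0, sizeof(k));
            k.scact_assoc_id = id;  /* or SCTP_FUTURE_ASSOC / SCTP_ALL_ASSOC */
            k.scact_keynumber = keynum;
            return setsockopt(fd, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY,
                              &k, sizeof(k));
    }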
*/ static int sctp_setsockopt_active_key(struct sock *sk, - char __user *optval, + struct sctp_authkeyid *val, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_association *asoc; - struct sctp_authkeyid val; int ret = 0; if (optlen != sizeof(struct sctp_authkeyid)) return -EINVAL; - if (copy_from_user(&val, optval, optlen)) - return -EFAULT; - asoc = sctp_id2assoc(sk, val.scact_assoc_id); - if (!asoc && val.scact_assoc_id > SCTP_ALL_ASSOC && + asoc = sctp_id2assoc(sk, val->scact_assoc_id); + if (!asoc && val->scact_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) - return sctp_auth_set_active_key(ep, asoc, val.scact_keynumber); + return sctp_auth_set_active_key(ep, asoc, val->scact_keynumber); if (sctp_style(sk, TCP)) - val.scact_assoc_id = SCTP_FUTURE_ASSOC; + val->scact_assoc_id = SCTP_FUTURE_ASSOC; - if (val.scact_assoc_id == SCTP_FUTURE_ASSOC || - val.scact_assoc_id == SCTP_ALL_ASSOC) { - ret = sctp_auth_set_active_key(ep, asoc, val.scact_keynumber); + if (val->scact_assoc_id == SCTP_FUTURE_ASSOC || + val->scact_assoc_id == SCTP_ALL_ASSOC) { + ret = sctp_auth_set_active_key(ep, asoc, val->scact_keynumber); if (ret) return ret; } - if (val.scact_assoc_id == SCTP_CURRENT_ASSOC || - val.scact_assoc_id == SCTP_ALL_ASSOC) { + if (val->scact_assoc_id == SCTP_CURRENT_ASSOC || + val->scact_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &ep->asocs, asocs) { int res = sctp_auth_set_active_key(ep, asoc, - val.scact_keynumber); + val->scact_keynumber); if (res && !ret) ret = res; @@ -3817,42 +3711,39 @@ static int sctp_setsockopt_active_key(struct sock *sk, * This set option will delete a shared secret key from use. */ static int sctp_setsockopt_del_key(struct sock *sk, - char __user *optval, + struct sctp_authkeyid *val, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_association *asoc; - struct sctp_authkeyid val; int ret = 0; if (optlen != sizeof(struct sctp_authkeyid)) return -EINVAL; - if (copy_from_user(&val, optval, optlen)) - return -EFAULT; - asoc = sctp_id2assoc(sk, val.scact_assoc_id); - if (!asoc && val.scact_assoc_id > SCTP_ALL_ASSOC && + asoc = sctp_id2assoc(sk, val->scact_assoc_id); + if (!asoc && val->scact_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) - return sctp_auth_del_key_id(ep, asoc, val.scact_keynumber); + return sctp_auth_del_key_id(ep, asoc, val->scact_keynumber); if (sctp_style(sk, TCP)) - val.scact_assoc_id = SCTP_FUTURE_ASSOC; + val->scact_assoc_id = SCTP_FUTURE_ASSOC; - if (val.scact_assoc_id == SCTP_FUTURE_ASSOC || - val.scact_assoc_id == SCTP_ALL_ASSOC) { - ret = sctp_auth_del_key_id(ep, asoc, val.scact_keynumber); + if (val->scact_assoc_id == SCTP_FUTURE_ASSOC || + val->scact_assoc_id == SCTP_ALL_ASSOC) { + ret = sctp_auth_del_key_id(ep, asoc, val->scact_keynumber); if (ret) return ret; } - if (val.scact_assoc_id == SCTP_CURRENT_ASSOC || - val.scact_assoc_id == SCTP_ALL_ASSOC) { + if (val->scact_assoc_id == SCTP_CURRENT_ASSOC || + val->scact_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &ep->asocs, asocs) { int res = sctp_auth_del_key_id(ep, asoc, - val.scact_keynumber); + val->scact_keynumber); if (res && !ret) ret = res; @@ -3867,42 +3758,40 @@ static int sctp_setsockopt_del_key(struct sock *sk, * * This set option will deactivate a shared secret key. 
*/ -static int sctp_setsockopt_deactivate_key(struct sock *sk, char __user *optval, +static int sctp_setsockopt_deactivate_key(struct sock *sk, + struct sctp_authkeyid *val, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_association *asoc; - struct sctp_authkeyid val; int ret = 0; if (optlen != sizeof(struct sctp_authkeyid)) return -EINVAL; - if (copy_from_user(&val, optval, optlen)) - return -EFAULT; - asoc = sctp_id2assoc(sk, val.scact_assoc_id); - if (!asoc && val.scact_assoc_id > SCTP_ALL_ASSOC && + asoc = sctp_id2assoc(sk, val->scact_assoc_id); + if (!asoc && val->scact_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) - return sctp_auth_deact_key_id(ep, asoc, val.scact_keynumber); + return sctp_auth_deact_key_id(ep, asoc, val->scact_keynumber); if (sctp_style(sk, TCP)) - val.scact_assoc_id = SCTP_FUTURE_ASSOC; + val->scact_assoc_id = SCTP_FUTURE_ASSOC; - if (val.scact_assoc_id == SCTP_FUTURE_ASSOC || - val.scact_assoc_id == SCTP_ALL_ASSOC) { - ret = sctp_auth_deact_key_id(ep, asoc, val.scact_keynumber); + if (val->scact_assoc_id == SCTP_FUTURE_ASSOC || + val->scact_assoc_id == SCTP_ALL_ASSOC) { + ret = sctp_auth_deact_key_id(ep, asoc, val->scact_keynumber); if (ret) return ret; } - if (val.scact_assoc_id == SCTP_CURRENT_ASSOC || - val.scact_assoc_id == SCTP_ALL_ASSOC) { + if (val->scact_assoc_id == SCTP_CURRENT_ASSOC || + val->scact_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &ep->asocs, asocs) { int res = sctp_auth_deact_key_id(ep, asoc, - val.scact_keynumber); + val->scact_keynumber); if (res && !ret) ret = res; @@ -3926,26 +3815,23 @@ static int sctp_setsockopt_deactivate_key(struct sock *sk, char __user *optval, * Note. In this implementation, socket operation overrides default parameter * being set by sysctl as well as FreeBSD implementation */ -static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval, +static int sctp_setsockopt_auto_asconf(struct sock *sk, int *val, unsigned int optlen) { - int val; struct sctp_sock *sp = sctp_sk(sk); if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) - return -EFAULT; - if (!sctp_is_ep_boundall(sk) && val) + if (!sctp_is_ep_boundall(sk) && *val) return -EINVAL; - if ((val && sp->do_auto_asconf) || (!val && !sp->do_auto_asconf)) + if ((*val && sp->do_auto_asconf) || (!*val && !sp->do_auto_asconf)) return 0; spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock); - if (val == 0 && sp->do_auto_asconf) { + if (*val == 0 && sp->do_auto_asconf) { list_del(&sp->auto_asconf_list); sp->do_auto_asconf = 0; - } else if (val && !sp->do_auto_asconf) { + } else if (*val && !sp->do_auto_asconf) { list_add_tail(&sp->auto_asconf_list, &sock_net(sk)->sctp.auto_asconf_splist); sp->do_auto_asconf = 1; @@ -3962,176 +3848,154 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval, * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt */ static int sctp_setsockopt_paddr_thresholds(struct sock *sk, - char __user *optval, + struct sctp_paddrthlds_v2 *val, unsigned int optlen, bool v2) { - struct sctp_paddrthlds_v2 val; struct sctp_transport *trans; struct sctp_association *asoc; int len; - len = v2 ? sizeof(val) : sizeof(struct sctp_paddrthlds); + len = v2 ? 
sizeof(*val) : sizeof(struct sctp_paddrthlds); if (optlen < len) return -EINVAL; - if (copy_from_user(&val, optval, len)) - return -EFAULT; - if (v2 && val.spt_pathpfthld > val.spt_pathcpthld) + if (v2 && val->spt_pathpfthld > val->spt_pathcpthld) return -EINVAL; - if (!sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) { - trans = sctp_addr_id2transport(sk, &val.spt_address, - val.spt_assoc_id); + if (!sctp_is_any(sk, (const union sctp_addr *)&val->spt_address)) { + trans = sctp_addr_id2transport(sk, &val->spt_address, + val->spt_assoc_id); if (!trans) return -ENOENT; - if (val.spt_pathmaxrxt) - trans->pathmaxrxt = val.spt_pathmaxrxt; + if (val->spt_pathmaxrxt) + trans->pathmaxrxt = val->spt_pathmaxrxt; if (v2) - trans->ps_retrans = val.spt_pathcpthld; - trans->pf_retrans = val.spt_pathpfthld; + trans->ps_retrans = val->spt_pathcpthld; + trans->pf_retrans = val->spt_pathpfthld; return 0; } - asoc = sctp_id2assoc(sk, val.spt_assoc_id); - if (!asoc && val.spt_assoc_id != SCTP_FUTURE_ASSOC && + asoc = sctp_id2assoc(sk, val->spt_assoc_id); + if (!asoc && val->spt_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) { - if (val.spt_pathmaxrxt) - trans->pathmaxrxt = val.spt_pathmaxrxt; + if (val->spt_pathmaxrxt) + trans->pathmaxrxt = val->spt_pathmaxrxt; if (v2) - trans->ps_retrans = val.spt_pathcpthld; - trans->pf_retrans = val.spt_pathpfthld; + trans->ps_retrans = val->spt_pathcpthld; + trans->pf_retrans = val->spt_pathpfthld; } - if (val.spt_pathmaxrxt) - asoc->pathmaxrxt = val.spt_pathmaxrxt; + if (val->spt_pathmaxrxt) + asoc->pathmaxrxt = val->spt_pathmaxrxt; if (v2) - asoc->ps_retrans = val.spt_pathcpthld; - asoc->pf_retrans = val.spt_pathpfthld; + asoc->ps_retrans = val->spt_pathcpthld; + asoc->pf_retrans = val->spt_pathpfthld; } else { struct sctp_sock *sp = sctp_sk(sk); - if (val.spt_pathmaxrxt) - sp->pathmaxrxt = val.spt_pathmaxrxt; + if (val->spt_pathmaxrxt) + sp->pathmaxrxt = val->spt_pathmaxrxt; if (v2) - sp->ps_retrans = val.spt_pathcpthld; - sp->pf_retrans = val.spt_pathpfthld; + sp->ps_retrans = val->spt_pathcpthld; + sp->pf_retrans = val->spt_pathpfthld; } return 0; } -static int sctp_setsockopt_recvrcvinfo(struct sock *sk, - char __user *optval, +static int sctp_setsockopt_recvrcvinfo(struct sock *sk, int *val, unsigned int optlen) { - int val; - if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *) optval)) - return -EFAULT; - sctp_sk(sk)->recvrcvinfo = (val == 0) ? 0 : 1; + sctp_sk(sk)->recvrcvinfo = (*val == 0) ? 0 : 1; return 0; } -static int sctp_setsockopt_recvnxtinfo(struct sock *sk, - char __user *optval, +static int sctp_setsockopt_recvnxtinfo(struct sock *sk, int *val, unsigned int optlen) { - int val; - if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *) optval)) - return -EFAULT; - sctp_sk(sk)->recvnxtinfo = (val == 0) ? 0 : 1; + sctp_sk(sk)->recvnxtinfo = (*val == 0) ? 
0 : 1; return 0; } static int sctp_setsockopt_pr_supported(struct sock *sk, - char __user *optval, + struct sctp_assoc_value *params, unsigned int optlen) { - struct sctp_assoc_value params; struct sctp_association *asoc; - if (optlen != sizeof(params)) + if (optlen != sizeof(*params)) return -EINVAL; - if (copy_from_user(¶ms, optval, optlen)) - return -EFAULT; - - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + asoc = sctp_id2assoc(sk, params->assoc_id); + if (!asoc && params->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; - sctp_sk(sk)->ep->prsctp_enable = !!params.assoc_value; + sctp_sk(sk)->ep->prsctp_enable = !!params->assoc_value; return 0; } static int sctp_setsockopt_default_prinfo(struct sock *sk, - char __user *optval, + struct sctp_default_prinfo *info, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); - struct sctp_default_prinfo info; struct sctp_association *asoc; int retval = -EINVAL; - if (optlen != sizeof(info)) + if (optlen != sizeof(*info)) goto out; - if (copy_from_user(&info, optval, sizeof(info))) { - retval = -EFAULT; + if (info->pr_policy & ~SCTP_PR_SCTP_MASK) goto out; - } - if (info.pr_policy & ~SCTP_PR_SCTP_MASK) - goto out; + if (info->pr_policy == SCTP_PR_SCTP_NONE) + info->pr_value = 0; - if (info.pr_policy == SCTP_PR_SCTP_NONE) - info.pr_value = 0; - - asoc = sctp_id2assoc(sk, info.pr_assoc_id); - if (!asoc && info.pr_assoc_id > SCTP_ALL_ASSOC && + asoc = sctp_id2assoc(sk, info->pr_assoc_id); + if (!asoc && info->pr_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) goto out; retval = 0; if (asoc) { - SCTP_PR_SET_POLICY(asoc->default_flags, info.pr_policy); - asoc->default_timetolive = info.pr_value; + SCTP_PR_SET_POLICY(asoc->default_flags, info->pr_policy); + asoc->default_timetolive = info->pr_value; goto out; } if (sctp_style(sk, TCP)) - info.pr_assoc_id = SCTP_FUTURE_ASSOC; + info->pr_assoc_id = SCTP_FUTURE_ASSOC; - if (info.pr_assoc_id == SCTP_FUTURE_ASSOC || - info.pr_assoc_id == SCTP_ALL_ASSOC) { - SCTP_PR_SET_POLICY(sp->default_flags, info.pr_policy); - sp->default_timetolive = info.pr_value; + if (info->pr_assoc_id == SCTP_FUTURE_ASSOC || + info->pr_assoc_id == SCTP_ALL_ASSOC) { + SCTP_PR_SET_POLICY(sp->default_flags, info->pr_policy); + sp->default_timetolive = info->pr_value; } - if (info.pr_assoc_id == SCTP_CURRENT_ASSOC || - info.pr_assoc_id == SCTP_ALL_ASSOC) { + if (info->pr_assoc_id == SCTP_CURRENT_ASSOC || + info->pr_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &sp->ep->asocs, asocs) { - SCTP_PR_SET_POLICY(asoc->default_flags, info.pr_policy); - asoc->default_timetolive = info.pr_value; + SCTP_PR_SET_POLICY(asoc->default_flags, + info->pr_policy); + asoc->default_timetolive = info->pr_value; } } @@ -4140,27 +4004,21 @@ out: } static int sctp_setsockopt_reconfig_supported(struct sock *sk, - char __user *optval, + struct sctp_assoc_value *params, unsigned int optlen) { - struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EINVAL; - if (optlen != sizeof(params)) - goto out; - - if (copy_from_user(¶ms, optval, optlen)) { - retval = -EFAULT; + if (optlen != sizeof(*params)) goto out; - } - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + asoc = sctp_id2assoc(sk, params->assoc_id); + if (!asoc && params->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) goto out; - sctp_sk(sk)->ep->reconf_enable = !!params.assoc_value; + sctp_sk(sk)->ep->reconf_enable = !!params->assoc_value; 
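Editorial note on the PR-SCTP default conversion above: it maps a policy plus value onto each association's default_flags and default_timetolive. A minimal userspace sketch of a time-based policy, assuming lksctp-tools' <netinet/sctp.h>; set_default_prinfo() is an illustrative name:

    #include <string.h>
    #include <netinet/sctp.h>
    #include <sys/socket.h>

    static int set_default_prinfo(int fd, unsigned int ttl_ms)
    {
            struct sctp_default_prinfo pr;

            memset(&pr, 0, sizeof(pr));
            pr.pr_assoc_id = SCTP_FUTURE_ASSOC;
            pr.pr_policy = SCTP_PR_SCTP_TTL;   /* expire after pr_value ms */
            pr.pr_value = ttl_ms;
            return setsockopt(fd, IPPROTO_SCTP, SCTP_DEFAULT_PRINFO,
                              &pr, sizeof(pr));
    }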
retval = 0; @@ -4169,60 +4027,52 @@ out: } static int sctp_setsockopt_enable_strreset(struct sock *sk, - char __user *optval, + struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; - struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EINVAL; - if (optlen != sizeof(params)) + if (optlen != sizeof(*params)) goto out; - if (copy_from_user(¶ms, optval, optlen)) { - retval = -EFAULT; - goto out; - } - - if (params.assoc_value & (~SCTP_ENABLE_STRRESET_MASK)) + if (params->assoc_value & (~SCTP_ENABLE_STRRESET_MASK)) goto out; - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id > SCTP_ALL_ASSOC && + asoc = sctp_id2assoc(sk, params->assoc_id); + if (!asoc && params->assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) goto out; retval = 0; if (asoc) { - asoc->strreset_enable = params.assoc_value; + asoc->strreset_enable = params->assoc_value; goto out; } if (sctp_style(sk, TCP)) - params.assoc_id = SCTP_FUTURE_ASSOC; + params->assoc_id = SCTP_FUTURE_ASSOC; - if (params.assoc_id == SCTP_FUTURE_ASSOC || - params.assoc_id == SCTP_ALL_ASSOC) - ep->strreset_enable = params.assoc_value; + if (params->assoc_id == SCTP_FUTURE_ASSOC || + params->assoc_id == SCTP_ALL_ASSOC) + ep->strreset_enable = params->assoc_value; - if (params.assoc_id == SCTP_CURRENT_ASSOC || - params.assoc_id == SCTP_ALL_ASSOC) + if (params->assoc_id == SCTP_CURRENT_ASSOC || + params->assoc_id == SCTP_ALL_ASSOC) list_for_each_entry(asoc, &ep->asocs, asocs) - asoc->strreset_enable = params.assoc_value; + asoc->strreset_enable = params->assoc_value; out: return retval; } static int sctp_setsockopt_reset_streams(struct sock *sk, - char __user *optval, + struct sctp_reset_streams *params, unsigned int optlen) { - struct sctp_reset_streams *params; struct sctp_association *asoc; - int retval = -EINVAL; if (optlen < sizeof(*params)) return -EINVAL; @@ -4230,116 +4080,82 @@ static int sctp_setsockopt_reset_streams(struct sock *sk, optlen = min_t(unsigned int, optlen, USHRT_MAX + sizeof(__u16) * sizeof(*params)); - params = memdup_user(optval, optlen); - if (IS_ERR(params)) - return PTR_ERR(params); - if (params->srs_number_streams * sizeof(__u16) > optlen - sizeof(*params)) - goto out; + return -EINVAL; asoc = sctp_id2assoc(sk, params->srs_assoc_id); if (!asoc) - goto out; - - retval = sctp_send_reset_streams(asoc, params); + return -EINVAL; -out: - kfree(params); - return retval; + return sctp_send_reset_streams(asoc, params); } -static int sctp_setsockopt_reset_assoc(struct sock *sk, - char __user *optval, +static int sctp_setsockopt_reset_assoc(struct sock *sk, sctp_assoc_t *associd, unsigned int optlen) { struct sctp_association *asoc; - sctp_assoc_t associd; - int retval = -EINVAL; - if (optlen != sizeof(associd)) - goto out; - - if (copy_from_user(&associd, optval, optlen)) { - retval = -EFAULT; - goto out; - } + if (optlen != sizeof(*associd)) + return -EINVAL; - asoc = sctp_id2assoc(sk, associd); + asoc = sctp_id2assoc(sk, *associd); if (!asoc) - goto out; - - retval = sctp_send_reset_assoc(asoc); + return -EINVAL; -out: - return retval; + return sctp_send_reset_assoc(asoc); } static int sctp_setsockopt_add_streams(struct sock *sk, - char __user *optval, + struct sctp_add_streams *params, unsigned int optlen) { struct sctp_association *asoc; - struct sctp_add_streams params; - int retval = -EINVAL; - if (optlen != sizeof(params)) - goto out; - - if (copy_from_user(¶ms, optval, optlen)) { - retval = -EFAULT; - goto out; - } + if (optlen != 
sizeof(*params)) + return -EINVAL; - asoc = sctp_id2assoc(sk, params.sas_assoc_id); + asoc = sctp_id2assoc(sk, params->sas_assoc_id); if (!asoc) - goto out; - - retval = sctp_send_add_streams(asoc, ¶ms); + return -EINVAL; -out: - return retval; + return sctp_send_add_streams(asoc, params); } static int sctp_setsockopt_scheduler(struct sock *sk, - char __user *optval, + struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; - struct sctp_assoc_value params; int retval = 0; - if (optlen < sizeof(params)) + if (optlen < sizeof(*params)) return -EINVAL; - optlen = sizeof(params); - if (copy_from_user(¶ms, optval, optlen)) - return -EFAULT; - - if (params.assoc_value > SCTP_SS_MAX) + if (params->assoc_value > SCTP_SS_MAX) return -EINVAL; - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id > SCTP_ALL_ASSOC && + asoc = sctp_id2assoc(sk, params->assoc_id); + if (!asoc && params->assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) - return sctp_sched_set_sched(asoc, params.assoc_value); + return sctp_sched_set_sched(asoc, params->assoc_value); if (sctp_style(sk, TCP)) - params.assoc_id = SCTP_FUTURE_ASSOC; + params->assoc_id = SCTP_FUTURE_ASSOC; - if (params.assoc_id == SCTP_FUTURE_ASSOC || - params.assoc_id == SCTP_ALL_ASSOC) - sp->default_ss = params.assoc_value; + if (params->assoc_id == SCTP_FUTURE_ASSOC || + params->assoc_id == SCTP_ALL_ASSOC) + sp->default_ss = params->assoc_value; - if (params.assoc_id == SCTP_CURRENT_ASSOC || - params.assoc_id == SCTP_ALL_ASSOC) { + if (params->assoc_id == SCTP_CURRENT_ASSOC || + params->assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &sp->ep->asocs, asocs) { int ret = sctp_sched_set_sched(asoc, - params.assoc_value); + params->assoc_value); if (ret && !retval) retval = ret; @@ -4350,38 +4166,32 @@ static int sctp_setsockopt_scheduler(struct sock *sk, } static int sctp_setsockopt_scheduler_value(struct sock *sk, - char __user *optval, + struct sctp_stream_value *params, unsigned int optlen) { - struct sctp_stream_value params; struct sctp_association *asoc; int retval = -EINVAL; - if (optlen < sizeof(params)) - goto out; - - optlen = sizeof(params); - if (copy_from_user(¶ms, optval, optlen)) { - retval = -EFAULT; + if (optlen < sizeof(*params)) goto out; - } - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id != SCTP_CURRENT_ASSOC && + asoc = sctp_id2assoc(sk, params->assoc_id); + if (!asoc && params->assoc_id != SCTP_CURRENT_ASSOC && sctp_style(sk, UDP)) goto out; if (asoc) { - retval = sctp_sched_set_value(asoc, params.stream_id, - params.stream_value, GFP_KERNEL); + retval = sctp_sched_set_value(asoc, params->stream_id, + params->stream_value, GFP_KERNEL); goto out; } retval = 0; list_for_each_entry(asoc, &sctp_sk(sk)->ep->asocs, asocs) { - int ret = sctp_sched_set_value(asoc, params.stream_id, - params.stream_value, GFP_KERNEL); + int ret = sctp_sched_set_value(asoc, params->stream_id, + params->stream_value, + GFP_KERNEL); if (ret && !retval) /* try to return the 1st error. 
@@ -4391,46 +4201,30 @@ out: } static int sctp_setsockopt_interleaving_supported(struct sock *sk, - char __user *optval, + struct sctp_assoc_value *p, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); - struct sctp_assoc_value params; struct sctp_association *asoc; - int retval = -EINVAL; - if (optlen < sizeof(params)) - goto out; - - optlen = sizeof(params); - if (copy_from_user(&params, optval, optlen)) { - retval = -EFAULT; - goto out; - } + if (optlen < sizeof(*p)) + return -EINVAL; - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && - sctp_style(sk, UDP)) - goto out; + asoc = sctp_id2assoc(sk, p->assoc_id); + if (!asoc && p->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) + return -EINVAL; if (!sock_net(sk)->sctp.intl_enable || !sp->frag_interleave) { - retval = -EPERM; - goto out; + return -EPERM; } - sp->ep->intl_enable = !!params.assoc_value; - - retval = 0; - -out: - return retval; + sp->ep->intl_enable = !!p->assoc_value; + return 0; } -static int sctp_setsockopt_reuse_port(struct sock *sk, char __user *optval, +static int sctp_setsockopt_reuse_port(struct sock *sk, int *val, unsigned int optlen) { - int val; - if (!sctp_style(sk, TCP)) return -EOPNOTSUPP; @@ -4440,10 +4234,7 @@ static int sctp_setsockopt_reuse_port(struct sock *sk, char __user *optval, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) - return -EFAULT; - - sctp_sk(sk)->reuse = !!val; + sctp_sk(sk)->reuse = !!*val; return 0; } @@ -4469,45 +4260,40 @@ static int sctp_assoc_ulpevent_type_set(struct sctp_event *param, return 0; } -static int sctp_setsockopt_event(struct sock *sk, char __user *optval, +static int sctp_setsockopt_event(struct sock *sk, struct sctp_event *param, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; - struct sctp_event param; int retval = 0; - if (optlen < sizeof(param)) + if (optlen < sizeof(*param)) return -EINVAL; - optlen = sizeof(param); - if (copy_from_user(&param, optval, optlen)) - return -EFAULT; - - if (param.se_type < SCTP_SN_TYPE_BASE || - param.se_type > SCTP_SN_TYPE_MAX) + if (param->se_type < SCTP_SN_TYPE_BASE || + param->se_type > SCTP_SN_TYPE_MAX) return -EINVAL; - asoc = sctp_id2assoc(sk, param.se_assoc_id); - if (!asoc && param.se_assoc_id > SCTP_ALL_ASSOC && + asoc = sctp_id2assoc(sk, param->se_assoc_id); + if (!asoc && param->se_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) - return sctp_assoc_ulpevent_type_set(&param, asoc); + return sctp_assoc_ulpevent_type_set(param, asoc); if (sctp_style(sk, TCP)) - param.se_assoc_id = SCTP_FUTURE_ASSOC; + param->se_assoc_id = SCTP_FUTURE_ASSOC; - if (param.se_assoc_id == SCTP_FUTURE_ASSOC || - param.se_assoc_id == SCTP_ALL_ASSOC) + if (param->se_assoc_id == SCTP_FUTURE_ASSOC || + param->se_assoc_id == SCTP_ALL_ASSOC) sctp_ulpevent_type_set(&sp->subscribe, - param.se_type, param.se_on); + param->se_type, param->se_on); - if (param.se_assoc_id == SCTP_CURRENT_ASSOC || - param.se_assoc_id == SCTP_ALL_ASSOC) { + if (param->se_assoc_id == SCTP_CURRENT_ASSOC || + param->se_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &sp->ep->asocs, asocs) { - int ret = sctp_assoc_ulpevent_type_set(&param, asoc); + int ret = sctp_assoc_ulpevent_type_set(param, asoc); if (ret && !retval) retval = ret; @@ -4518,29 +4304,23 @@ static int sctp_setsockopt_event(struct sock *sk, char __user *optval, }
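These helpers can take plain kernel pointers because the entry points in this series move from char __user *optval to sockptr_t (visible in the sctp_setsockopt(), smc, tls, vsock and tipc hunks elsewhere in this diff): a small wrapper that carries either a user or a kernel pointer, letting one setsockopt implementation serve both the syscall path and in-kernel callers such as BPF-rewritten option buffers. Roughly, and simplified against the real include/linux/sockptr.h:

typedef struct {
        union {
                void            *kernel;
                void __user     *user;
        };
        bool            is_kernel : 1;
} sockptr_t;

/* Simplified; the in-tree helpers also cover offsets, dup and strings. */
static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size)
{
        if (src.is_kernel) {
                memcpy(dst, src.kernel, size);
                return 0;
        }
        return copy_from_user(dst, src.user, size) ? -EFAULT : 0;
}

copy_from_sockptr(), memdup_sockptr() and sockptr_is_null() then replace copy_from_user(), memdup_user() and NULL checks one-for-one in the converted call sites.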
static int sctp_setsockopt_asconf_supported(struct sock *sk, - char __user *optval, + struct sctp_assoc_value *params, unsigned int optlen) { - struct sctp_assoc_value params; struct sctp_association *asoc; struct sctp_endpoint *ep; int retval = -EINVAL; - if (optlen != sizeof(params)) + if (optlen != sizeof(*params)) goto out; - if (copy_from_user(&params, optval, optlen)) { - retval = -EFAULT; - goto out; - } - - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + asoc = sctp_id2assoc(sk, params->assoc_id); + if (!asoc && params->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) goto out; ep = sctp_sk(sk)->ep; - ep->asconf_enable = !!params.assoc_value; + ep->asconf_enable = !!params->assoc_value; if (ep->asconf_enable && ep->auth_enable) { sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF); @@ -4554,29 +4334,23 @@ out: } static int sctp_setsockopt_auth_supported(struct sock *sk, - char __user *optval, + struct sctp_assoc_value *params, unsigned int optlen) { - struct sctp_assoc_value params; struct sctp_association *asoc; struct sctp_endpoint *ep; int retval = -EINVAL; - if (optlen != sizeof(params)) - goto out; - - if (copy_from_user(&params, optval, optlen)) { - retval = -EFAULT; + if (optlen != sizeof(*params)) goto out; - } - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + asoc = sctp_id2assoc(sk, params->assoc_id); + if (!asoc && params->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) goto out; ep = sctp_sk(sk)->ep; - if (params.assoc_value) { + if (params->assoc_value) { retval = sctp_auth_init(ep, GFP_KERNEL); if (retval) goto out; @@ -4586,7 +4360,7 @@ static int sctp_setsockopt_auth_supported(struct sock *sk, } } - ep->auth_enable = !!params.assoc_value; + ep->auth_enable = !!params->assoc_value; retval = 0; out: @@ -4594,27 +4368,21 @@ out: } static int sctp_setsockopt_ecn_supported(struct sock *sk, - char __user *optval, + struct sctp_assoc_value *params, unsigned int optlen) { - struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EINVAL; - if (optlen != sizeof(params)) - goto out; - - if (copy_from_user(&params, optval, optlen)) { - retval = -EFAULT; + if (optlen != sizeof(*params)) goto out; - } - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + asoc = sctp_id2assoc(sk, params->assoc_id); + if (!asoc && params->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) goto out; - sctp_sk(sk)->ep->ecn_enable = !!params.assoc_value; + sctp_sk(sk)->ep->ecn_enable = !!params->assoc_value; retval = 0; out: @@ -4622,33 +4390,27 @@ out: } static int sctp_setsockopt_pf_expose(struct sock *sk, - char __user *optval, + struct sctp_assoc_value *params, unsigned int optlen) { - struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EINVAL; - if (optlen != sizeof(params)) + if (optlen != sizeof(*params)) goto out; - if (copy_from_user(&params, optval, optlen)) { - retval = -EFAULT; - goto out; - } - - if (params.assoc_value > SCTP_PF_EXPOSE_MAX) + if (params->assoc_value > SCTP_PF_EXPOSE_MAX) goto out; - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + asoc = sctp_id2assoc(sk, params->assoc_id); + if (!asoc && params->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) goto out; if (asoc) - asoc->pf_expose = params.assoc_value; + asoc->pf_expose = params->assoc_value; else - sctp_sk(sk)->pf_expose = params.assoc_value; + sctp_sk(sk)->pf_expose = params->assoc_value; retval = 0; out:
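The sctp_setsockopt() hunk that follows is where the removed per-helper copies went: options for levels other than SOL_SCTP are forwarded to the address family untouched, and for SOL_SCTP the option buffer is pulled out of the sockptr exactly once with memdup_sockptr(); every case of the switch then passes the same kopt kernel buffer to its helper, and a single kfree() after release_sock() replaces the old out_nounlock unwinding. Condensed to two cases (the real switch dispatches several dozen options; the function name here is illustrative):

static int sctp_setsockopt_sketch(struct sock *sk, int level, int optname,
                                  sockptr_t optval, unsigned int optlen)
{
        void *kopt = NULL;
        int retval;

        if (level != SOL_SCTP)          /* delegate non-SCTP levels as-is */
                return sctp_sk(sk)->pf->af->setsockopt(sk, level, optname,
                                                       optval, optlen);

        if (optlen > 0) {
                kopt = memdup_sockptr(optval, optlen);  /* copy exactly once */
                if (IS_ERR(kopt))
                        return PTR_ERR(kopt);
        }

        lock_sock(sk);
        switch (optname) {
        case SCTP_NODELAY:
                retval = sctp_setsockopt_nodelay(sk, kopt, optlen);
                break;
        case SCTP_EVENT:
                retval = sctp_setsockopt_event(sk, kopt, optlen);
                break;
        default:
                retval = -ENOPROTOOPT;
                break;
        }
        release_sock(sk);
        kfree(kopt);                    /* one cleanup point for all cases */
        return retval;
}

Copying once up front is also what lets the BPF_CGROUP_RUN_PROG_SETSOCKOPT hook in __sys_setsockopt() hand protocols a kernel buffer via KERNEL_SOCKPTR() instead of the removed get_fs()/set_fs() dance (see the net/socket.c hunk below).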
@@ -4675,8 +4437,9 @@ out: * optlen - the size of the buffer. */ static int sctp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { + void *kopt = NULL; int retval = 0; pr_debug("%s: sk:%p, optname:%d\n", __func__, sk, optname); @@ -4689,8 +4452,14 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, */ if (level != SOL_SCTP) { struct sctp_af *af = sctp_sk(sk)->pf->af; - retval = af->setsockopt(sk, level, optname, optval, optlen); - goto out_nounlock; + + return af->setsockopt(sk, level, optname, optval, optlen); + } + + if (optlen > 0) { + kopt = memdup_sockptr(optval, optlen); + if (IS_ERR(kopt)) + return PTR_ERR(kopt); } lock_sock(sk); @@ -4698,179 +4467,174 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, switch (optname) { case SCTP_SOCKOPT_BINDX_ADD: /* 'optlen' is the size of the addresses buffer. */ - retval = sctp_setsockopt_bindx(sk, (struct sockaddr __user *)optval, - optlen, SCTP_BINDX_ADD_ADDR); + retval = sctp_setsockopt_bindx(sk, kopt, optlen, + SCTP_BINDX_ADD_ADDR); break; case SCTP_SOCKOPT_BINDX_REM: /* 'optlen' is the size of the addresses buffer. */ - retval = sctp_setsockopt_bindx(sk, (struct sockaddr __user *)optval, - optlen, SCTP_BINDX_REM_ADDR); + retval = sctp_setsockopt_bindx(sk, kopt, optlen, + SCTP_BINDX_REM_ADDR); break; case SCTP_SOCKOPT_CONNECTX_OLD: /* 'optlen' is the size of the addresses buffer. */ - retval = sctp_setsockopt_connectx_old(sk, - (struct sockaddr __user *)optval, - optlen); + retval = sctp_setsockopt_connectx_old(sk, kopt, optlen); break; case SCTP_SOCKOPT_CONNECTX: /* 'optlen' is the size of the addresses buffer. */ - retval = sctp_setsockopt_connectx(sk, - (struct sockaddr __user *)optval, - optlen); + retval = sctp_setsockopt_connectx(sk, kopt, optlen); break; case SCTP_DISABLE_FRAGMENTS: - retval = sctp_setsockopt_disable_fragments(sk, optval, optlen); + retval = sctp_setsockopt_disable_fragments(sk, kopt, optlen); break; case SCTP_EVENTS: - retval = sctp_setsockopt_events(sk, optval, optlen); + retval = sctp_setsockopt_events(sk, kopt, optlen); break; case SCTP_AUTOCLOSE: - retval = sctp_setsockopt_autoclose(sk, optval, optlen); + retval = sctp_setsockopt_autoclose(sk, kopt, optlen); break; case SCTP_PEER_ADDR_PARAMS: - retval = sctp_setsockopt_peer_addr_params(sk, optval, optlen); + retval = sctp_setsockopt_peer_addr_params(sk, kopt, optlen); break; case SCTP_DELAYED_SACK: - retval = sctp_setsockopt_delayed_ack(sk, optval, optlen); + retval = sctp_setsockopt_delayed_ack(sk, kopt, optlen); break; case SCTP_PARTIAL_DELIVERY_POINT: - retval = sctp_setsockopt_partial_delivery_point(sk, optval, optlen); + retval = sctp_setsockopt_partial_delivery_point(sk, kopt, optlen); break; case SCTP_INITMSG: - retval = sctp_setsockopt_initmsg(sk, optval, optlen); + retval = sctp_setsockopt_initmsg(sk, kopt, optlen); break; case SCTP_DEFAULT_SEND_PARAM: - retval = sctp_setsockopt_default_send_param(sk, optval, - optlen); + retval = sctp_setsockopt_default_send_param(sk, kopt, optlen); break; case SCTP_DEFAULT_SNDINFO: - retval = sctp_setsockopt_default_sndinfo(sk, optval, optlen); + retval = sctp_setsockopt_default_sndinfo(sk, kopt, optlen); break; case SCTP_PRIMARY_ADDR: - retval = sctp_setsockopt_primary_addr(sk, optval, optlen); + retval = sctp_setsockopt_primary_addr(sk, kopt, optlen); break; case SCTP_SET_PEER_PRIMARY_ADDR: - retval = sctp_setsockopt_peer_primary_addr(sk, optval, optlen); + retval = sctp_setsockopt_peer_primary_addr(sk, kopt, optlen); break; case
SCTP_NODELAY: - retval = sctp_setsockopt_nodelay(sk, optval, optlen); + retval = sctp_setsockopt_nodelay(sk, kopt, optlen); break; case SCTP_RTOINFO: - retval = sctp_setsockopt_rtoinfo(sk, optval, optlen); + retval = sctp_setsockopt_rtoinfo(sk, kopt, optlen); break; case SCTP_ASSOCINFO: - retval = sctp_setsockopt_associnfo(sk, optval, optlen); + retval = sctp_setsockopt_associnfo(sk, kopt, optlen); break; case SCTP_I_WANT_MAPPED_V4_ADDR: - retval = sctp_setsockopt_mappedv4(sk, optval, optlen); + retval = sctp_setsockopt_mappedv4(sk, kopt, optlen); break; case SCTP_MAXSEG: - retval = sctp_setsockopt_maxseg(sk, optval, optlen); + retval = sctp_setsockopt_maxseg(sk, kopt, optlen); break; case SCTP_ADAPTATION_LAYER: - retval = sctp_setsockopt_adaptation_layer(sk, optval, optlen); + retval = sctp_setsockopt_adaptation_layer(sk, kopt, optlen); break; case SCTP_CONTEXT: - retval = sctp_setsockopt_context(sk, optval, optlen); + retval = sctp_setsockopt_context(sk, kopt, optlen); break; case SCTP_FRAGMENT_INTERLEAVE: - retval = sctp_setsockopt_fragment_interleave(sk, optval, optlen); + retval = sctp_setsockopt_fragment_interleave(sk, kopt, optlen); break; case SCTP_MAX_BURST: - retval = sctp_setsockopt_maxburst(sk, optval, optlen); + retval = sctp_setsockopt_maxburst(sk, kopt, optlen); break; case SCTP_AUTH_CHUNK: - retval = sctp_setsockopt_auth_chunk(sk, optval, optlen); + retval = sctp_setsockopt_auth_chunk(sk, kopt, optlen); break; case SCTP_HMAC_IDENT: - retval = sctp_setsockopt_hmac_ident(sk, optval, optlen); + retval = sctp_setsockopt_hmac_ident(sk, kopt, optlen); break; case SCTP_AUTH_KEY: - retval = sctp_setsockopt_auth_key(sk, optval, optlen); + retval = sctp_setsockopt_auth_key(sk, kopt, optlen); break; case SCTP_AUTH_ACTIVE_KEY: - retval = sctp_setsockopt_active_key(sk, optval, optlen); + retval = sctp_setsockopt_active_key(sk, kopt, optlen); break; case SCTP_AUTH_DELETE_KEY: - retval = sctp_setsockopt_del_key(sk, optval, optlen); + retval = sctp_setsockopt_del_key(sk, kopt, optlen); break; case SCTP_AUTH_DEACTIVATE_KEY: - retval = sctp_setsockopt_deactivate_key(sk, optval, optlen); + retval = sctp_setsockopt_deactivate_key(sk, kopt, optlen); break; case SCTP_AUTO_ASCONF: - retval = sctp_setsockopt_auto_asconf(sk, optval, optlen); + retval = sctp_setsockopt_auto_asconf(sk, kopt, optlen); break; case SCTP_PEER_ADDR_THLDS: - retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen, + retval = sctp_setsockopt_paddr_thresholds(sk, kopt, optlen, false); break; case SCTP_PEER_ADDR_THLDS_V2: - retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen, + retval = sctp_setsockopt_paddr_thresholds(sk, kopt, optlen, true); break; case SCTP_RECVRCVINFO: - retval = sctp_setsockopt_recvrcvinfo(sk, optval, optlen); + retval = sctp_setsockopt_recvrcvinfo(sk, kopt, optlen); break; case SCTP_RECVNXTINFO: - retval = sctp_setsockopt_recvnxtinfo(sk, optval, optlen); + retval = sctp_setsockopt_recvnxtinfo(sk, kopt, optlen); break; case SCTP_PR_SUPPORTED: - retval = sctp_setsockopt_pr_supported(sk, optval, optlen); + retval = sctp_setsockopt_pr_supported(sk, kopt, optlen); break; case SCTP_DEFAULT_PRINFO: - retval = sctp_setsockopt_default_prinfo(sk, optval, optlen); + retval = sctp_setsockopt_default_prinfo(sk, kopt, optlen); break; case SCTP_RECONFIG_SUPPORTED: - retval = sctp_setsockopt_reconfig_supported(sk, optval, optlen); + retval = sctp_setsockopt_reconfig_supported(sk, kopt, optlen); break; case SCTP_ENABLE_STREAM_RESET: - retval = sctp_setsockopt_enable_strreset(sk, optval, optlen); + retval = 
sctp_setsockopt_enable_strreset(sk, kopt, optlen); break; case SCTP_RESET_STREAMS: - retval = sctp_setsockopt_reset_streams(sk, optval, optlen); + retval = sctp_setsockopt_reset_streams(sk, kopt, optlen); break; case SCTP_RESET_ASSOC: - retval = sctp_setsockopt_reset_assoc(sk, optval, optlen); + retval = sctp_setsockopt_reset_assoc(sk, kopt, optlen); break; case SCTP_ADD_STREAMS: - retval = sctp_setsockopt_add_streams(sk, optval, optlen); + retval = sctp_setsockopt_add_streams(sk, kopt, optlen); break; case SCTP_STREAM_SCHEDULER: - retval = sctp_setsockopt_scheduler(sk, optval, optlen); + retval = sctp_setsockopt_scheduler(sk, kopt, optlen); break; case SCTP_STREAM_SCHEDULER_VALUE: - retval = sctp_setsockopt_scheduler_value(sk, optval, optlen); + retval = sctp_setsockopt_scheduler_value(sk, kopt, optlen); break; case SCTP_INTERLEAVING_SUPPORTED: - retval = sctp_setsockopt_interleaving_supported(sk, optval, + retval = sctp_setsockopt_interleaving_supported(sk, kopt, optlen); break; case SCTP_REUSE_PORT: - retval = sctp_setsockopt_reuse_port(sk, optval, optlen); + retval = sctp_setsockopt_reuse_port(sk, kopt, optlen); break; case SCTP_EVENT: - retval = sctp_setsockopt_event(sk, optval, optlen); + retval = sctp_setsockopt_event(sk, kopt, optlen); break; case SCTP_ASCONF_SUPPORTED: - retval = sctp_setsockopt_asconf_supported(sk, optval, optlen); + retval = sctp_setsockopt_asconf_supported(sk, kopt, optlen); break; case SCTP_AUTH_SUPPORTED: - retval = sctp_setsockopt_auth_supported(sk, optval, optlen); + retval = sctp_setsockopt_auth_supported(sk, kopt, optlen); break; case SCTP_ECN_SUPPORTED: - retval = sctp_setsockopt_ecn_supported(sk, optval, optlen); + retval = sctp_setsockopt_ecn_supported(sk, kopt, optlen); break; case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE: - retval = sctp_setsockopt_pf_expose(sk, optval, optlen); + retval = sctp_setsockopt_pf_expose(sk, kopt, optlen); break; default: retval = -ENOPROTOOPT; @@ -4878,8 +4642,7 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, } release_sock(sk); - -out_nounlock: + kfree(kopt); return retval; } diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 1163d51196da..e7649bbc2b87 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -719,8 +719,11 @@ static int smc_connect_ism(struct smc_sock *smc, } /* Create send and receive buffers */ - if (smc_buf_create(smc, true)) - return smc_connect_abort(smc, SMC_CLC_DECL_MEM, + rc = smc_buf_create(smc, true); + if (rc) + return smc_connect_abort(smc, (rc == -ENOSPC) ? + SMC_CLC_DECL_MAX_DMB : + SMC_CLC_DECL_MEM, ini->cln_first_contact); smc_conn_save_peer_info(smc, aclc); @@ -1200,12 +1203,14 @@ static int smc_listen_ism_init(struct smc_sock *new_smc, } /* Create send and receive buffers */ - if (smc_buf_create(new_smc, true)) { + rc = smc_buf_create(new_smc, true); + if (rc) { if (ini->cln_first_contact == SMC_FIRST_CONTACT) smc_lgr_cleanup_early(&new_smc->conn); else smc_conn_free(&new_smc->conn); - return SMC_CLC_DECL_MEM; + return (rc == -ENOSPC) ? 
SMC_CLC_DECL_MAX_DMB : + SMC_CLC_DECL_MEM; } return 0; @@ -1735,7 +1740,7 @@ out: } static int smc_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -1746,8 +1751,11 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, /* generic setsockopts reaching us here always apply to the * CLC socket */ - rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname, - optval, optlen); + if (unlikely(!smc->clcsock->ops->setsockopt)) + rc = -EOPNOTSUPP; + else + rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname, + optval, optlen); if (smc->clcsock->sk->sk_err) { sk->sk_err = smc->clcsock->sk->sk_err; sk->sk_error_report(sk); @@ -1755,7 +1763,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; lock_sock(sk); @@ -1812,6 +1820,8 @@ static int smc_getsockopt(struct socket *sock, int level, int optname, smc = smc_sk(sock->sk); /* socket options apply to the CLC socket */ + if (unlikely(!smc->clcsock->ops->getsockopt)) + return -EOPNOTSUPP; return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, optval, optlen); } diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 76c2b150d040..cf7b45306f4e 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -48,6 +48,7 @@ #define SMC_CLC_DECL_NOACTLINK 0x030a0000 /* no active smc-r link in lgr */ #define SMC_CLC_DECL_NOSRVLINK 0x030b0000 /* SMC-R link from srv not found */ #define SMC_CLC_DECL_VERSMISMAT 0x030c0000 /* SMC version mismatch */ +#define SMC_CLC_DECL_MAX_DMB 0x030d0000 /* SMC-D DMB limit exceeded */ #define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ #define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */ #define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index f82a2e599917..b42fa3b00d00 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1614,7 +1614,7 @@ static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, rc = smc_ism_register_dmb(lgr, bufsize, buf_desc); if (rc) { kfree(buf_desc); - return ERR_PTR(-EAGAIN); + return (rc == -ENOMEM) ? ERR_PTR(-EAGAIN) : ERR_PTR(rc); } buf_desc->pages = virt_to_page(buf_desc->cpu_addr); /* CDC header stored in buf. So, pretend it was smaller */ @@ -1688,7 +1688,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) } if (IS_ERR(buf_desc)) - return -ENOMEM; + return PTR_ERR(buf_desc); if (!is_smcd) { if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) { diff --git a/net/socket.c b/net/socket.c index 976426d03f09..aff52e81653c 100644 --- a/net/socket.c +++ b/net/socket.c @@ -586,15 +586,6 @@ struct socket *sock_alloc(void) } EXPORT_SYMBOL(sock_alloc); -/** - * sock_release - close a socket - * @sock: socket to close - * - * The socket is released from the protocol stack if it has a release - * callback, and the inode is then released if the socket is bound to - * an inode not a file. 
- */ - static void __sock_release(struct socket *sock, struct inode *inode) { if (sock->ops) { @@ -620,6 +611,14 @@ static void __sock_release(struct socket *sock, struct inode *inode) sock->file = NULL; } +/** + * sock_release - close a socket + * @sock: socket to close + * + * The socket is released from the protocol stack if it has a release + * callback, and the inode is then released if the socket is bound to + * an inode not a file. + */ void sock_release(struct socket *sock) { __sock_release(sock, NULL); @@ -2080,15 +2079,25 @@ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size, return __sys_recvfrom(fd, ubuf, size, flags, NULL, NULL); } +static bool sock_use_custom_sol_socket(const struct socket *sock) +{ + const struct sock *sk = sock->sk; + + /* Use sock->ops->setsockopt() for MPTCP */ + return IS_ENABLED(CONFIG_MPTCP) && + sk->sk_protocol == IPPROTO_MPTCP && + sk->sk_type == SOCK_STREAM && + (sk->sk_family == AF_INET || sk->sk_family == AF_INET6); +} + /* * Set a socket option. Because we don't know the option lengths we have * to pass the user mode parameter for the protocols to sort out. */ - -static int __sys_setsockopt(int fd, int level, int optname, - char __user *optval, int optlen) +int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval, + int optlen) { - mm_segment_t oldfs = get_fs(); + sockptr_t optval; char *kernel_optval = NULL; int err, fput_needed; struct socket *sock; @@ -2096,44 +2105,41 @@ static int __sys_setsockopt(int fd, int level, int optname, if (optlen < 0) return -EINVAL; - sock = sockfd_lookup_light(fd, &err, &fput_needed); - if (sock != NULL) { - err = security_socket_setsockopt(sock, level, optname); - if (err) - goto out_put; + err = init_user_sockptr(&optval, user_optval, optlen); + if (err) + return err; - err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, - &optname, optval, &optlen, - &kernel_optval); + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (!sock) + return err; - if (err < 0) { - goto out_put; - } else if (err > 0) { - err = 0; - goto out_put; - } + err = security_socket_setsockopt(sock, level, optname); + if (err) + goto out_put; - if (kernel_optval) { - set_fs(KERNEL_DS); - optval = (char __user __force *)kernel_optval; - } + if (!in_compat_syscall()) + err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, &optname, + user_optval, &optlen, + &kernel_optval); + if (err < 0) + goto out_put; + if (err > 0) { + err = 0; + goto out_put; + } - if (level == SOL_SOCKET) - err = - sock_setsockopt(sock, level, optname, optval, + if (kernel_optval) + optval = KERNEL_SOCKPTR(kernel_optval); + if (level == SOL_SOCKET && !sock_use_custom_sol_socket(sock)) + err = sock_setsockopt(sock, level, optname, optval, optlen); + else if (unlikely(!sock->ops->setsockopt)) + err = -EOPNOTSUPP; + else + err = sock->ops->setsockopt(sock, level, optname, optval, optlen); - else - err = - sock->ops->setsockopt(sock, level, optname, optval, - optlen); - - if (kernel_optval) { - set_fs(oldfs); - kfree(kernel_optval); - } + kfree(kernel_optval); out_put: - fput_light(sock->file, fput_needed); - } + fput_light(sock->file, fput_needed); return err; } @@ -2147,37 +2153,38 @@ SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, * Get a socket option. Because we don't know the option lengths we have * to pass a user mode parameter for the protocols to sort out. 
*/ - -static int __sys_getsockopt(int fd, int level, int optname, - char __user *optval, int __user *optlen) +int __sys_getsockopt(int fd, int level, int optname, char __user *optval, + int __user *optlen) { int err, fput_needed; struct socket *sock; int max_optlen; sock = sockfd_lookup_light(fd, &err, &fput_needed); - if (sock != NULL) { - err = security_socket_getsockopt(sock, level, optname); - if (err) - goto out_put; + if (!sock) + return err; + err = security_socket_getsockopt(sock, level, optname); + if (err) + goto out_put; + + if (!in_compat_syscall()) max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen); - if (level == SOL_SOCKET) - err = - sock_getsockopt(sock, level, optname, optval, + if (level == SOL_SOCKET) + err = sock_getsockopt(sock, level, optname, optval, optlen); + else if (unlikely(!sock->ops->getsockopt)) + err = -EOPNOTSUPP; + else + err = sock->ops->getsockopt(sock, level, optname, optval, optlen); - else - err = - sock->ops->getsockopt(sock, level, optname, optval, - optlen); + if (!in_compat_syscall()) err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname, - optval, optlen, - max_optlen, err); + optval, optlen, max_optlen, + err); out_put: - fput_light(sock->file, fput_needed); - } + fput_light(sock->file, fput_needed); return err; } diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index f25604d68337..865f3e037425 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -304,8 +304,8 @@ static int switchdev_port_obj_add_defer(struct net_device *dev, * switchdev_port_obj_add - Add port object * * @dev: port device - * @id: object ID * @obj: object to add + * @extack: netlink extended ack * * Use a 2-phase prepare-commit transaction model to ensure * system is not left in a partially updated state due to @@ -357,7 +357,6 @@ static int switchdev_port_obj_del_defer(struct net_device *dev, * switchdev_port_obj_del - Delete port object * * @dev: port device - * @id: object ID * @obj: object to delete * * rtnl_lock must be held and must not be in atomic section, diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 383f87bc1061..940d176e0e87 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -250,8 +250,8 @@ static void tipc_bcast_select_xmit_method(struct net *net, int dests, * Consumes the buffer chain. 
* Returns 0 if success, otherwise errno: -EHOSTUNREACH,-EMSGSIZE */ -static int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts, - u16 *cong_link_cnt) +int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts, + u16 *cong_link_cnt) { struct tipc_link *l = tipc_bc_sndlink(net); struct sk_buff_head xmitq; @@ -752,7 +752,7 @@ void tipc_nlist_purge(struct tipc_nlist *nl) nl->local = false; } -u32 tipc_bcast_get_broadcast_mode(struct net *net) +u32 tipc_bcast_get_mode(struct net *net) { struct tipc_bc_base *bb = tipc_bc_base(net); diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index 4240c95188b1..2d9352dc7b0e 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -90,6 +90,8 @@ void tipc_bcast_toggle_rcast(struct net *net, bool supp); int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, struct tipc_mc_method *method, struct tipc_nlist *dests, u16 *cong_link_cnt); +int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts, + u16 *cong_link_cnt); int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb); void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, struct tipc_msg *hdr); @@ -101,7 +103,7 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg, int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]); int tipc_bclink_reset_stats(struct net *net, struct tipc_link *l); -u32 tipc_bcast_get_broadcast_mode(struct net *net); +u32 tipc_bcast_get_mode(struct net *net); u32 tipc_bcast_get_broadcast_ratio(struct net *net); void tipc_mcast_filter_msg(struct net *net, struct sk_buff_head *defq, diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index e366ec9a7e4d..808b147df7d5 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -595,7 +595,7 @@ void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, /** * tipc_l2_rcv_msg - handle incoming TIPC message from an interface - * @buf: the received packet + * @skb: the received message * @dev: the net device that the packet was received on * @pt: the packet_type structure which was used to register this handler * @orig_dev: the original receive net device in case the device is a bond diff --git a/net/tipc/discover.c b/net/tipc/discover.c index bfe43da127c0..d4ecacddb40c 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -74,7 +74,7 @@ struct tipc_discoverer { /** * tipc_disc_init_msg - initialize a link setup message * @net: the applicable net namespace - * @type: message type (request or response) + * @mtyp: message type (request or response) * @b: ptr to bearer issuing message */ static void tipc_disc_init_msg(struct net *net, struct sk_buff *skb, @@ -339,7 +339,7 @@ exit: * @net: the applicable net namespace * @b: ptr to bearer issuing requests * @dest: destination address for request messages - * @dest_domain: network domain to which links can be established + * @skb: pointer to created frame * * Returns 0 if successful, otherwise -errno. 
*/ @@ -393,7 +393,6 @@ void tipc_disc_delete(struct tipc_discoverer *d) * tipc_disc_reset - reset object to send periodic link setup requests * @net: the applicable net namespace * @b: ptr to bearer issuing requests - * @dest_domain: network domain to which links can be established */ void tipc_disc_reset(struct net *net, struct tipc_bearer *b) { diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c index 8b0bb600602d..c68019697cfe 100644 --- a/net/tipc/eth_media.c +++ b/net/tipc/eth_media.c @@ -62,12 +62,10 @@ static int tipc_eth_raw2addr(struct tipc_bearer *b, struct tipc_media_addr *addr, char *msg) { - char bcast_mac[ETH_ALEN] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; - memset(addr, 0, sizeof(*addr)); ether_addr_copy(addr->value, msg); addr->media_id = TIPC_MEDIA_TYPE_ETH; - addr->broadcast = !memcmp(addr->value, bcast_mac, ETH_ALEN); + addr->broadcast = is_broadcast_ether_addr(addr->value); return 0; } diff --git a/net/tipc/link.c b/net/tipc/link.c index d40f8e5b7683..107578122973 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -445,7 +445,7 @@ u32 tipc_link_state(struct tipc_link *l) /** * tipc_link_create - create a new link - * @n: pointer to associated node + * @net: pointer to associated network namespace * @if_name: associated interface name * @bearer_id: id (index) of associated bearer * @tolerance: link tolerance to be used by link @@ -530,7 +530,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, /** * tipc_link_bc_create - create new link to be used for broadcast - * @n: pointer to associated node + * @net: pointer to associated network namespace * @mtu: mtu to be used initially if no peers * @window: send window to be used * @inputq: queue to put messages ready for delivery @@ -989,7 +989,7 @@ void tipc_link_reset(struct tipc_link *l) /** * tipc_link_xmit(): enqueue buffer list according to queue situation - * @link: link to use + * @l: link to use * @list: chain of buffers containing message * @xmitq: returned list of packets to be sent by caller * @@ -1396,12 +1396,12 @@ u16 tipc_get_gap_ack_blks(struct tipc_gap_ack_blks **ga, struct tipc_link *l, p = (struct tipc_gap_ack_blks *)msg_data(hdr); sz = ntohs(p->len); /* Sanity check */ - if (sz == tipc_gap_ack_blks_sz(p->ugack_cnt + p->bgack_cnt)) { + if (sz == struct_size(p, gacks, p->ugack_cnt + p->bgack_cnt)) { /* Good, check if the desired type exists */ if ((uc && p->ugack_cnt) || (!uc && p->bgack_cnt)) goto ok; /* Backward compatible: peer might not support bc, but uc? 
*/ - } else if (uc && sz == tipc_gap_ack_blks_sz(p->ugack_cnt)) { + } else if (uc && sz == struct_size(p, gacks, p->ugack_cnt)) { if (p->ugack_cnt) { p->bgack_cnt = 0; goto ok; @@ -1483,7 +1483,7 @@ static u16 tipc_build_gap_ack_blks(struct tipc_link *l, struct tipc_msg *hdr) __tipc_build_gap_ack_blks(ga, l, ga->bgack_cnt) : 0; /* Total len */ - len = tipc_gap_ack_blks_sz(ga->bgack_cnt + ga->ugack_cnt); + len = struct_size(ga, gacks, ga->bgack_cnt + ga->ugack_cnt); ga->len = htons(len); return len; } @@ -1532,7 +1532,7 @@ static int tipc_link_advance_transmq(struct tipc_link *l, struct tipc_link *r, gacks = &ga->gacks[ga->bgack_cnt]; } else if (ga) { /* Copy the Gap ACKs, bc part, for later renewal if needed */ - this_ga = kmemdup(ga, tipc_gap_ack_blks_sz(ga->bgack_cnt), + this_ga = kmemdup(ga, struct_size(ga, gacks, ga->bgack_cnt), GFP_ATOMIC); if (likely(this_ga)) { this_ga->start_index = 0; @@ -2755,7 +2755,7 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg, void *hdr; struct nlattr *attrs; struct nlattr *prop; - u32 bc_mode = tipc_bcast_get_broadcast_mode(net); + u32 bc_mode = tipc_bcast_get_mode(net); u32 bc_ratio = tipc_bcast_get_broadcast_ratio(net); if (!bcl) diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 01b64869a173..848fae674532 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -202,7 +202,7 @@ err: /** * tipc_msg_append(): Append data to tail of an existing buffer queue - * @hdr: header to be used + * @_hdr: header to be used * @m: the data to be appended * @mss: max allowable size of buffer * @dlen: size of data to be appended diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 58660d56bc83..1016e96db5c4 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -189,11 +189,9 @@ struct tipc_gap_ack_blks { struct tipc_gap_ack gacks[]; }; -#define tipc_gap_ack_blks_sz(n) (sizeof(struct tipc_gap_ack_blks) + \ - sizeof(struct tipc_gap_ack) * (n)) - #define MAX_GAP_ACK_BLKS 128 -#define MAX_GAP_ACK_BLKS_SZ tipc_gap_ack_blks_sz(MAX_GAP_ACK_BLKS) +#define MAX_GAP_ACK_BLKS_SZ (sizeof(struct tipc_gap_ack_blks) + \ + sizeof(struct tipc_gap_ack) * MAX_GAP_ACK_BLKS) static inline struct tipc_msg *buf_msg(struct sk_buff *skb) { @@ -438,6 +436,36 @@ static inline void msg_set_errcode(struct tipc_msg *m, u32 err) msg_set_bits(m, 1, 25, 0xf, err); } +static inline void msg_set_bulk(struct tipc_msg *m) +{ + msg_set_bits(m, 1, 28, 0x1, 1); +} + +static inline u32 msg_is_bulk(struct tipc_msg *m) +{ + return msg_bits(m, 1, 28, 0x1); +} + +static inline void msg_set_last_bulk(struct tipc_msg *m) +{ + msg_set_bits(m, 1, 27, 0x1, 1); +} + +static inline u32 msg_is_last_bulk(struct tipc_msg *m) +{ + return msg_bits(m, 1, 27, 0x1); +} + +static inline void msg_set_non_legacy(struct tipc_msg *m) +{ + msg_set_bits(m, 1, 26, 0x1, 1); +} + +static inline u32 msg_is_legacy(struct tipc_msg *m) +{ + return !msg_bits(m, 1, 26, 0x1); +} + static inline u32 msg_reroute_cnt(struct tipc_msg *m) { return msg_bits(m, 1, 21, 0xf); @@ -567,6 +595,16 @@ static inline void msg_set_origport(struct tipc_msg *m, u32 p) msg_set_word(m, 4, p); } +static inline u16 msg_named_seqno(struct tipc_msg *m) +{ + return msg_bits(m, 4, 0, 0xffff); +} + +static inline void msg_set_named_seqno(struct tipc_msg *m, u16 n) +{ + msg_set_bits(m, 4, 0, 0xffff, n); +} + static inline u32 msg_destport(struct tipc_msg *m) { return msg_word(m, 5); diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 5feaf3b67380..2f9c148f17e2 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -102,7 +102,8 @@ struct 
sk_buff *tipc_named_publish(struct net *net, struct publication *publ) pr_warn("Publication distribution failure\n"); return NULL; } - + msg_set_named_seqno(buf_msg(skb), nt->snd_nxt++); + msg_set_non_legacy(buf_msg(skb)); item = (struct distr_item *)msg_data(buf_msg(skb)); publ_to_item(item, publ); return skb; @@ -114,8 +115,8 @@ struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ) struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ) { struct name_table *nt = tipc_name_table(net); - struct sk_buff *buf; struct distr_item *item; + struct sk_buff *skb; write_lock_bh(&nt->cluster_scope_lock); list_del(&publ->binding_node); @@ -123,15 +124,16 @@ struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ) if (publ->scope == TIPC_NODE_SCOPE) return NULL; - buf = named_prepare_buf(net, WITHDRAWAL, ITEM_SIZE, 0); - if (!buf) { + skb = named_prepare_buf(net, WITHDRAWAL, ITEM_SIZE, 0); + if (!skb) { pr_warn("Withdrawal distribution failure\n"); return NULL; } - - item = (struct distr_item *)msg_data(buf_msg(buf)); + msg_set_named_seqno(buf_msg(skb), nt->snd_nxt++); + msg_set_non_legacy(buf_msg(skb)); + item = (struct distr_item *)msg_data(buf_msg(skb)); publ_to_item(item, publ); - return buf; + return skb; } /** @@ -141,7 +143,7 @@ struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ) * @pls: linked list of publication items to be packed into buffer chain */ static void named_distribute(struct net *net, struct sk_buff_head *list, - u32 dnode, struct list_head *pls) + u32 dnode, struct list_head *pls, u16 seqno) { struct publication *publ; struct sk_buff *skb = NULL; @@ -149,6 +151,7 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0, false) - INT_H_SIZE) / ITEM_SIZE) * ITEM_SIZE; u32 msg_rem = msg_dsz; + struct tipc_msg *hdr; list_for_each_entry(publ, pls, binding_node) { /* Prepare next buffer: */ @@ -159,8 +162,11 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, pr_warn("Bulk publication failure\n"); return; } - msg_set_bc_ack_invalid(buf_msg(skb), true); - item = (struct distr_item *)msg_data(buf_msg(skb)); + hdr = buf_msg(skb); + msg_set_bc_ack_invalid(hdr, true); + msg_set_bulk(hdr); + msg_set_non_legacy(hdr); + item = (struct distr_item *)msg_data(hdr); } /* Pack publication into message: */ @@ -176,24 +182,35 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, } } if (skb) { - msg_set_size(buf_msg(skb), INT_H_SIZE + (msg_dsz - msg_rem)); + hdr = buf_msg(skb); + msg_set_size(hdr, INT_H_SIZE + (msg_dsz - msg_rem)); skb_trim(skb, INT_H_SIZE + (msg_dsz - msg_rem)); __skb_queue_tail(list, skb); } + hdr = buf_msg(skb_peek_tail(list)); + msg_set_last_bulk(hdr); + msg_set_named_seqno(hdr, seqno); } /** * tipc_named_node_up - tell specified node about all publications by this node */ -void tipc_named_node_up(struct net *net, u32 dnode) +void tipc_named_node_up(struct net *net, u32 dnode, u16 capabilities) { struct name_table *nt = tipc_name_table(net); + struct tipc_net *tn = tipc_net(net); struct sk_buff_head head; + u16 seqno; __skb_queue_head_init(&head); + spin_lock_bh(&tn->nametbl_lock); + if (!(capabilities & TIPC_NAMED_BCAST)) + nt->rc_dests++; + seqno = nt->snd_nxt; + spin_unlock_bh(&tn->nametbl_lock); read_lock_bh(&nt->cluster_scope_lock); - named_distribute(net, &head, dnode, &nt->cluster_scope); + named_distribute(net, &head, dnode, &nt->cluster_scope, seqno); 
tipc_node_xmit(net, &head, dnode, 0); read_unlock_bh(&nt->cluster_scope_lock); } @@ -245,13 +262,21 @@ static void tipc_dist_queue_purge(struct net *net, u32 addr) spin_unlock_bh(&tn->nametbl_lock); } -void tipc_publ_notify(struct net *net, struct list_head *nsub_list, u32 addr) +void tipc_publ_notify(struct net *net, struct list_head *nsub_list, + u32 addr, u16 capabilities) { + struct name_table *nt = tipc_name_table(net); + struct tipc_net *tn = tipc_net(net); + struct publication *publ, *tmp; list_for_each_entry_safe(publ, tmp, nsub_list, binding_node) tipc_publ_purge(net, publ, addr); tipc_dist_queue_purge(net, addr); + spin_lock_bh(&tn->nametbl_lock); + if (!(capabilities & TIPC_NAMED_BCAST)) + nt->rc_dests--; + spin_unlock_bh(&tn->nametbl_lock); } /** @@ -295,29 +320,62 @@ static bool tipc_update_nametbl(struct net *net, struct distr_item *i, return false; } +static struct sk_buff *tipc_named_dequeue(struct sk_buff_head *namedq, + u16 *rcv_nxt, bool *open) +{ + struct sk_buff *skb, *tmp; + struct tipc_msg *hdr; + u16 seqno; + + skb_queue_walk_safe(namedq, skb, tmp) { + skb_linearize(skb); + hdr = buf_msg(skb); + seqno = msg_named_seqno(hdr); + if (msg_is_last_bulk(hdr)) { + *rcv_nxt = seqno; + *open = true; + } + + if (msg_is_bulk(hdr) || msg_is_legacy(hdr)) { + __skb_unlink(skb, namedq); + return skb; + } + + if (*open && (*rcv_nxt == seqno)) { + (*rcv_nxt)++; + __skb_unlink(skb, namedq); + return skb; + } + + if (less(seqno, *rcv_nxt)) { + __skb_unlink(skb, namedq); + kfree_skb(skb); + continue; + } + } + return NULL; +} + /** * tipc_named_rcv - process name table update messages sent by another node */ -void tipc_named_rcv(struct net *net, struct sk_buff_head *inputq) +void tipc_named_rcv(struct net *net, struct sk_buff_head *namedq, + u16 *rcv_nxt, bool *open) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_msg *msg; + struct tipc_net *tn = tipc_net(net); struct distr_item *item; - uint count; - u32 node; + struct tipc_msg *hdr; struct sk_buff *skb; - int mtype; + u32 count, node; spin_lock_bh(&tn->nametbl_lock); - for (skb = skb_dequeue(inputq); skb; skb = skb_dequeue(inputq)) { - skb_linearize(skb); - msg = buf_msg(skb); - mtype = msg_type(msg); - item = (struct distr_item *)msg_data(msg); - count = msg_data_sz(msg) / ITEM_SIZE; - node = msg_orignode(msg); + while ((skb = tipc_named_dequeue(namedq, rcv_nxt, open))) { + hdr = buf_msg(skb); + node = msg_orignode(hdr); + item = (struct distr_item *)msg_data(hdr); + count = msg_data_sz(hdr) / ITEM_SIZE; while (count--) { - tipc_update_nametbl(net, item, node, mtype); + tipc_update_nametbl(net, item, node, msg_type(hdr)); item++; } kfree_skb(skb); @@ -345,6 +403,6 @@ void tipc_named_reinit(struct net *net) publ->node = self; list_for_each_entry_rcu(publ, &nt->cluster_scope, binding_node) publ->node = self; - + nt->rc_dests = 0; spin_unlock_bh(&tn->nametbl_lock); } diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h index 63fc73e0fa6c..092323158f06 100644 --- a/net/tipc/name_distr.h +++ b/net/tipc/name_distr.h @@ -67,11 +67,14 @@ struct distr_item { __be32 key; }; +void tipc_named_bcast(struct net *net, struct sk_buff *skb); struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ); struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ); -void tipc_named_node_up(struct net *net, u32 dnode); -void tipc_named_rcv(struct net *net, struct sk_buff_head *msg_queue); +void tipc_named_node_up(struct net *net, u32 dnode, u16 capabilities); +void tipc_named_rcv(struct 
net *net, struct sk_buff_head *namedq, + u16 *rcv_nxt, bool *open); void tipc_named_reinit(struct net *net); -void tipc_publ_notify(struct net *net, struct list_head *nsub_list, u32 addr); +void tipc_publ_notify(struct net *net, struct list_head *nsub_list, + u32 addr, u16 capabilities); #endif diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 359b2bc888cf..2ac33d32edc2 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -729,6 +729,7 @@ struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, struct tipc_net *tn = tipc_net(net); struct publication *p = NULL; struct sk_buff *skb = NULL; + u32 rc_dests; spin_lock_bh(&tn->nametbl_lock); @@ -743,12 +744,14 @@ struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, nt->local_publ_count++; skb = tipc_named_publish(net, p); } + rc_dests = nt->rc_dests; exit: spin_unlock_bh(&tn->nametbl_lock); if (skb) - tipc_node_broadcast(net, skb); + tipc_node_broadcast(net, skb, rc_dests); return p; + } /** @@ -762,6 +765,7 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 self = tipc_own_addr(net); struct sk_buff *skb = NULL; struct publication *p; + u32 rc_dests; spin_lock_bh(&tn->nametbl_lock); @@ -775,10 +779,11 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, pr_err("Failed to remove local publication {%u,%u,%u}/%u\n", type, lower, upper, key); } + rc_dests = nt->rc_dests; spin_unlock_bh(&tn->nametbl_lock); if (skb) { - tipc_node_broadcast(net, skb); + tipc_node_broadcast(net, skb, rc_dests); return 1; } return 0; diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index 728bc7016c38..8064e1986e2c 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -106,6 +106,8 @@ struct name_table { struct list_head cluster_scope; rwlock_t cluster_scope_lock; u32 local_publ_count; + u32 rc_dests; + u32 snd_nxt; }; int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb); diff --git a/net/tipc/node.c b/net/tipc/node.c index a4c2816c3746..4edcee3088da 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -75,6 +75,8 @@ struct tipc_bclink_entry { struct sk_buff_head arrvq; struct sk_buff_head inputq2; struct sk_buff_head namedq; + u16 named_rcv_nxt; + bool named_open; }; /** @@ -396,10 +398,10 @@ static void tipc_node_write_unlock(struct tipc_node *n) write_unlock_bh(&n->lock); if (flags & TIPC_NOTIFY_NODE_DOWN) - tipc_publ_notify(net, publ_list, addr); + tipc_publ_notify(net, publ_list, addr, n->capabilities); if (flags & TIPC_NOTIFY_NODE_UP) - tipc_named_node_up(net, addr); + tipc_named_node_up(net, addr, n->capabilities); if (flags & TIPC_NOTIFY_LINK_UP) { tipc_mon_peer_up(net, addr, bearer_id); @@ -1483,6 +1485,7 @@ static void node_lost_contact(struct tipc_node *n, /* Clean up broadcast state */ tipc_bcast_remove_peer(n->net, n->bc_entry.link); + __skb_queue_purge(&n->bc_entry.namedq); /* Abort any ongoing link failover */ for (i = 0; i < MAX_BEARERS; i++) { @@ -1512,7 +1515,7 @@ static void node_lost_contact(struct tipc_node *n, * tipc_node_get_linkname - get the name of a link * * @bearer_id: id of the bearer - * @node: peer node address + * @addr: peer node address * @linkname: link name output buffer * * Returns 0 on success @@ -1729,12 +1732,23 @@ int tipc_node_distr_xmit(struct net *net, struct sk_buff_head *xmitq) return 0; } -void tipc_node_broadcast(struct net *net, struct sk_buff *skb) +void tipc_node_broadcast(struct net *net, struct sk_buff *skb, int rc_dests) { + struct sk_buff_head xmitq; struct 
sk_buff *txskb; struct tipc_node *n; + u16 dummy; u32 dst; + /* Use broadcast if all nodes support it */ + if (!rc_dests && tipc_bcast_get_mode(net) != BCLINK_MODE_RCAST) { + __skb_queue_head_init(&xmitq); + __skb_queue_tail(&xmitq, skb); + tipc_bcast_xmit(net, &xmitq, &dummy); + return; + } + + /* Otherwise use legacy replicast method */ rcu_read_lock(); list_for_each_entry_rcu(n, tipc_nodes(net), list) { dst = n->addr; @@ -1749,7 +1763,6 @@ void tipc_node_broadcast(struct net *net, struct sk_buff *skb) tipc_node_xmit_skb(net, txskb, dst, 0); } rcu_read_unlock(); - kfree_skb(skb); } @@ -1844,7 +1857,9 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id /* Handle NAME_DISTRIBUTOR messages sent from 1.7 nodes */ if (!skb_queue_empty(&n->bc_entry.namedq)) - tipc_named_rcv(net, &n->bc_entry.namedq); + tipc_named_rcv(net, &n->bc_entry.namedq, + &n->bc_entry.named_rcv_nxt, + &n->bc_entry.named_open); /* If reassembly or retransmission failure => reset all links to peer */ if (rc & TIPC_LINK_DOWN_EVT) @@ -2007,7 +2022,7 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, * tipc_rcv - process TIPC packets/messages arriving from off-node * @net: the applicable net namespace * @skb: TIPC packet - * @bearer: pointer to bearer message arrived on + * @b: pointer to bearer message arrived on * * Invoked with no locks held. Bearer pointer must point to a valid bearer * structure (i.e. cannot be NULL), but bearer can be inactive. @@ -2114,7 +2129,9 @@ rcv: tipc_node_link_down(n, bearer_id, false); if (unlikely(!skb_queue_empty(&n->bc_entry.namedq))) - tipc_named_rcv(net, &n->bc_entry.namedq); + tipc_named_rcv(net, &n->bc_entry.namedq, + &n->bc_entry.named_rcv_nxt, + &n->bc_entry.named_open); if (unlikely(!skb_queue_empty(&n->bc_entry.inputq1))) tipc_node_mcast_rcv(n); diff --git a/net/tipc/node.h b/net/tipc/node.h index a6803b449a2c..9f6f13f1604f 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -55,7 +55,8 @@ enum { TIPC_MCAST_RBCTL = (1 << 7), TIPC_GAP_ACK_BLOCK = (1 << 8), TIPC_TUNNEL_ENHANCED = (1 << 9), - TIPC_NAGLE = (1 << 10) + TIPC_NAGLE = (1 << 10), + TIPC_NAMED_BCAST = (1 << 11) }; #define TIPC_NODE_CAPABILITIES (TIPC_SYN_BIT | \ @@ -68,7 +69,8 @@ enum { TIPC_MCAST_RBCTL | \ TIPC_GAP_ACK_BLOCK | \ TIPC_TUNNEL_ENHANCED | \ - TIPC_NAGLE) + TIPC_NAGLE | \ + TIPC_NAMED_BCAST) #define INVALID_BEARER_ID -1 @@ -101,7 +103,7 @@ int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dest, u32 selector); void tipc_node_subscribe(struct net *net, struct list_head *subscr, u32 addr); void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr); -void tipc_node_broadcast(struct net *net, struct sk_buff *skb); +void tipc_node_broadcast(struct net *net, struct sk_buff *skb, int rc_dests); int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port); void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port); int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel, bool connected); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index a94f38333698..07419f36116a 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -711,7 +711,6 @@ exit: * tipc_getname - get port ID of socket or peer socket * @sock: socket structure * @uaddr: area for returned socket address - * @uaddr_len: area for returned length of socket address * @peer: 0 = own ID, 1 = current peer ID, 2 = current/former peer ID * * Returns 0 on success, errno otherwise @@ -1053,7 +1052,7 @@ static int tipc_send_group_anycast(struct 
socket *sock, struct msghdr *m, /** * tipc_send_group_bcast - send message to all members in communication group - * @sk: socket structure + * @sock: socket structure * @m: message to send * @dlen: total length of message data * @timeout: timeout to wait for wakeup @@ -1673,7 +1672,7 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port, /** * tipc_sk_set_orig_addr - capture sender's address for received message * @m: descriptor for message info - * @hdr: received message header + * @skb: received message * * Note: Address is not captured if not requested by receiver. */ @@ -2095,7 +2094,6 @@ static void tipc_write_space(struct sock *sk) /** * tipc_data_ready - wake up threads to indicate messages have been received * @sk: socket - * @len: the length of messages */ static void tipc_data_ready(struct sock *sk) { @@ -2677,7 +2675,7 @@ static int tipc_wait_for_accept(struct socket *sock, long timeo) /** * tipc_accept - wait for connection request * @sock: listening socket - * @newsock: new socket that is to be connected + * @new_sock: new socket that is to be connected * @flags: file-related flags associated with socket * * Returns 0 on success, errno otherwise @@ -3105,7 +3103,7 @@ static int tipc_sk_leave(struct tipc_sock *tsk) * Returns 0 on success, errno otherwise */ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, - char __user *ov, unsigned int ol) + sockptr_t ov, unsigned int ol) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); @@ -3126,17 +3124,17 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, case TIPC_NODELAY: if (ol < sizeof(value)) return -EINVAL; - if (get_user(value, (u32 __user *)ov)) + if (copy_from_sockptr(&value, ov, sizeof(u32))) return -EFAULT; break; case TIPC_GROUP_JOIN: if (ol < sizeof(mreq)) return -EINVAL; - if (copy_from_user(&mreq, ov, sizeof(mreq))) + if (copy_from_sockptr(&mreq, ov, sizeof(mreq))) return -EFAULT; break; default: - if (ov || ol) + if (!sockptr_is_null(ov) || ol) return -EINVAL; } diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 28a283f26a8d..53f0de0676b7 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -565,7 +565,7 @@ msg_full: /** * tipc_parse_udp_addr - build udp media address from netlink data - * @nlattr: netlink attribute containing sockaddr storage aligned address + * @nla: netlink attribute containing sockaddr storage aligned address * @addr: tipc media address to fill with address, port and protocol type * @scope_id: IPv6 scope id pointer, not NULL indicates it's required */ @@ -738,6 +738,13 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, b->mtu = b->media->mtu; #if IS_ENABLED(CONFIG_IPV6) } else if (local.proto == htons(ETH_P_IPV6)) { + struct net_device *dev; + + dev = ipv6_dev_find(net, &local.ipv6); + if (!dev) { + err = -ENODEV; + goto err; + } udp_conf.family = AF_INET6; udp_conf.use_udp6_tx_checksums = true; udp_conf.use_udp6_rx_checksums = true; @@ -745,6 +752,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, udp_conf.local_ip6 = in6addr_any; else udp_conf.local_ip6 = local.ipv6; + ub->ifindex = dev->ifindex; b->mtu = 1280; #endif } else { diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 0e55f8365ce2..18fa6067bb7f 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -690,15 +690,55 @@ static void tls_device_resync_rx(struct tls_context *tls_ctx, TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXDEVICERESYNC); } +static bool +tls_device_rx_resync_async(struct 
tls_offload_resync_async *resync_async, + s64 resync_req, u32 *seq) +{ + u32 is_async = resync_req & RESYNC_REQ_ASYNC; + u32 req_seq = resync_req >> 32; + u32 req_end = req_seq + ((resync_req >> 16) & 0xffff); + + if (is_async) { + /* asynchronous stage: log all headers seq such that + * req_seq <= seq <= end_seq, and wait for real resync request + */ + if (between(*seq, req_seq, req_end) && + resync_async->loglen < TLS_DEVICE_RESYNC_ASYNC_LOGMAX) + resync_async->log[resync_async->loglen++] = *seq; + + return false; + } + + /* synchronous stage: check against the logged entries and + * proceed to check the next entries if no match was found + */ + while (resync_async->loglen) { + if (req_seq == resync_async->log[resync_async->loglen - 1] && + atomic64_try_cmpxchg(&resync_async->req, + &resync_req, 0)) { + resync_async->loglen = 0; + *seq = req_seq; + return true; + } + resync_async->loglen--; + } + + if (req_seq == *seq && + atomic64_try_cmpxchg(&resync_async->req, + &resync_req, 0)) + return true; + + return false; +} + void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx; - bool is_req_pending, is_force_resync; u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; + u32 sock_data, is_req_pending; struct tls_prot_info *prot; s64 resync_req; - u32 sock_data; u32 req_seq; if (tls_ctx->rx_conf != TLS_HW) @@ -713,11 +753,9 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) resync_req = atomic64_read(&rx_ctx->resync_req); req_seq = resync_req >> 32; seq += TLS_HEADER_SIZE - 1; - is_req_pending = resync_req & RESYNC_REQ; - is_force_resync = resync_req & RESYNC_REQ_FORCE; + is_req_pending = resync_req; - if (likely(!is_req_pending) || - (!is_force_resync && req_seq != seq) || + if (likely(!is_req_pending) || req_seq != seq || !atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) return; break; @@ -739,6 +777,16 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) seq += rcd_len; tls_bigint_increment(rcd_sn, prot->rec_seq_size); break; + case TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ_ASYNC: + resync_req = atomic64_read(&rx_ctx->resync_async->req); + is_req_pending = resync_req; + if (likely(!is_req_pending)) + return; + + if (!tls_device_rx_resync_async(rx_ctx->resync_async, + resync_req, &seq)) + return; + break; } tls_device_resync_rx(tls_ctx, sk, seq, rcd_sn); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index ec10041c6b7d..bbc52b088d29 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -450,7 +450,7 @@ static int tls_getsockopt(struct sock *sk, int level, int optname, return do_tls_getsockopt(sk, optname, optval, optlen); } -static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, +static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, unsigned int optlen, int tx) { struct tls_crypto_info *crypto_info; @@ -460,7 +460,7 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, int rc = 0; int conf; - if (!optval || (optlen < sizeof(*crypto_info))) { + if (sockptr_is_null(optval) || (optlen < sizeof(*crypto_info))) { rc = -EINVAL; goto out; } @@ -479,7 +479,7 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, goto out; } - rc = copy_from_user(crypto_info, optval, sizeof(*crypto_info)); + rc = copy_from_sockptr(crypto_info, optval, sizeof(*crypto_info)); if (rc) { rc = -EFAULT; goto err_crypto_info; @@ -522,8 +522,9 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user 
*optval, goto err_crypto_info; } - rc = copy_from_user(crypto_info + 1, optval + sizeof(*crypto_info), - optlen - sizeof(*crypto_info)); + rc = copy_from_sockptr_offset(crypto_info + 1, optval, + sizeof(*crypto_info), + optlen - sizeof(*crypto_info)); if (rc) { rc = -EFAULT; goto err_crypto_info; @@ -579,8 +580,8 @@ out: return rc; } -static int do_tls_setsockopt(struct sock *sk, int optname, - char __user *optval, unsigned int optlen) +static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, + unsigned int optlen) { int rc = 0; @@ -600,7 +601,7 @@ static int do_tls_setsockopt(struct sock *sk, int optname, } static int tls_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct tls_context *ctx = tls_get_ctx(sk); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3385a7a0b231..181ea6fb56a6 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -714,8 +714,6 @@ static const struct proto_ops unix_stream_ops = { #endif .listen = unix_listen, .shutdown = unix_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .sendmsg = unix_stream_sendmsg, .recvmsg = unix_stream_recvmsg, .mmap = sock_no_mmap, @@ -741,8 +739,6 @@ static const struct proto_ops unix_dgram_ops = { #endif .listen = sock_no_listen, .shutdown = unix_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .sendmsg = unix_dgram_sendmsg, .recvmsg = unix_dgram_recvmsg, .mmap = sock_no_mmap, @@ -767,8 +763,6 @@ static const struct proto_ops unix_seqpacket_ops = { #endif .listen = unix_listen, .shutdown = unix_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .sendmsg = unix_seqpacket_sendmsg, .recvmsg = unix_seqpacket_recvmsg, .mmap = sock_no_mmap, diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 626bf9044418..27bbcfad9c17 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1202,8 +1202,6 @@ static const struct proto_ops vsock_dgram_ops = { .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = vsock_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, .sendmsg = vsock_dgram_sendmsg, .recvmsg = vsock_dgram_recvmsg, .mmap = sock_no_mmap, @@ -1519,7 +1517,7 @@ static void vsock_update_buffer_size(struct vsock_sock *vsk, static int vsock_stream_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, + sockptr_t optval, unsigned int optlen) { int err; @@ -1537,7 +1535,7 @@ static int vsock_stream_setsockopt(struct socket *sock, err = -EINVAL; \ goto exit; \ } \ - if (copy_from_user(&_v, optval, sizeof(_v)) != 0) { \ + if (copy_from_sockptr(&_v, optval, sizeof(_v)) != 0) { \ err = -EFAULT; \ goto exit; \ } \ diff --git a/net/wireless/chan.c b/net/wireless/chan.c index cddf92c5d09e..90f0f82cd9ca 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -153,6 +153,11 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) control_freq = chandef->chan->center_freq; switch (chandef->width) { + case NL80211_CHAN_WIDTH_1: + case NL80211_CHAN_WIDTH_2: + case NL80211_CHAN_WIDTH_4: + case NL80211_CHAN_WIDTH_8: + case NL80211_CHAN_WIDTH_16: case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_20: @@ -263,6 +268,21 @@ static int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c) int width; switch (c->width) { + case NL80211_CHAN_WIDTH_1: + width = 1; + break; + case NL80211_CHAN_WIDTH_2: 
+ width = 2; + break; + case NL80211_CHAN_WIDTH_4: + width = 4; + break; + case NL80211_CHAN_WIDTH_8: + width = 8; + break; + case NL80211_CHAN_WIDTH_16: + width = 16; + break; case NL80211_CHAN_WIDTH_5: width = 5; break; @@ -911,6 +931,21 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, control_freq = chandef->chan->center_freq; switch (chandef->width) { + case NL80211_CHAN_WIDTH_1: + width = 1; + break; + case NL80211_CHAN_WIDTH_2: + width = 2; + break; + case NL80211_CHAN_WIDTH_4: + width = 4; + break; + case NL80211_CHAN_WIDTH_8: + width = 8; + break; + case NL80211_CHAN_WIDTH_16: + width = 16; + break; case NL80211_CHAN_WIDTH_5: width = 5; break; diff --git a/net/wireless/core.c b/net/wireless/core.c index c623d9bf5096..1971d7e6eb55 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -803,10 +803,11 @@ int wiphy_register(struct wiphy *wiphy) if (WARN_ON(!sband->n_channels)) return -EINVAL; /* - * on 60GHz band, there are no legacy rates, so + * on 60GHz or sub-1Ghz band, there are no legacy rates, so * n_bitrates is 0 */ - if (WARN_ON(band != NL80211_BAND_60GHZ && + if (WARN_ON((band != NL80211_BAND_60GHZ && + band != NL80211_BAND_S1GHZ) && !sband->n_bitrates)) return -EINVAL; diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c index eac5aa1419fc..e4e363138279 100644 --- a/net/wireless/mesh.c +++ b/net/wireless/mesh.c @@ -78,6 +78,7 @@ const struct mesh_config default_mesh_config = { .power_mode = NL80211_MESH_POWER_ACTIVE, .dot11MeshAwakeWindowDuration = MESH_DEFAULT_AWAKE_WINDOW, .plink_timeout = MESH_DEFAULT_PLINK_TIMEOUT, + .dot11MeshNolearn = false, }; const struct mesh_setup default_mesh_setup = { diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7fbca0854265..814e23d3ce7c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4713,8 +4713,8 @@ static int nl80211_parse_he_bss_color(struct nlattr *attrs, he_bss_color->color = nla_get_u8(tb[NL80211_HE_BSS_COLOR_ATTR_COLOR]); - he_bss_color->disabled = - nla_get_flag(tb[NL80211_HE_BSS_COLOR_ATTR_DISABLED]); + he_bss_color->enabled = + !nla_get_flag(tb[NL80211_HE_BSS_COLOR_ATTR_DISABLED]); he_bss_color->partial = nla_get_flag(tb[NL80211_HE_BSS_COLOR_ATTR_PARTIAL]); @@ -5395,6 +5395,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO(PEER_PM, peer_pm, u32); PUT_SINFO(NONPEER_PM, nonpeer_pm, u32); PUT_SINFO(CONNECTED_TO_GATE, connected_to_gate, u8); + PUT_SINFO(CONNECTED_TO_AS, connected_to_as, u8); if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_BSS_PARAM)) { bss_param = nla_nest_start_noflag(msg, @@ -6885,7 +6886,11 @@ static int nl80211_get_mesh_config(struct sk_buff *skb, nla_put_u32(msg, NL80211_MESHCONF_PLINK_TIMEOUT, cur_params.plink_timeout) || nla_put_u8(msg, NL80211_MESHCONF_CONNECTED_TO_GATE, - cur_params.dot11MeshConnectedToMeshGate)) + cur_params.dot11MeshConnectedToMeshGate) || + nla_put_u8(msg, NL80211_MESHCONF_NOLEARN, + cur_params.dot11MeshNolearn) || + nla_put_u8(msg, NL80211_MESHCONF_CONNECTED_TO_AS, + cur_params.dot11MeshConnectedToAuthServer)) goto nla_put_failure; nla_nest_end(msg, pinfoattr); genlmsg_end(msg, hdr); @@ -6943,6 +6948,8 @@ nl80211_meshconf_params_policy[NL80211_MESHCONF_ATTR_MAX+1] = { [NL80211_MESHCONF_AWAKE_WINDOW] = { .type = NLA_U16 }, [NL80211_MESHCONF_PLINK_TIMEOUT] = { .type = NLA_U32 }, [NL80211_MESHCONF_CONNECTED_TO_GATE] = NLA_POLICY_RANGE(NLA_U8, 0, 1), + [NL80211_MESHCONF_NOLEARN] = NLA_POLICY_RANGE(NLA_U8, 0, 1), + [NL80211_MESHCONF_CONNECTED_TO_AS] = NLA_POLICY_RANGE(NLA_U8, 0, 1), }; static const struct 
nla_policy @@ -7055,6 +7062,9 @@ do { \ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConnectedToMeshGate, mask, NL80211_MESHCONF_CONNECTED_TO_GATE, nla_get_u8); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConnectedToAuthServer, mask, + NL80211_MESHCONF_CONNECTED_TO_AS, + nla_get_u8); /* * Check HT operation mode based on * IEEE 802.11-2016 9.4.2.57 HT Operation element. @@ -7094,6 +7104,8 @@ do { \ NL80211_MESHCONF_AWAKE_WINDOW, nla_get_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, plink_timeout, mask, NL80211_MESHCONF_PLINK_TIMEOUT, nla_get_u32); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshNolearn, mask, + NL80211_MESHCONF_NOLEARN, nla_get_u8); if (mask_out) *mask_out = mask; @@ -7774,10 +7786,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->scan) return -EOPNOTSUPP; - if (rdev->scan_req || rdev->scan_msg) { - err = -EBUSY; - goto unlock; - } + if (rdev->scan_req || rdev->scan_msg) + return -EBUSY; if (info->attrs[NL80211_ATTR_SCAN_FREQ_KHZ]) { if (!wiphy_ext_feature_isset(wiphy, @@ -7790,10 +7800,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) if (scan_freqs) { n_channels = validate_scan_freqs(scan_freqs); - if (!n_channels) { - err = -EINVAL; - goto unlock; - } + if (!n_channels) + return -EINVAL; } else { n_channels = ieee80211_get_num_supported_channels(wiphy); } @@ -7802,29 +7810,23 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], tmp) n_ssids++; - if (n_ssids > wiphy->max_scan_ssids) { - err = -EINVAL; - goto unlock; - } + if (n_ssids > wiphy->max_scan_ssids) + return -EINVAL; if (info->attrs[NL80211_ATTR_IE]) ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); else ie_len = 0; - if (ie_len > wiphy->max_scan_ie_len) { - err = -EINVAL; - goto unlock; - } + if (ie_len > wiphy->max_scan_ie_len) + return -EINVAL; request = kzalloc(sizeof(*request) + sizeof(*request->ssids) * n_ssids + sizeof(*request->channels) * n_channels + ie_len, GFP_KERNEL); - if (!request) { - err = -ENOMEM; - goto unlock; - } + if (!request) + return -ENOMEM; if (n_ssids) request->ssids = (void *)&request->channels[n_channels]; @@ -8003,17 +8005,19 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) rdev->scan_req = request; err = rdev_scan(rdev, request); - if (!err) { - nl80211_send_scan_start(rdev, wdev); - if (wdev->netdev) - dev_hold(wdev->netdev); - } else { + if (err) + goto out_free; + + nl80211_send_scan_start(rdev, wdev); + if (wdev->netdev) + dev_hold(wdev->netdev); + + return 0; + out_free: - rdev->scan_req = NULL; - kfree(request); - } + rdev->scan_req = NULL; + kfree(request); - unlock: return err; } @@ -9438,7 +9442,9 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, if (nla_len(info->attrs[NL80211_ATTR_PMK]) != WLAN_PMK_LEN) return -EINVAL; if (!wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK)) + NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK) && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK)) return -EINVAL; settings->psk = nla_data(info->attrs[NL80211_ATTR_PMK]); } @@ -10394,8 +10400,7 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) memcpy(dev->ieee80211_ptr->disconnect_bssid, connect.bssid, ETH_ALEN); else - memset(dev->ieee80211_ptr->disconnect_bssid, - 0, ETH_ALEN); + eth_zero_addr(dev->ieee80211_ptr->disconnect_bssid); } wdev_unlock(dev->ieee80211_ptr); @@ -13479,7 
+13484,7 @@ static int nl80211_vendor_cmd_dump(struct sk_buff *skb, if (err == -ENOBUFS || err == -ENOENT) { genlmsg_cancel(skb, hdr); break; - } else if (err) { + } else if (err <= 0) { genlmsg_cancel(skb, hdr); goto out; } diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 0d74a31ef0ab..35b8847a2f6d 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2384,7 +2384,7 @@ static void reg_set_request_processed(void) /** * reg_process_hint_core - process core regulatory requests - * @pending_request: a pending core regulatory request + * @core_request: a pending core regulatory request * * The wireless subsystem can use this function to process * a regulatory request issued by the regulatory core. @@ -2493,6 +2493,7 @@ __reg_process_hint_driver(struct regulatory_request *driver_request) /** * reg_process_hint_driver - process driver regulatory requests + * @wiphy: the wireless device for the regulatory request * @driver_request: a pending driver regulatory request * * The wireless subsystem can use this function to process @@ -2593,6 +2594,7 @@ __reg_process_hint_country_ie(struct wiphy *wiphy, /** * reg_process_hint_country_ie - process regulatory requests from country IEs + * @wiphy: the wireless device for the regulatory request * @country_ie_request: a regulatory request from a country IE * * The wireless subsystem can use this function to process diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 74ea4cfb39fb..e67a74488bbe 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -712,6 +712,16 @@ void cfg80211_bss_expire(struct cfg80211_registered_device *rdev) __cfg80211_bss_expire(rdev, jiffies - IEEE80211_SCAN_RESULT_EXPIRE); } +void cfg80211_bss_flush(struct wiphy *wiphy) +{ + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + + spin_lock_bh(&rdev->bss_lock); + __cfg80211_bss_expire(rdev, jiffies); + spin_unlock_bh(&rdev->bss_lock); +} +EXPORT_SYMBOL(cfg80211_bss_flush); + const struct element * cfg80211_find_elem_match(u8 eid, const u8 *ies, unsigned int len, const u8 *match, unsigned int match_len, diff --git a/net/wireless/trace.h b/net/wireless/trace.h index b23cab016521..6e218a0acd4e 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -68,7 +68,8 @@ __field(u16, ht_opmode) \ __field(u32, dot11MeshHWMPactivePathToRootTimeout) \ __field(u16, dot11MeshHWMProotInterval) \ - __field(u16, dot11MeshHWMPconfirmationInterval) + __field(u16, dot11MeshHWMPconfirmationInterval) \ + __field(bool, dot11MeshNolearn) #define MESH_CFG_ASSIGN \ do { \ __entry->dot11MeshRetryTimeout = conf->dot11MeshRetryTimeout; \ @@ -109,6 +110,7 @@ conf->dot11MeshHWMProotInterval; \ __entry->dot11MeshHWMPconfirmationInterval = \ conf->dot11MeshHWMPconfirmationInterval; \ + __entry->dot11MeshNolearn = conf->dot11MeshNolearn; \ } while (0) #define CHAN_ENTRY __field(enum nl80211_band, band) \ diff --git a/net/wireless/util.c b/net/wireless/util.c index 4d3b76f94f55..26a977343c3b 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -102,6 +102,8 @@ u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) if (chan < 7) return MHZ_TO_KHZ(56160 + chan * 2160); break; + case NL80211_BAND_S1GHZ: + return 902000 + chan * 500; default: ; } @@ -210,6 +212,12 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband) WARN_ON(!sband->ht_cap.ht_supported); WARN_ON((sband->ht_cap.mcs.rx_mask[0] & 0x1e) != 0x1e); break; + case NL80211_BAND_S1GHZ: + /* Figure 9-589bd: 3 means unsupported, so != 3 means at least + * mandatory is 
ok. + */ + WARN_ON((sband->s1g_cap.nss_mcs[0] & 0x3) == 0x3); + break; case NUM_NL80211_BANDS: default: WARN_ON(1); diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index cac9e28d852b..aa918d7ff6bd 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -220,7 +220,6 @@ EXPORT_WEXT_HANDLER(cfg80211_wext_giwrange); /** * cfg80211_wext_freq - get wext frequency for non-"auto" - * @dev: the net device * @freq: the wext freq encoding * * Returns a frequency, or a negative error code, or 0 for auto. diff --git a/net/x25/Kconfig b/net/x25/Kconfig index e3ed23245a82..68729aa3a5d5 100644 --- a/net/x25/Kconfig +++ b/net/x25/Kconfig @@ -17,7 +17,7 @@ config X25 if you want that) and the lower level data link layer protocol LAPB (say Y to "LAPB Data Link Driver" below if you want that). - You can read more about X.25 at <http://www.sangoma.com/tutorials/x25/> and + You can read more about X.25 at <https://www.sangoma.com/tutorials/x25/> and <http://docwiki.cisco.com/wiki/X.25>. Information about X.25 for Linux is contained in the files <file:Documentation/networking/x25.rst> and diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index d5b09bbff375..0bbb283f23c9 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -431,7 +431,7 @@ void x25_destroy_socket_from_timer(struct sock *sk) */ static int x25_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { int opt; struct sock *sk = sock->sk; @@ -445,7 +445,7 @@ static int x25_setsockopt(struct socket *sock, int level, int optname, goto out; rc = -EFAULT; - if (get_user(opt, (int __user *)optval)) + if (copy_from_sockptr(&opt, optval, sizeof(int))) goto out; if (opt) diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c index 7d02532aad0d..fdae054b7dc1 100644 --- a/net/x25/x25_link.c +++ b/net/x25/x25_link.c @@ -270,7 +270,7 @@ void x25_link_device_up(struct net_device *dev) /** * __x25_remove_neigh - remove neighbour from x25_neigh_list - * @nb - neigh to remove + * @nb: - neigh to remove * * Remove neighbour from x25_neigh_list. If it was there. * Caller must hold x25_neigh_list_lock. diff --git a/net/x25/x25_route.c b/net/x25/x25_route.c index b8e94d58d0f1..00e46c9a5280 100644 --- a/net/x25/x25_route.c +++ b/net/x25/x25_route.c @@ -142,7 +142,7 @@ struct net_device *x25_dev_get(char *devname) /** * x25_get_route - Find a route given an X.25 address. - * @addr - address to find a route for + * @addr: - address to find a route for * * Find a route given an X.25 address. */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 3700266229f6..c3231620d210 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -123,7 +123,7 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) addr = xp_get_handle(xskb); err = xskq_prod_reserve_desc(xs->rx, addr, len); if (err) { - xs->rx_dropped++; + xs->rx_queue_full++; return err; } @@ -274,8 +274,10 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) rcu_read_lock(); list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { - if (!xskq_cons_peek_desc(xs->tx, desc, umem)) + if (!xskq_cons_peek_desc(xs->tx, desc, umem)) { + xs->tx->queue_empty_descs++; continue; + } /* This is the backpressure mechanism for the Tx path. 
* Reserve space in the completion queue and only proceed @@ -387,6 +389,8 @@ static int xsk_generic_xmit(struct sock *sk) sent_frame = true; } + xs->tx->queue_empty_descs++; + out: if (sent_frame) sk->sk_write_space(sk); @@ -698,7 +702,7 @@ struct xdp_umem_reg_v1 { }; static int xsk_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); @@ -716,7 +720,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(entries)) return -EINVAL; - if (copy_from_user(&entries, optval, sizeof(entries))) + if (copy_from_sockptr(&entries, optval, sizeof(entries))) return -EFAULT; mutex_lock(&xs->mutex); @@ -743,7 +747,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, else if (optlen < sizeof(mr)) mr_size = sizeof(struct xdp_umem_reg_v1); - if (copy_from_user(&mr, optval, mr_size)) + if (copy_from_sockptr(&mr, optval, mr_size)) return -EFAULT; mutex_lock(&xs->mutex); @@ -770,7 +774,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, struct xsk_queue **q; int entries; - if (copy_from_user(&entries, optval, sizeof(entries))) + if (copy_from_sockptr(&entries, optval, sizeof(entries))) return -EFAULT; mutex_lock(&xs->mutex); @@ -812,6 +816,12 @@ static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring) ring->desc = offsetof(struct xdp_umem_ring, desc); } +struct xdp_statistics_v1 { + __u64 rx_dropped; + __u64 rx_invalid_descs; + __u64 tx_invalid_descs; +}; + static int xsk_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { @@ -830,20 +840,36 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname, switch (optname) { case XDP_STATISTICS: { - struct xdp_statistics stats; + struct xdp_statistics stats = {}; + bool extra_stats = true; + size_t stats_size; - if (len < sizeof(stats)) + if (len < sizeof(struct xdp_statistics_v1)) { return -EINVAL; + } else if (len < sizeof(stats)) { + extra_stats = false; + stats_size = sizeof(struct xdp_statistics_v1); + } else { + stats_size = sizeof(stats); + } mutex_lock(&xs->mutex); stats.rx_dropped = xs->rx_dropped; + if (extra_stats) { + stats.rx_ring_full = xs->rx_queue_full; + stats.rx_fill_ring_empty_descs = + xs->umem ? 
xskq_nb_queue_empty_descs(xs->umem->fq) : 0; + stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx); + } else { + stats.rx_dropped += xs->rx_queue_full; + } stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx); stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx); mutex_unlock(&xs->mutex); - if (copy_to_user(optval, &stats, sizeof(stats))) + if (copy_to_user(optval, &stats, stats_size)) return -EFAULT; - if (put_user(sizeof(stats), optlen)) + if (put_user(stats_size, optlen)) return -EFAULT; return 0; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 08b80669f649..a2044c245215 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -189,6 +189,7 @@ static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) for (;;) { if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) { + pool->fq->queue_empty_descs++; xp_release(xskb); return NULL; } diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index 0163b26aaf63..21e9c2d123ee 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -76,6 +76,19 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) return err; } +static int xsk_diag_put_stats(const struct xdp_sock *xs, struct sk_buff *nlskb) +{ + struct xdp_diag_stats du = {}; + + du.n_rx_dropped = xs->rx_dropped; + du.n_rx_invalid = xskq_nb_invalid_descs(xs->rx); + du.n_rx_full = xs->rx_queue_full; + du.n_fill_ring_empty = xs->umem ? xskq_nb_queue_empty_descs(xs->umem->fq) : 0; + du.n_tx_invalid = xskq_nb_invalid_descs(xs->tx); + du.n_tx_ring_empty = xskq_nb_queue_empty_descs(xs->tx); + return nla_put(nlskb, XDP_DIAG_STATS, sizeof(du), &du); +} + static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, struct xdp_diag_req *req, struct user_namespace *user_ns, @@ -118,6 +131,10 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, sock_diag_put_meminfo(sk, nlskb, XDP_DIAG_MEMINFO)) goto out_nlmsg_trim; + if ((req->xdiag_show & XDP_SHOW_STATS) && + xsk_diag_put_stats(xs, nlskb)) + goto out_nlmsg_trim; + mutex_unlock(&xs->mutex); nlmsg_end(nlskb, nlh); return 0; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 5b5d24d2dd37..bf42cfd74b89 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -38,6 +38,7 @@ struct xsk_queue { u32 cached_cons; struct xdp_ring *ring; u64 invalid_descs; + u64 queue_empty_descs; }; /* The structure of the shared state of the rings are the same as the @@ -354,6 +355,11 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) return q ? q->invalid_descs : 0; } +static inline u64 xskq_nb_queue_empty_descs(struct xsk_queue *q) +{ + return q ? 
q->queue_empty_descs : 0; +} + struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q_ops); diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 1dc7208c71ba..8367adbbe9df 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -254,6 +254,7 @@ void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, spin_unlock_bh(&map->lock); } +static int xsk_map_btf_id; const struct bpf_map_ops xsk_map_ops = { .map_alloc = xsk_map_alloc, .map_free = xsk_map_free, @@ -264,4 +265,6 @@ const struct bpf_map_ops xsk_map_ops = { .map_update_elem = xsk_map_update_elem, .map_delete_elem = xsk_map_delete_elem, .map_check_btf = map_check_no_btf, + .map_btf_name = "xsk_map", + .map_btf_id = &xsk_map_btf_id, }; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 626096bd0d29..edf11893dbe8 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -106,6 +106,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur struct sk_buff *skb2, *nskb, *pskb = NULL; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); + struct net_device *dev = skb->dev; struct sec_path *sp; if (!xo || (xo->flags & XFRM_XMIT)) @@ -119,6 +120,10 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur if (xo->flags & XFRM_GRO || x->xso.flags & XFRM_OFFLOAD_INBOUND) return skb; + /* This skb was already validated on the upper/virtual dev */ + if ((x->xso.dev != dev) && (x->xso.real_dev == dev)) + return skb; + local_irq_save(flags); sd = this_cpu_ptr(&softnet_data); err = !skb_queue_empty(&sd->xfrm_backlog); @@ -131,25 +136,20 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur xo->flags |= XFRM_XMIT; - if (skb_is_gso(skb)) { - struct net_device *dev = skb->dev; - - if (unlikely(x->xso.dev != dev)) { - struct sk_buff *segs; + if (skb_is_gso(skb) && unlikely(x->xso.dev != dev)) { + struct sk_buff *segs; - /* Packet got rerouted, fixup features and segment it. */ - esp_features = esp_features & ~(NETIF_F_HW_ESP - | NETIF_F_GSO_ESP); + /* Packet got rerouted, fixup features and segment it. 
*/ + esp_features = esp_features & ~(NETIF_F_HW_ESP | NETIF_F_GSO_ESP); - segs = skb_gso_segment(skb, esp_features); - if (IS_ERR(segs)) { - kfree_skb(skb); - atomic_long_inc(&dev->tx_dropped); - return NULL; - } else { - consume_skb(skb); - skb = segs; - } + segs = skb_gso_segment(skb, esp_features); + if (IS_ERR(segs)) { + kfree_skb(skb); + atomic_long_inc(&dev->tx_dropped); + return NULL; + } else { + consume_skb(skb); + skb = segs; } } @@ -261,6 +261,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, } xso->dev = dev; + xso->real_dev = dev; xso->num_exthdrs = 1; xso->flags = xuo->flags; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index bd984ff17c2d..37456d022cfa 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -42,7 +42,7 @@ struct xfrm_trans_cb { #define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0])) static DEFINE_SPINLOCK(xfrm_input_afinfo_lock); -static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[AF_INET6 + 1]; +static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[2][AF_INET6 + 1]; static struct gro_cells gro_cells; static struct net_device xfrm_napi_dev; @@ -53,14 +53,14 @@ int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo) { int err = 0; - if (WARN_ON(afinfo->family >= ARRAY_SIZE(xfrm_input_afinfo))) + if (WARN_ON(afinfo->family > AF_INET6)) return -EAFNOSUPPORT; spin_lock_bh(&xfrm_input_afinfo_lock); - if (unlikely(xfrm_input_afinfo[afinfo->family] != NULL)) + if (unlikely(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family])) err = -EEXIST; else - rcu_assign_pointer(xfrm_input_afinfo[afinfo->family], afinfo); + rcu_assign_pointer(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family], afinfo); spin_unlock_bh(&xfrm_input_afinfo_lock); return err; } @@ -71,11 +71,11 @@ int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo) int err = 0; spin_lock_bh(&xfrm_input_afinfo_lock); - if (likely(xfrm_input_afinfo[afinfo->family] != NULL)) { - if (unlikely(xfrm_input_afinfo[afinfo->family] != afinfo)) + if (likely(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family])) { + if (unlikely(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family] != afinfo)) err = -EINVAL; else - RCU_INIT_POINTER(xfrm_input_afinfo[afinfo->family], NULL); + RCU_INIT_POINTER(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family], NULL); } spin_unlock_bh(&xfrm_input_afinfo_lock); synchronize_rcu(); @@ -83,15 +83,15 @@ int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo) } EXPORT_SYMBOL(xfrm_input_unregister_afinfo); -static const struct xfrm_input_afinfo *xfrm_input_get_afinfo(unsigned int family) +static const struct xfrm_input_afinfo *xfrm_input_get_afinfo(u8 family, bool is_ipip) { const struct xfrm_input_afinfo *afinfo; - if (WARN_ON_ONCE(family >= ARRAY_SIZE(xfrm_input_afinfo))) + if (WARN_ON_ONCE(family > AF_INET6)) return NULL; rcu_read_lock(); - afinfo = rcu_dereference(xfrm_input_afinfo[family]); + afinfo = rcu_dereference(xfrm_input_afinfo[is_ipip][family]); if (unlikely(!afinfo)) rcu_read_unlock(); return afinfo; @@ -100,9 +100,11 @@ static const struct xfrm_input_afinfo *xfrm_input_get_afinfo(unsigned int family static int xfrm_rcv_cb(struct sk_buff *skb, unsigned int family, u8 protocol, int err) { + bool is_ipip = (protocol == IPPROTO_IPIP || protocol == IPPROTO_IPV6); + const struct xfrm_input_afinfo *afinfo; int ret; - const struct xfrm_input_afinfo *afinfo = xfrm_input_get_afinfo(family); + afinfo = xfrm_input_get_afinfo(family, is_ipip); if (!afinfo) return 
-EAFNOSUPPORT; diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index b615729812e5..eb8181987620 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -48,21 +48,30 @@ static int xfrmi_dev_init(struct net_device *dev); static void xfrmi_dev_setup(struct net_device *dev); static struct rtnl_link_ops xfrmi_link_ops __read_mostly; static unsigned int xfrmi_net_id __read_mostly; +static const struct net_device_ops xfrmi_netdev_ops; + +#define XFRMI_HASH_BITS 8 +#define XFRMI_HASH_SIZE BIT(XFRMI_HASH_BITS) struct xfrmi_net { /* lists for storing interfaces in use */ - struct xfrm_if __rcu *xfrmi[1]; + struct xfrm_if __rcu *xfrmi[XFRMI_HASH_SIZE]; }; #define for_each_xfrmi_rcu(start, xi) \ for (xi = rcu_dereference(start); xi; xi = rcu_dereference(xi->next)) +static u32 xfrmi_hash(u32 if_id) +{ + return hash_32(if_id, XFRMI_HASH_BITS); +} + static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); struct xfrm_if *xi; - for_each_xfrmi_rcu(xfrmn->xfrmi[0], xi) { + for_each_xfrmi_rcu(xfrmn->xfrmi[xfrmi_hash(x->if_id)], xi) { if (x->if_id == xi->p.if_id && (xi->dev->flags & IFF_UP)) return xi; @@ -74,8 +83,7 @@ static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x) static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb, unsigned short family) { - struct xfrmi_net *xfrmn; - struct xfrm_if *xi; + struct net_device *dev; int ifindex = 0; if (!secpath_exists(skb) || !skb->dev) @@ -89,23 +97,26 @@ static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb, ifindex = inet_sdif(skb); break; } - if (!ifindex) - ifindex = skb->dev->ifindex; - xfrmn = net_generic(xs_net(xfrm_input_state(skb)), xfrmi_net_id); + if (ifindex) { + struct net *net = xs_net(xfrm_input_state(skb)); - for_each_xfrmi_rcu(xfrmn->xfrmi[0], xi) { - if (ifindex == xi->dev->ifindex && - (xi->dev->flags & IFF_UP)) - return xi; + dev = dev_get_by_index_rcu(net, ifindex); + } else { + dev = skb->dev; } - return NULL; + if (!dev || !(dev->flags & IFF_UP)) + return NULL; + if (dev->netdev_ops != &xfrmi_netdev_ops) + return NULL; + + return netdev_priv(dev); } static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi) { - struct xfrm_if __rcu **xip = &xfrmn->xfrmi[0]; + struct xfrm_if __rcu **xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; rcu_assign_pointer(xi->next , rtnl_dereference(*xip)); rcu_assign_pointer(*xip, xi); @@ -116,7 +127,7 @@ static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi) struct xfrm_if __rcu **xip; struct xfrm_if *iter; - for (xip = &xfrmn->xfrmi[0]; + for (xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; (iter = rtnl_dereference(*xip)) != NULL; xip = &iter->next) { if (xi == iter) { @@ -160,7 +171,7 @@ static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p) struct xfrm_if *xi; struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); - for (xip = &xfrmn->xfrmi[0]; + for (xip = &xfrmn->xfrmi[xfrmi_hash(p->if_id)]; (xi = rtnl_dereference(*xip)) != NULL; xip = &xi->next) if (xi->p.if_id == p->if_id) @@ -760,11 +771,14 @@ static void __net_exit xfrmi_exit_batch_net(struct list_head *net_exit_list) struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); struct xfrm_if __rcu **xip; struct xfrm_if *xi; + int i; - for (xip = &xfrmn->xfrmi[0]; - (xi = rtnl_dereference(*xip)) != NULL; - xip = &xi->next) - unregister_netdevice_queue(xi->dev, &list); + for (i = 0; i < XFRMI_HASH_SIZE; i++) { + for (xip = &xfrmn->xfrmi[i]; + (xi = 
rtnl_dereference(*xip)) != NULL; + xip = &xi->next) + unregister_netdevice_queue(xi->dev, &list); + } } unregister_netdevice_many(&list); rtnl_unlock(); @@ -800,6 +814,33 @@ static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = { .priority = 10, }; +#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) +static int xfrmi6_rcv_tunnel(struct sk_buff *skb) +{ + const xfrm_address_t *saddr; + __be32 spi; + + saddr = (const xfrm_address_t *)&ipv6_hdr(skb)->saddr; + spi = xfrm6_tunnel_spi_lookup(dev_net(skb->dev), saddr); + + return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); +} + +static struct xfrm6_tunnel xfrmi_ipv6_handler __read_mostly = { + .handler = xfrmi6_rcv_tunnel, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi6_err, + .priority = -1, +}; + +static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = { + .handler = xfrmi6_rcv_tunnel, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi6_err, + .priority = -1, +}; +#endif + static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = { .handler = xfrm4_rcv, .input_handler = xfrm_input, @@ -824,6 +865,27 @@ static struct xfrm4_protocol xfrmi_ipcomp4_protocol __read_mostly = { .priority = 10, }; +#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) +static int xfrmi4_rcv_tunnel(struct sk_buff *skb) +{ + return xfrm4_rcv_spi(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr); +} + +static struct xfrm_tunnel xfrmi_ipip_handler __read_mostly = { + .handler = xfrmi4_rcv_tunnel, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi4_err, + .priority = -1, +}; + +static struct xfrm_tunnel xfrmi_ipip6_handler __read_mostly = { + .handler = xfrmi4_rcv_tunnel, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi4_err, + .priority = -1, +}; +#endif + static int __init xfrmi4_init(void) { int err; @@ -837,9 +899,23 @@ static int __init xfrmi4_init(void) err = xfrm4_protocol_register(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; +#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) + err = xfrm4_tunnel_register(&xfrmi_ipip_handler, AF_INET); + if (err < 0) + goto xfrm_tunnel_ipip_failed; + err = xfrm4_tunnel_register(&xfrmi_ipip6_handler, AF_INET6); + if (err < 0) + goto xfrm_tunnel_ipip6_failed; +#endif return 0; +#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) +xfrm_tunnel_ipip6_failed: + xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); +xfrm_tunnel_ipip_failed: + xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); +#endif xfrm_proto_comp_failed: xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); xfrm_proto_ah_failed: @@ -850,6 +926,10 @@ xfrm_proto_esp_failed: static void xfrmi4_fini(void) { +#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) + xfrm4_tunnel_deregister(&xfrmi_ipip6_handler, AF_INET6); + xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); +#endif xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); @@ -868,9 +948,23 @@ static int __init xfrmi6_init(void) err = xfrm6_protocol_register(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; +#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) + err = xfrm6_tunnel_register(&xfrmi_ipv6_handler, AF_INET6); + if (err < 0) + goto xfrm_tunnel_ipv6_failed; + err = xfrm6_tunnel_register(&xfrmi_ip6ip_handler, AF_INET); + if (err < 0) + goto xfrm_tunnel_ip6ip_failed; +#endif return 0; +#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) +xfrm_tunnel_ip6ip_failed: + xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); 
+xfrm_tunnel_ipv6_failed: + xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); +#endif xfrm_proto_comp_failed: xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); xfrm_proto_ah_failed: @@ -881,6 +975,10 @@ xfrm_proto_esp_failed: static void xfrmi6_fini(void) { +#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) + xfrm6_tunnel_deregister(&xfrmi_ip6ip_handler, AF_INET); + xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); +#endif xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 19c5e0fa3f44..042ea9b40c7b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2751,6 +2751,7 @@ static void xfrm_policy_queue_process(struct timer_list *t) struct xfrm_policy_queue *pq = &pol->polq; struct flowi fl; struct sk_buff_head list; + __u32 skb_mark; spin_lock(&pq->hold_queue.lock); skb = skb_peek(&pq->hold_queue); @@ -2760,7 +2761,12 @@ static void xfrm_policy_queue_process(struct timer_list *t) } dst = skb_dst(skb); sk = skb->sk; + + /* Fixup the mark to support VTI. */ + skb_mark = skb->mark; + skb->mark = pol->mark.v; xfrm_decode_session(skb, &fl, dst->ops->family); + skb->mark = skb_mark; spin_unlock(&pq->hold_queue.lock); dst_hold(xfrm_dst_path(dst)); @@ -2792,7 +2798,12 @@ static void xfrm_policy_queue_process(struct timer_list *t) while (!skb_queue_empty(&list)) { skb = __skb_dequeue(&list); + /* Fixup the mark to support VTI. */ + skb_mark = skb->mark; + skb->mark = pol->mark.v; xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family); + skb->mark = skb_mark; + dst_hold(xfrm_dst_path(skb_dst(skb))); dst = xfrm_lookup(net, xfrm_dst_path(skb_dst(skb)), &fl, skb->sk, 0); if (IS_ERR(dst)) { diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 98943f8d01aa..c6a4338a0d08 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -89,7 +89,8 @@ static int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) if (x->type->flags & XFRM_TYPE_REPLAY_PROT) { XFRM_SKB_CB(skb)->seq.output.low = ++x->replay.oseq; XFRM_SKB_CB(skb)->seq.output.hi = 0; - if (unlikely(x->replay.oseq == 0)) { + if (unlikely(x->replay.oseq == 0) && + !(x->props.extra_flags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP)) { x->replay.oseq--; xfrm_audit_state_replay_overflow(x, skb); err = -EOVERFLOW; @@ -168,7 +169,8 @@ static int xfrm_replay_overflow_bmp(struct xfrm_state *x, struct sk_buff *skb) if (x->type->flags & XFRM_TYPE_REPLAY_PROT) { XFRM_SKB_CB(skb)->seq.output.low = ++replay_esn->oseq; XFRM_SKB_CB(skb)->seq.output.hi = 0; - if (unlikely(replay_esn->oseq == 0)) { + if (unlikely(replay_esn->oseq == 0) && + !(x->props.extra_flags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP)) { replay_esn->oseq--; xfrm_audit_state_replay_overflow(x, skb); err = -EOVERFLOW; @@ -572,7 +574,8 @@ static int xfrm_replay_overflow_offload(struct xfrm_state *x, struct sk_buff *sk XFRM_SKB_CB(skb)->seq.output.hi = 0; xo->seq.hi = 0; - if (unlikely(oseq < x->replay.oseq)) { + if (unlikely(oseq < x->replay.oseq) && + !(x->props.extra_flags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP)) { xfrm_audit_state_replay_overflow(x, skb); err = -EOVERFLOW; @@ -611,7 +614,8 @@ static int xfrm_replay_overflow_offload_bmp(struct xfrm_state *x, struct sk_buff XFRM_SKB_CB(skb)->seq.output.hi = 0; xo->seq.hi = 0; - if (unlikely(oseq < replay_esn->oseq)) { + if (unlikely(oseq < replay_esn->oseq) && + !(x->props.extra_flags & 
XFRM_SA_XFLAG_OSEQ_MAY_WRAP)) { xfrm_audit_state_replay_overflow(x, skb); err = -EOVERFLOW; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 8be2d926acc2..69520ad3d83b 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2264,7 +2264,7 @@ static bool km_is_alive(const struct km_event *c) return is_alive; } -int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen) +int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval, int optlen) { int err; u8 *data; @@ -2274,7 +2274,7 @@ int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen if (in_compat_syscall()) return -EOPNOTSUPP; - if (!optval && !optlen) { + if (sockptr_is_null(optval) && !optlen) { xfrm_sk_policy_insert(sk, XFRM_POLICY_IN, NULL); xfrm_sk_policy_insert(sk, XFRM_POLICY_OUT, NULL); __sk_dst_reset(sk); @@ -2284,7 +2284,7 @@ int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen if (optlen <= 0 || optlen > PAGE_SIZE) return -EMSGSIZE; - data = memdup_user(optval, optlen); + data = memdup_sockptr(optval, optlen); if (IS_ERR(data)) return PTR_ERR(data);
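
The most pervasive pattern in the hunks above (atm, x25, tls, vsock, xsk, xfrm) is the conversion of setsockopt paths from char __user *optval with copy_from_user()/get_user() to sockptr_t with copy_from_sockptr(), so one handler can be reached with either a user-space or an in-kernel buffer. The following is a minimal userspace analogue of that tagged-pointer idea, compilable on its own; optptr_t, OPT_USER(), OPT_KERNEL(), and opt_copy_from() are hypothetical names for this sketch, not the kernel API.

#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

/* Hypothetical analogue of the kernel's sockptr_t: a pointer tagged
 * with its origin, so one copy helper serves both kinds of caller. */
typedef struct {
	void *ptr;
	bool  is_kernel;	/* true: in-kernel buffer, false: user buffer */
} optptr_t;

static optptr_t OPT_USER(void *p)   { return (optptr_t){ .ptr = p, .is_kernel = false }; }
static optptr_t OPT_KERNEL(void *p) { return (optptr_t){ .ptr = p, .is_kernel = true  }; }

static bool opt_is_null(optptr_t src) { return src.ptr == NULL; }

/* In the kernel the user branch would be copy_from_user(); here both
 * branches degenerate to memcpy() since there is no address-space split. */
static int opt_copy_from(void *dst, optptr_t src, size_t len)
{
	if (opt_is_null(src))
		return -1;		/* kernel code would return -EFAULT */
	memcpy(dst, src.ptr, len);
	return 0;
}

/* One setsockopt-style handler now serves both call sites. */
static int set_option(optptr_t optval, size_t optlen)
{
	int val;

	if (optlen < sizeof(val))
		return -1;		/* -EINVAL in the kernel */
	if (opt_copy_from(&val, optval, sizeof(val)))
		return -2;		/* -EFAULT in the kernel */
	printf("option set to %d\n", val);
	return 0;
}

int main(void)
{
	int from_user = 7, from_kernel = 42;

	set_option(OPT_USER(&from_user), sizeof(from_user));
	set_option(OPT_KERNEL(&from_kernel), sizeof(from_kernel));
	return 0;
}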
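
The xsk getsockopt hunk extends XDP_STATISTICS without breaking callers built against the old layout: the kernel checks the caller's optlen, copies only the xdp_statistics_v1 prefix for old callers, and folds the new rx_queue_full counter into rx_dropped so those callers do not lose events. Below is a standalone sketch of that length-based struct versioning; the struct and function names are illustrative only.

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>
#include <sys/types.h>

/* Old layout: what binaries compiled against the v1 ABI pass in. */
struct stats_v1 {
	uint64_t rx_dropped;
	uint64_t rx_invalid;
	uint64_t tx_invalid;
};

/* New layout: the v1 fields first, extensions strictly appended. */
struct stats_v2 {
	uint64_t rx_dropped;
	uint64_t rx_invalid;
	uint64_t tx_invalid;
	uint64_t rx_ring_full;		/* new counter */
};

/* Fill the caller's buffer according to how much room it declared.
 * Returns bytes written, or -1 if the buffer cannot even hold the
 * v1 layout (the kernel returns -EINVAL in that case). */
static ssize_t get_stats(void *buf, size_t len)
{
	struct stats_v2 s = { .rx_dropped = 3, .rx_invalid = 1,
			      .tx_invalid = 0, .rx_ring_full = 9 };
	size_t out;

	if (len < sizeof(struct stats_v1))
		return -1;
	if (len < sizeof(struct stats_v2)) {
		/* Old caller: fold the new counter into the old one so
		 * no drop event disappears, then copy the v1 prefix. */
		s.rx_dropped += s.rx_ring_full;
		out = sizeof(struct stats_v1);
	} else {
		out = sizeof(struct stats_v2);
	}
	memcpy(buf, &s, out);
	return (ssize_t)out;
}

int main(void)
{
	struct stats_v1 old = { 0 };
	struct stats_v2 cur = { 0 };

	get_stats(&old, sizeof(old));
	get_stats(&cur, sizeof(cur));
	printf("v1 caller sees rx_dropped=%llu; v2 caller sees rx_dropped=%llu, ring_full=%llu\n",
	       (unsigned long long)old.rx_dropped,
	       (unsigned long long)cur.rx_dropped,
	       (unsigned long long)cur.rx_ring_full);
	return 0;
}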
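
The xfrm_interface hunk replaces the single xfrmi[1] list with a 256-bucket table keyed by hash_32(if_id, 8), so the per-packet interface lookup walks one bucket instead of every interface. Here is a self-contained sketch of that fixed-size bucket scheme, using the same multiplicative constant the kernel's hash_32() is built on; struct iface and the link/lookup helper names are made up for the example.

#include <stdint.h>
#include <stdio.h>

#define HASH_BITS 8
#define HASH_SIZE (1u << HASH_BITS)
#define GOLDEN_RATIO_32 0x61C88647u	/* constant from the kernel's hash_32() */

/* Multiplicative hash: spread if_id over HASH_BITS bucket bits. */
static uint32_t hash_if_id(uint32_t if_id)
{
	return (if_id * GOLDEN_RATIO_32) >> (32 - HASH_BITS);
}

/* Illustrative stand-in for struct xfrm_if: an interface keyed by if_id. */
struct iface {
	uint32_t if_id;
	struct iface *next;	/* chain within one bucket */
};

static struct iface *buckets[HASH_SIZE];

static void iface_link(struct iface *xi)
{
	uint32_t slot = hash_if_id(xi->if_id);

	xi->next = buckets[slot];
	buckets[slot] = xi;
}

/* The lookup now scans a single bucket's chain. */
static struct iface *iface_lookup(uint32_t if_id)
{
	struct iface *xi;

	for (xi = buckets[hash_if_id(if_id)]; xi; xi = xi->next)
		if (xi->if_id == if_id)
			return xi;
	return NULL;
}

int main(void)
{
	struct iface a = { .if_id = 1 }, b = { .if_id = 257 };

	iface_link(&a);
	iface_link(&b);
	printf("if_id 257 -> %p (bucket %u)\n",
	       (void *)iface_lookup(257), hash_if_id(257));
	return 0;
}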
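
Finally, the xfrm_replay hunks gate the output-sequence overflow error on XFRM_SA_XFLAG_OSEQ_MAY_WRAP: an SA that opts in keeps transmitting after the 32-bit counter wraps to zero rather than failing with -EOVERFLOW. A compact standalone sketch of that policy check follows, assuming a simplified SA state; the flag value and names here are placeholders, not the kernel definitions.

#include <stdint.h>
#include <stdio.h>

#define XFLAG_OSEQ_MAY_WRAP 0x1	/* stand-in for the kernel flag bit */

struct sa_state {
	uint32_t oseq;		/* output sequence number */
	uint32_t extra_flags;
};

/* Advance the output sequence number, mirroring the hunk's logic:
 * a wrap to zero is fatal only when the SA did not opt in to wrapping. */
static int next_oseq(struct sa_state *x, uint32_t *seq)
{
	*seq = ++x->oseq;
	if (*seq == 0 && !(x->extra_flags & XFLAG_OSEQ_MAY_WRAP)) {
		x->oseq--;	/* undo, as the kernel does before erroring */
		return -1;	/* -EOVERFLOW in the kernel */
	}
	return 0;
}

int main(void)
{
	struct sa_state strict = { .oseq = UINT32_MAX };
	struct sa_state wrapping = { .oseq = UINT32_MAX,
				     .extra_flags = XFLAG_OSEQ_MAY_WRAP };
	uint32_t seq;

	printf("strict:   %d\n", next_oseq(&strict, &seq));		/* -1 */
	printf("wrapping: %d (seq=%u)\n", next_oseq(&wrapping, &seq), seq); /* 0, seq=0 */
	return 0;
}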