summaryrefslogtreecommitdiffstats
path: root/Documentation/networking
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/networking')
-rw-r--r--Documentation/networking/af_xdp.rst38
-rw-r--r--Documentation/networking/batman-adv.rst2
-rw-r--r--Documentation/networking/bonding.rst12
-rw-r--r--Documentation/networking/caif/caif.rst4
-rw-r--r--Documentation/networking/device_drivers/cellular/qualcomm/rmnet.rst126
-rw-r--r--Documentation/networking/device_drivers/ethernet/amazon/ena.rst164
-rw-r--r--Documentation/networking/device_drivers/ethernet/freescale/dpaa2/dpio-driver.rst1
-rw-r--r--Documentation/networking/device_drivers/ethernet/freescale/dpaa2/index.rst1
-rw-r--r--Documentation/networking/device_drivers/ethernet/freescale/dpaa2/switch-driver.rst217
-rw-r--r--Documentation/networking/device_drivers/ethernet/google/gve.rst53
-rw-r--r--Documentation/networking/device_drivers/ethernet/intel/i40e.rst6
-rw-r--r--Documentation/networking/device_drivers/ethernet/intel/iavf.rst2
-rw-r--r--Documentation/networking/device_drivers/ethernet/intel/ice.rst2
-rw-r--r--Documentation/networking/device_drivers/ethernet/mellanox/mlx5.rst132
-rw-r--r--Documentation/networking/device_drivers/index.rst1
-rw-r--r--Documentation/networking/device_drivers/wwan/index.rst18
-rw-r--r--Documentation/networking/device_drivers/wwan/iosm.rst96
-rw-r--r--Documentation/networking/devlink/devlink-params.rst12
-rw-r--r--Documentation/networking/devlink/devlink-port.rst35
-rw-r--r--Documentation/networking/devlink/devlink-region.rst2
-rw-r--r--Documentation/networking/devlink/devlink-trap.rst5
-rw-r--r--Documentation/networking/devlink/hns3.rst25
-rw-r--r--Documentation/networking/devlink/ice.rst9
-rw-r--r--Documentation/networking/devlink/index.rst3
-rw-r--r--Documentation/networking/devlink/netdevsim.rst26
-rw-r--r--Documentation/networking/devlink/prestera.rst141
-rw-r--r--Documentation/networking/devlink/sja1105.rst49
-rw-r--r--Documentation/networking/dsa/configuration.rst68
-rw-r--r--Documentation/networking/dsa/dsa.rst50
-rw-r--r--Documentation/networking/dsa/sja1105.rst281
-rw-r--r--Documentation/networking/ethtool-netlink.rst53
-rw-r--r--Documentation/networking/filter.rst27
-rw-r--r--Documentation/networking/index.rst2
-rw-r--r--Documentation/networking/ioam6-sysctl.rst26
-rw-r--r--Documentation/networking/ip-sysctl.rst114
-rw-r--r--Documentation/networking/mctp.rst213
-rw-r--r--Documentation/networking/mptcp-sysctl.rst41
-rw-r--r--Documentation/networking/netdev-FAQ.rst17
-rw-r--r--Documentation/networking/netdevices.rst29
-rw-r--r--Documentation/networking/nf_conntrack-sysctl.rst40
-rw-r--r--Documentation/networking/operstates.rst6
-rw-r--r--Documentation/networking/packet_mmap.rst2
-rw-r--r--Documentation/networking/phy.rst6
-rw-r--r--Documentation/networking/pktgen.rst18
-rw-r--r--Documentation/networking/timestamping.rst6
-rw-r--r--Documentation/networking/tipc.rst121
-rw-r--r--Documentation/networking/tuntap.rst2
-rw-r--r--Documentation/networking/vrf.rst13
48 files changed, 1843 insertions, 474 deletions
diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst
index 2ccc5644cc98..60b217b436be 100644
--- a/Documentation/networking/af_xdp.rst
+++ b/Documentation/networking/af_xdp.rst
@@ -243,8 +243,8 @@ Configuration Flags and Socket Options
These are the various configuration flags that can be used to control
and monitor the behavior of AF_XDP sockets.
-XDP_COPY and XDP_ZERO_COPY bind flags
--------------------------------------
+XDP_COPY and XDP_ZEROCOPY bind flags
+------------------------------------
When you bind to a socket, the kernel will first try to use zero-copy
copy. If zero-copy is not supported, it will fall back on using copy
@@ -252,7 +252,7 @@ mode, i.e. copying all packets out to user space. But if you would
like to force a certain mode, you can use the following flags. If you
pass the XDP_COPY flag to the bind call, the kernel will force the
socket into copy mode. If it cannot use copy mode, the bind call will
-fail with an error. Conversely, the XDP_ZERO_COPY flag will force the
+fail with an error. Conversely, the XDP_ZEROCOPY flag will force the
socket into zero-copy mode or fail.
XDP_SHARED_UMEM bind flag
@@ -290,19 +290,19 @@ round-robin example of distributing packets is shown below:
#define MAX_SOCKS 16
struct {
- __uint(type, BPF_MAP_TYPE_XSKMAP);
- __uint(max_entries, MAX_SOCKS);
- __uint(key_size, sizeof(int));
- __uint(value_size, sizeof(int));
+ __uint(type, BPF_MAP_TYPE_XSKMAP);
+ __uint(max_entries, MAX_SOCKS);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
} xsks_map SEC(".maps");
static unsigned int rr;
SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
{
- rr = (rr + 1) & (MAX_SOCKS - 1);
+ rr = (rr + 1) & (MAX_SOCKS - 1);
- return bpf_redirect_map(&xsks_map, rr, XDP_DROP);
+ return bpf_redirect_map(&xsks_map, rr, XDP_DROP);
}
Note, that since there is only a single set of FILL and COMPLETION
@@ -379,7 +379,7 @@ would look like this for the TX path:
.. code-block:: c
if (xsk_ring_prod__needs_wakeup(&my_tx_ring))
- sendto(xsk_socket__fd(xsk_handle), NULL, 0, MSG_DONTWAIT, NULL, 0);
+ sendto(xsk_socket__fd(xsk_handle), NULL, 0, MSG_DONTWAIT, NULL, 0);
I.e., only use the syscall if the flag is set.
@@ -442,9 +442,9 @@ purposes. The supported statistics are shown below:
.. code-block:: c
struct xdp_statistics {
- __u64 rx_dropped; /* Dropped for reasons other than invalid desc */
- __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */
- __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */
+ __u64 rx_dropped; /* Dropped for reasons other than invalid desc */
+ __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */
+ __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */
};
XDP_OPTIONS getsockopt
@@ -483,15 +483,15 @@ like this:
.. code-block:: c
// struct xdp_rxtx_ring {
- // __u32 *producer;
- // __u32 *consumer;
- // struct xdp_desc *desc;
+ // __u32 *producer;
+ // __u32 *consumer;
+ // struct xdp_desc *desc;
// };
// struct xdp_umem_ring {
- // __u32 *producer;
- // __u32 *consumer;
- // __u64 *desc;
+ // __u32 *producer;
+ // __u32 *consumer;
+ // __u64 *desc;
// };
// typedef struct xdp_rxtx_ring RING;
diff --git a/Documentation/networking/batman-adv.rst b/Documentation/networking/batman-adv.rst
index 74821d29a22f..b85563ea3682 100644
--- a/Documentation/networking/batman-adv.rst
+++ b/Documentation/networking/batman-adv.rst
@@ -157,7 +157,7 @@ Contact
Please send us comments, experiences, questions, anything :)
IRC:
- #batman on irc.freenode.org
+ #batadv on ircs://irc.hackint.org/
Mailing-list:
b.a.t.m.a.n@open-mesh.org (optional subscription at
https://lists.open-mesh.org/mailman3/postorius/lists/b.a.t.m.a.n.lists.open-mesh.org/)
diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst
index 62f2aab8eaec..31cfd7d674a6 100644
--- a/Documentation/networking/bonding.rst
+++ b/Documentation/networking/bonding.rst
@@ -501,6 +501,18 @@ fail_over_mac
This option was added in bonding version 3.2.0. The "follow"
policy was added in bonding version 3.3.0.
+lacp_active
+ Option specifying whether to send LACPDU frames periodically.
+
+ off or 0
+ LACPDU frames acts as "speak when spoken to".
+
+ on or 1
+ LACPDU frames are sent along the configured links
+ periodically. See lacp_rate for more details.
+
+ The default is on.
+
lacp_rate
Option specifying the rate in which we'll ask our link partner
diff --git a/Documentation/networking/caif/caif.rst b/Documentation/networking/caif/caif.rst
index 81a14373d780..d922d419c513 100644
--- a/Documentation/networking/caif/caif.rst
+++ b/Documentation/networking/caif/caif.rst
@@ -69,9 +69,9 @@ There are debugfs parameters provided for serial communication.
- 0x01 - tty->warned is on.
- 0x04 - tty->packed is on.
- - 0x08 - tty->flow_stopped is on.
+ - 0x08 - tty->flow.tco_stopped is on.
- 0x10 - tty->hw_stopped is on.
- - 0x20 - tty->stopped is on.
+ - 0x20 - tty->flow.stopped is on.
* last_tx_msg: Binary blob Prints the last transmitted frame.
diff --git a/Documentation/networking/device_drivers/cellular/qualcomm/rmnet.rst b/Documentation/networking/device_drivers/cellular/qualcomm/rmnet.rst
index 70643b58de05..4118384cf8eb 100644
--- a/Documentation/networking/device_drivers/cellular/qualcomm/rmnet.rst
+++ b/Documentation/networking/device_drivers/cellular/qualcomm/rmnet.rst
@@ -27,34 +27,136 @@ these MAP frames and send them to appropriate PDN's.
2. Packet format
================
-a. MAP packet (data / control)
+a. MAP packet v1 (data / control)
-MAP header has the same endianness of the IP packet.
+MAP header fields are in big endian format.
Packet format::
- Bit 0 1 2-7 8 - 15 16 - 31
+ Bit 0 1 2-7 8-15 16-31
Function Command / Data Reserved Pad Multiplexer ID Payload length
- Bit 32 - x
- Function Raw Bytes
+
+ Bit 32-x
+ Function Raw bytes
Command (1)/ Data (0) bit value is to indicate if the packet is a MAP command
-or data packet. Control packet is used for transport level flow control. Data
+or data packet. Command packet is used for transport level flow control. Data
packets are standard IP packets.
-Reserved bits are usually zeroed out and to be ignored by receiver.
+Reserved bits must be zero when sent and ignored when received.
-Padding is number of bytes to be added for 4 byte alignment if required by
-hardware.
+Padding is the number of bytes to be appended to the payload to
+ensure 4 byte alignment.
Multiplexer ID is to indicate the PDN on which data has to be sent.
Payload length includes the padding length but does not include MAP header
length.
-b. MAP packet (command specific)::
+b. Map packet v4 (data / control)
+
+MAP header fields are in big endian format.
+
+Packet format::
+
+ Bit 0 1 2-7 8-15 16-31
+ Function Command / Data Reserved Pad Multiplexer ID Payload length
+
+ Bit 32-(x-33) (x-32)-x
+ Function Raw bytes Checksum offload header
+
+Command (1)/ Data (0) bit value is to indicate if the packet is a MAP command
+or data packet. Command packet is used for transport level flow control. Data
+packets are standard IP packets.
+
+Reserved bits must be zero when sent and ignored when received.
+
+Padding is the number of bytes to be appended to the payload to
+ensure 4 byte alignment.
+
+Multiplexer ID is to indicate the PDN on which data has to be sent.
+
+Payload length includes the padding length but does not include MAP header
+length.
+
+Checksum offload header, has the information about the checksum processing done
+by the hardware.Checksum offload header fields are in big endian format.
+
+Packet format::
+
+ Bit 0-14 15 16-31
+ Function Reserved Valid Checksum start offset
+
+ Bit 31-47 48-64
+ Function Checksum length Checksum value
+
+Reserved bits must be zero when sent and ignored when received.
+
+Valid bit indicates whether the partial checksum is calculated and is valid.
+Set to 1, if its is valid. Set to 0 otherwise.
+
+Padding is the number of bytes to be appended to the payload to
+ensure 4 byte alignment.
+
+Checksum start offset, Indicates the offset in bytes from the beginning of the
+IP header, from which modem computed checksum.
+
+Checksum length is the Length in bytes starting from CKSUM_START_OFFSET,
+over which checksum is computed.
+
+Checksum value, indicates the checksum computed.
+
+c. MAP packet v5 (data / control)
+
+MAP header fields are in big endian format.
+
+Packet format::
+
+ Bit 0 1 2-7 8-15 16-31
+ Function Command / Data Next header Pad Multiplexer ID Payload length
+
+ Bit 32-x
+ Function Raw bytes
+
+Command (1)/ Data (0) bit value is to indicate if the packet is a MAP command
+or data packet. Command packet is used for transport level flow control. Data
+packets are standard IP packets.
+
+Next header is used to indicate the presence of another header, currently is
+limited to checksum header.
+
+Padding is the number of bytes to be appended to the payload to
+ensure 4 byte alignment.
+
+Multiplexer ID is to indicate the PDN on which data has to be sent.
+
+Payload length includes the padding length but does not include MAP header
+length.
+
+d. Checksum offload header v5
+
+Checksum offload header fields are in big endian format.
+
+ Bit 0 - 6 7 8-15 16-31
+ Function Header Type Next Header Checksum Valid Reserved
+
+Header Type is to indicate the type of header, this usually is set to CHECKSUM
+
+Header types
+= ==========================================
+0 Reserved
+1 Reserved
+2 checksum header
+
+Checksum Valid is to indicate whether the header checksum is valid. Value of 1
+implies that checksum is calculated on this packet and is valid, value of 0
+indicates that the calculated packet checksum is invalid.
+
+Reserved bits must be zero when sent and ignored when received.
+
+e. MAP packet v1/v5 (command specific)::
- Bit 0 1 2-7 8 - 15 16 - 31
+ Bit 0 1 2-7 8 - 15 16 - 31
Function Command Reserved Pad Multiplexer ID Payload length
Bit 32 - 39 40 - 45 46 - 47 48 - 63
Function Command name Reserved Command Type Reserved
@@ -74,7 +176,7 @@ Command types
3 is for error during processing of commands
= ==========================================
-c. Aggregation
+f. Aggregation
Aggregation is multiple MAP packets (can be data or command) delivered to
rmnet in a single linear skb. rmnet will process the individual
diff --git a/Documentation/networking/device_drivers/ethernet/amazon/ena.rst b/Documentation/networking/device_drivers/ethernet/amazon/ena.rst
index f8c6469f2bd2..01b2a69b0cb0 100644
--- a/Documentation/networking/device_drivers/ethernet/amazon/ena.rst
+++ b/Documentation/networking/device_drivers/ethernet/amazon/ena.rst
@@ -11,12 +11,12 @@ ENA is a networking interface designed to make good use of modern CPU
features and system architectures.
The ENA device exposes a lightweight management interface with a
-minimal set of memory mapped registers and extendable command set
+minimal set of memory mapped registers and extendible command set
through an Admin Queue.
The driver supports a range of ENA devices, is link-speed independent
-(i.e., the same driver is used for 10GbE, 25GbE, 40GbE, etc.), and has
-a negotiated and extendable feature set.
+(i.e., the same driver is used for 10GbE, 25GbE, 40GbE, etc), and has
+a negotiated and extendible feature set.
Some ENA devices support SR-IOV. This driver is used for both the
SR-IOV Physical Function (PF) and Virtual Function (VF) devices.
@@ -27,9 +27,9 @@ is advertised by the device via the Admin Queue), a dedicated MSI-X
interrupt vector per Tx/Rx queue pair, adaptive interrupt moderation,
and CPU cacheline optimized data placement.
-The ENA driver supports industry standard TCP/IP offload features such
-as checksum offload and TCP transmit segmentation offload (TSO).
-Receive-side scaling (RSS) is supported for multi-core scaling.
+The ENA driver supports industry standard TCP/IP offload features such as
+checksum offload. Receive-side scaling (RSS) is supported for multi-core
+scaling.
The ENA driver and its corresponding devices implement health
monitoring mechanisms such as watchdog, enabling the device and driver
@@ -38,22 +38,20 @@ debug logs.
Some of the ENA devices support a working mode called Low-latency
Queue (LLQ), which saves several more microseconds.
-
ENA Source Code Directory Structure
===================================
================= ======================================================
ena_com.[ch] Management communication layer. This layer is
- responsible for the handling all the management
- (admin) communication between the device and the
- driver.
+ responsible for the handling all the management
+ (admin) communication between the device and the
+ driver.
ena_eth_com.[ch] Tx/Rx data path.
ena_admin_defs.h Definition of ENA management interface.
ena_eth_io_defs.h Definition of ENA data path interface.
ena_common_defs.h Common definitions for ena_com layer.
ena_regs_defs.h Definition of ENA PCI memory-mapped (MMIO) registers.
ena_netdev.[ch] Main Linux kernel driver.
-ena_syfsfs.[ch] Sysfs files.
ena_ethtool.c ethtool callbacks.
ena_pci_id_tbl.h Supported device IDs.
================= ======================================================
@@ -69,7 +67,7 @@ ENA management interface is exposed by means of:
- Asynchronous Event Notification Queue (AENQ)
ENA device MMIO Registers are accessed only during driver
-initialization and are not involved in further normal device
+initialization and are not used during further normal device
operation.
AQ is used for submitting management commands, and the
@@ -100,28 +98,27 @@ group may have multiple syndromes, as shown below
The events are:
- ==================== ===============
- Group Syndrome
- ==================== ===============
- Link state change **X**
- Fatal error **X**
- Notification Suspend traffic
- Notification Resume traffic
- Keep-Alive **X**
- ==================== ===============
+==================== ===============
+Group Syndrome
+==================== ===============
+Link state change **X**
+Fatal error **X**
+Notification Suspend traffic
+Notification Resume traffic
+Keep-Alive **X**
+==================== ===============
ACQ and AENQ share the same MSI-X vector.
-Keep-Alive is a special mechanism that allows monitoring of the
-device's health. The driver maintains a watchdog (WD) handler which,
-if fired, logs the current state and statistics then resets and
-restarts the ENA device and driver. A Keep-Alive event is delivered by
-the device every second. The driver re-arms the WD upon reception of a
-Keep-Alive event. A missed Keep-Alive event causes the WD handler to
-fire.
+Keep-Alive is a special mechanism that allows monitoring the device's health.
+A Keep-Alive event is delivered by the device every second.
+The driver maintains a watchdog (WD) handler which logs the current state and
+statistics. If the keep-alive events aren't delivered as expected the WD resets
+the device and the driver.
Data Path Interface
===================
+
I/O operations are based on Tx and Rx Submission Queues (Tx SQ and Rx
SQ correspondingly). Each SQ has a completion queue (CQ) associated
with it.
@@ -131,26 +128,24 @@ physical memory.
The ENA driver supports two Queue Operation modes for Tx SQs:
-- Regular mode
+- **Regular mode:**
+ In this mode the Tx SQs reside in the host's memory. The ENA
+ device fetches the ENA Tx descriptors and packet data from host
+ memory.
- * In this mode the Tx SQs reside in the host's memory. The ENA
- device fetches the ENA Tx descriptors and packet data from host
- memory.
+- **Low Latency Queue (LLQ) mode or "push-mode":**
+ In this mode the driver pushes the transmit descriptors and the
+ first 128 bytes of the packet directly to the ENA device memory
+ space. The rest of the packet payload is fetched by the
+ device. For this operation mode, the driver uses a dedicated PCI
+ device memory BAR, which is mapped with write-combine capability.
-- Low Latency Queue (LLQ) mode or "push-mode".
-
- * In this mode the driver pushes the transmit descriptors and the
- first 128 bytes of the packet directly to the ENA device memory
- space. The rest of the packet payload is fetched by the
- device. For this operation mode, the driver uses a dedicated PCI
- device memory BAR, which is mapped with write-combine capability.
+ **Note that** not all ENA devices support LLQ, and this feature is negotiated
+ with the device upon initialization. If the ENA device does not
+ support LLQ mode, the driver falls back to the regular mode.
The Rx SQs support only the regular mode.
-Note: Not all ENA devices support LLQ, and this feature is negotiated
- with the device upon initialization. If the ENA device does not
- support LLQ mode, the driver falls back to the regular mode.
-
The driver supports multi-queue for both Tx and Rx. This has various
benefits:
@@ -165,6 +160,7 @@ benefits:
Interrupt Modes
===============
+
The driver assigns a single MSI-X vector per queue pair (for both Tx
and Rx directions). The driver assigns an additional dedicated MSI-X vector
for management (for ACQ and AENQ).
@@ -190,20 +186,21 @@ unmasked by the driver after NAPI processing is complete.
Interrupt Moderation
====================
+
ENA driver and device can operate in conventional or adaptive interrupt
moderation mode.
-In conventional mode the driver instructs device to postpone interrupt
+**In conventional mode** the driver instructs device to postpone interrupt
posting according to static interrupt delay value. The interrupt delay
-value can be configured through ethtool(8). The following ethtool
-parameters are supported by the driver: tx-usecs, rx-usecs
+value can be configured through `ethtool(8)`. The following `ethtool`
+parameters are supported by the driver: ``tx-usecs``, ``rx-usecs``
-In adaptive interrupt moderation mode the interrupt delay value is
+**In adaptive interrupt** moderation mode the interrupt delay value is
updated by the driver dynamically and adjusted every NAPI cycle
according to the traffic nature.
-Adaptive coalescing can be switched on/off through ethtool(8)
-adaptive_rx on|off parameter.
+Adaptive coalescing can be switched on/off through `ethtool(8)`'s
+:code:`adaptive_rx on|off` parameter.
More information about Adaptive Interrupt Moderation (DIM) can be found in
Documentation/networking/net_dim.rst
@@ -214,17 +211,10 @@ The rx_copybreak is initialized by default to ENA_DEFAULT_RX_COPYBREAK
and can be configured by the ETHTOOL_STUNABLE command of the
SIOCETHTOOL ioctl.
-SKB
-===
-The driver-allocated SKB for frames received from Rx handling using
-NAPI context. The allocation method depends on the size of the packet.
-If the frame length is larger than rx_copybreak, napi_get_frags()
-is used, otherwise netdev_alloc_skb_ip_align() is used, the buffer
-content is copied (by CPU) to the SKB, and the buffer is recycled.
-
Statistics
==========
-The user can obtain ENA device and driver statistics using ethtool.
+
+The user can obtain ENA device and driver statistics using `ethtool`.
The driver can collect regular or extended statistics (including
per-queue stats) from the device.
@@ -232,22 +222,23 @@ In addition the driver logs the stats to syslog upon device reset.
MTU
===
+
The driver supports an arbitrarily large MTU with a maximum that is
negotiated with the device. The driver configures MTU using the
SetFeature command (ENA_ADMIN_MTU property). The user can change MTU
-via ip(8) and similar legacy tools.
+via `ip(8)` and similar legacy tools.
Stateless Offloads
==================
+
The ENA driver supports:
-- TSO over IPv4/IPv6
-- TSO with ECN
- IPv4 header checksum offload
- TCP/UDP over IPv4/IPv6 checksum offloads
RSS
===
+
- The ENA device supports RSS that allows flexible Rx traffic
steering.
- Toeplitz and CRC32 hash functions are supported.
@@ -260,41 +251,42 @@ RSS
function delivered in the Rx CQ descriptor is set in the received
SKB.
- The user can provide a hash key, hash function, and configure the
- indirection table through ethtool(8).
+ indirection table through `ethtool(8)`.
DATA PATH
=========
+
Tx
--
-ena_start_xmit() is called by the stack. This function does the following:
+:code:`ena_start_xmit()` is called by the stack. This function does the following:
-- Maps data buffers (skb->data and frags).
-- Populates ena_buf for the push buffer (if the driver and device are
- in push mode.)
+- Maps data buffers (``skb->data`` and frags).
+- Populates ``ena_buf`` for the push buffer (if the driver and device are
+ in push mode).
- Prepares ENA bufs for the remaining frags.
-- Allocates a new request ID from the empty req_id ring. The request
+- Allocates a new request ID from the empty ``req_id`` ring. The request
ID is the index of the packet in the Tx info. This is used for
- out-of-order TX completions.
+ out-of-order Tx completions.
- Adds the packet to the proper place in the Tx ring.
-- Calls ena_com_prepare_tx(), an ENA communication layer that converts
- the ena_bufs to ENA descriptors (and adds meta ENA descriptors as
- needed.)
+- Calls :code:`ena_com_prepare_tx()`, an ENA communication layer that converts
+ the ``ena_bufs`` to ENA descriptors (and adds meta ENA descriptors as
+ needed).
* This function also copies the ENA descriptors and the push buffer
- to the Device memory space (if in push mode.)
+ to the Device memory space (if in push mode).
-- Writes doorbell to the ENA device.
+- Writes a doorbell to the ENA device.
- When the ENA device finishes sending the packet, a completion
interrupt is raised.
- The interrupt handler schedules NAPI.
-- The ena_clean_tx_irq() function is called. This function handles the
+- The :code:`ena_clean_tx_irq()` function is called. This function handles the
completion descriptors generated by the ENA, with a single
completion descriptor per completed packet.
- * req_id is retrieved from the completion descriptor. The tx_info of
- the packet is retrieved via the req_id. The data buffers are
- unmapped and req_id is returned to the empty req_id ring.
+ * ``req_id`` is retrieved from the completion descriptor. The ``tx_info`` of
+ the packet is retrieved via the ``req_id``. The data buffers are
+ unmapped and ``req_id`` is returned to the empty ``req_id`` ring.
* The function stops when the completion descriptors are completed or
the budget is reached.
@@ -303,12 +295,11 @@ Rx
- When a packet is received from the ENA device.
- The interrupt handler schedules NAPI.
-- The ena_clean_rx_irq() function is called. This function calls
- ena_rx_pkt(), an ENA communication layer function, which returns the
- number of descriptors used for a new unhandled packet, and zero if
+- The :code:`ena_clean_rx_irq()` function is called. This function calls
+ :code:`ena_com_rx_pkt()`, an ENA communication layer function, which returns the
+ number of descriptors used for a new packet, and zero if
no new packet is found.
-- Then it calls the ena_clean_rx_irq() function.
-- ena_eth_rx_skb() checks packet length:
+- :code:`ena_rx_skb()` checks packet length:
* If the packet is small (len < rx_copybreak), the driver allocates
a SKB for the new packet, and copies the packet payload into the
@@ -317,9 +308,10 @@ Rx
- In this way the original data buffer is not passed to the stack
and is reused for future Rx packets.
- * Otherwise the function unmaps the Rx buffer, then allocates the
- new SKB structure and hooks the Rx buffer to the SKB frags.
+ * Otherwise the function unmaps the Rx buffer, sets the first
+ descriptor as `skb`'s linear part and the other descriptors as the
+ `skb`'s frags.
- The new SKB is updated with the necessary information (protocol,
- checksum hw verify result, etc.), and then passed to the network
- stack, using the NAPI interface function napi_gro_receive().
+ checksum hw verify result, etc), and then passed to the network
+ stack, using the NAPI interface function :code:`napi_gro_receive()`.
diff --git a/Documentation/networking/device_drivers/ethernet/freescale/dpaa2/dpio-driver.rst b/Documentation/networking/device_drivers/ethernet/freescale/dpaa2/dpio-driver.rst
index c50fd46631e0..e4ebfe62a183 100644
--- a/Documentation/networking/device_drivers/ethernet/freescale/dpaa2/dpio-driver.rst
+++ b/Documentation/networking/device_drivers/ethernet/freescale/dpaa2/dpio-driver.rst
@@ -1,5 +1,6 @@
.. include:: <isonum.txt>
+===================================
DPAA2 DPIO (Data Path I/O) Overview
===================================
diff --git a/Documentation/networking/device_drivers/ethernet/freescale/dpaa2/index.rst b/Documentation/networking/device_drivers/ethernet/freescale/dpaa2/index.rst
index ee40fcc5ddff..62f4a4aff6ec 100644
--- a/Documentation/networking/device_drivers/ethernet/freescale/dpaa2/index.rst
+++ b/Documentation/networking/device_drivers/ethernet/freescale/dpaa2/index.rst
@@ -9,3 +9,4 @@ DPAA2 Documentation
dpio-driver
ethernet-driver
mac-phy-support
+ switch-driver
diff --git a/Documentation/networking/device_drivers/ethernet/freescale/dpaa2/switch-driver.rst b/Documentation/networking/device_drivers/ethernet/freescale/dpaa2/switch-driver.rst
new file mode 100644
index 000000000000..8bf411b857d4
--- /dev/null
+++ b/Documentation/networking/device_drivers/ethernet/freescale/dpaa2/switch-driver.rst
@@ -0,0 +1,217 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: <isonum.txt>
+
+===================
+DPAA2 Switch driver
+===================
+
+:Copyright: |copy| 2021 NXP
+
+The DPAA2 Switch driver probes on the Datapath Switch (DPSW) object which can
+be instantiated on the following DPAA2 SoCs and their variants: LS2088A and
+LX2160A.
+
+The driver uses the switch device driver model and exposes each switch port as
+a network interface, which can be included in a bridge or used as a standalone
+interface. Traffic switched between ports is offloaded into the hardware.
+
+The DPSW can have ports connected to DPNIs or to DPMACs for external access.
+::
+
+ [ethA] [ethB] [ethC] [ethD] [ethE] [ethF]
+ : : : : : :
+ : : : : : :
+ [dpaa2-eth] [dpaa2-eth] [ dpaa2-switch ]
+ : : : : : : kernel
+ =============================================================================
+ : : : : : : hardware
+ [DPNI] [DPNI] [============= DPSW =================]
+ | | | | | |
+ | ---------- | [DPMAC] [DPMAC]
+ ------------------------------- | |
+ | |
+ [PHY] [PHY]
+
+Creating an Ethernet Switch
+===========================
+
+The dpaa2-switch driver probes on DPSW devices found on the fsl-mc bus. These
+devices can be either created statically through the boot time configuration
+file - DataPath Layout (DPL) - or at runtime using the DPAA2 object APIs
+(incorporated already into the restool userspace tool).
+
+At the moment, the dpaa2-switch driver imposes the following restrictions on
+the DPSW object that it will probe:
+
+ * The minimum number of FDBs should be at least equal to the number of switch
+ interfaces. This is necessary so that separation of switch ports can be
+ done, ie when not under a bridge, each switch port will have its own FDB.
+ ::
+
+ fsl_dpaa2_switch dpsw.0: The number of FDBs is lower than the number of ports, cannot probe
+
+ * Both the broadcast and flooding configuration should be per FDB. This
+ enables the driver to restrict the broadcast and flooding domains of each
+ FDB depending on the switch ports that are sharing it (aka are under the
+ same bridge).
+ ::
+
+ fsl_dpaa2_switch dpsw.0: Flooding domain is not per FDB, cannot probe
+ fsl_dpaa2_switch dpsw.0: Broadcast domain is not per FDB, cannot probe
+
+ * The control interface of the switch should not be disabled
+ (DPSW_OPT_CTRL_IF_DIS not passed as a create time option). Without the
+ control interface, the driver is not capable to provide proper Rx/Tx traffic
+ support on the switch port netdevices.
+ ::
+
+ fsl_dpaa2_switch dpsw.0: Control Interface is disabled, cannot probe
+
+Besides the configuration of the actual DPSW object, the dpaa2-switch driver
+will need the following DPAA2 objects:
+
+ * 1 DPMCP - A Management Command Portal object is needed for any interraction
+ with the MC firmware.
+
+ * 1 DPBP - A Buffer Pool is used for seeding buffers intended for the Rx path
+ on the control interface.
+
+ * Access to at least one DPIO object (Software Portal) is needed for any
+ enqueue/dequeue operation to be performed on the control interface queues.
+ The DPIO object will be shared, no need for a private one.
+
+Switching features
+==================
+
+The driver supports the configuration of L2 forwarding rules in hardware for
+port bridging as well as standalone usage of the independent switch interfaces.
+
+The hardware is not configurable with respect to VLAN awareness, thus any DPAA2
+switch port should be used only in usecases with a VLAN aware bridge::
+
+ $ ip link add dev br0 type bridge vlan_filtering 1
+
+ $ ip link add dev br1 type bridge
+ $ ip link set dev ethX master br1
+ Error: fsl_dpaa2_switch: Cannot join a VLAN-unaware bridge
+
+Topology and loop detection through STP is supported when ``stp_state 1`` is
+used at bridge create ::
+
+ $ ip link add dev br0 type bridge vlan_filtering 1 stp_state 1
+
+L2 FDB manipulation (add/delete/dump) is supported.
+
+HW FDB learning can be configured on each switch port independently through
+bridge commands. When the HW learning is disabled, a fast age procedure will be
+run and any previously learnt addresses will be removed.
+::
+
+ $ bridge link set dev ethX learning off
+ $ bridge link set dev ethX learning on
+
+Restricting the unknown unicast and multicast flooding domain is supported, but
+not independently of each other::
+
+ $ ip link set dev ethX type bridge_slave flood off mcast_flood off
+ $ ip link set dev ethX type bridge_slave flood off mcast_flood on
+ Error: fsl_dpaa2_switch: Cannot configure multicast flooding independently of unicast.
+
+Broadcast flooding on a switch port can be disabled/enabled through the brport sysfs::
+
+ $ echo 0 > /sys/bus/fsl-mc/devices/dpsw.Y/net/ethX/brport/broadcast_flood
+
+Offloads
+========
+
+Routing actions (redirect, trap, drop)
+--------------------------------------
+
+The DPAA2 switch is able to offload flow-based redirection of packets making
+use of ACL tables. Shared filter blocks are supported by sharing a single ACL
+table between multiple ports.
+
+The following flow keys are supported:
+
+ * Ethernet: dst_mac/src_mac
+ * IPv4: dst_ip/src_ip/ip_proto/tos
+ * VLAN: vlan_id/vlan_prio/vlan_tpid/vlan_dei
+ * L4: dst_port/src_port
+
+Also, the matchall filter can be used to redirect the entire traffic received
+on a port.
+
+As per flow actions, the following are supported:
+
+ * drop
+ * mirred egress redirect
+ * trap
+
+Each ACL entry (filter) can be setup with only one of the listed
+actions.
+
+Example 1: send frames received on eth4 with a SA of 00:01:02:03:04:05 to the
+CPU::
+
+ $ tc qdisc add dev eth4 clsact
+ $ tc filter add dev eth4 ingress flower src_mac 00:01:02:03:04:05 skip_sw action trap
+
+Example 2: drop frames received on eth4 with VID 100 and PCP of 3::
+
+ $ tc filter add dev eth4 ingress protocol 802.1q flower skip_sw vlan_id 100 vlan_prio 3 action drop
+
+Example 3: redirect all frames received on eth4 to eth1::
+
+ $ tc filter add dev eth4 ingress matchall action mirred egress redirect dev eth1
+
+Example 4: Use a single shared filter block on both eth5 and eth6::
+
+ $ tc qdisc add dev eth5 ingress_block 1 clsact
+ $ tc qdisc add dev eth6 ingress_block 1 clsact
+ $ tc filter add block 1 ingress flower dst_mac 00:01:02:03:04:04 skip_sw \
+ action trap
+ $ tc filter add block 1 ingress protocol ipv4 flower src_ip 192.168.1.1 skip_sw \
+ action mirred egress redirect dev eth3
+
+Mirroring
+~~~~~~~~~
+
+The DPAA2 switch supports only per port mirroring and per VLAN mirroring.
+Adding mirroring filters in shared blocks is also supported.
+
+When using the tc-flower classifier with the 802.1q protocol, only the
+''vlan_id'' key will be accepted. Mirroring based on any other fields from the
+802.1q protocol will be rejected::
+
+ $ tc qdisc add dev eth8 ingress_block 1 clsact
+ $ tc filter add block 1 ingress protocol 802.1q flower skip_sw vlan_prio 3 action mirred egress mirror dev eth6
+ Error: fsl_dpaa2_switch: Only matching on VLAN ID supported.
+ We have an error talking to the kernel
+
+If a mirroring VLAN filter is requested on a port, the VLAN must to be
+installed on the switch port in question either using ''bridge'' or by creating
+a VLAN upper device if the switch port is used as a standalone interface::
+
+ $ tc qdisc add dev eth8 ingress_block 1 clsact
+ $ tc filter add block 1 ingress protocol 802.1q flower skip_sw vlan_id 200 action mirred egress mirror dev eth6
+ Error: VLAN must be installed on the switch port.
+ We have an error talking to the kernel
+
+ $ bridge vlan add vid 200 dev eth8
+ $ tc filter add block 1 ingress protocol 802.1q flower skip_sw vlan_id 200 action mirred egress mirror dev eth6
+
+ $ ip link add link eth8 name eth8.200 type vlan id 200
+ $ tc filter add block 1 ingress protocol 802.1q flower skip_sw vlan_id 200 action mirred egress mirror dev eth6
+
+Also, it should be noted that the mirrored traffic will be subject to the same
+egress restrictions as any other traffic. This means that when a mirrored
+packet will reach the mirror port, if the VLAN found in the packet is not
+installed on the port it will get dropped.
+
+The DPAA2 switch supports only a single mirroring destination, thus multiple
+mirror rules can be installed but their ''to'' port has to be the same::
+
+ $ tc filter add block 1 ingress protocol 802.1q flower skip_sw vlan_id 200 action mirred egress mirror dev eth6
+ $ tc filter add block 1 ingress protocol 802.1q flower skip_sw vlan_id 100 action mirred egress mirror dev eth7
+ Error: fsl_dpaa2_switch: Multiple mirror ports not supported.
+ We have an error talking to the kernel
diff --git a/Documentation/networking/device_drivers/ethernet/google/gve.rst b/Documentation/networking/device_drivers/ethernet/google/gve.rst
index 793693cef6e3..6d73ee78f3d7 100644
--- a/Documentation/networking/device_drivers/ethernet/google/gve.rst
+++ b/Documentation/networking/device_drivers/ethernet/google/gve.rst
@@ -47,13 +47,24 @@ The driver interacts with the device in the following ways:
- Transmit and Receive Queues
- See description below
+Descriptor Formats
+------------------
+GVE supports two descriptor formats: GQI and DQO. These two formats have
+entirely different descriptors, which will be described below.
+
Registers
---------
-All registers are MMIO and big endian.
+All registers are MMIO.
The registers are used for initializing and configuring the device as well as
querying device status in response to management interrupts.
+Endianness
+----------
+- Admin Queue messages and registers are all Big Endian.
+- GQI descriptors and datapath registers are Big Endian.
+- DQO descriptors and datapath registers are Little Endian.
+
Admin Queue (AQ)
----------------
The Admin Queue is a PAGE_SIZE memory block, treated as an array of AQ
@@ -97,10 +108,10 @@ the queues associated with that interrupt.
The handler for these irqs schedule the napi for that block to run
and poll the queues.
-Traffic Queues
---------------
-gVNIC's queues are composed of a descriptor ring and a buffer and are
-assigned to a notification block.
+GQI Traffic Queues
+------------------
+GQI queues are composed of a descriptor ring and a buffer and are assigned to a
+notification block.
The descriptor rings are power-of-two-sized ring buffers consisting of
fixed-size descriptors. They advance their head pointer using a __be32
@@ -121,3 +132,35 @@ Receive
The buffers for receive rings are put into a data ring that is the same
length as the descriptor ring and the head and tail pointers advance over
the rings together.
+
+DQO Traffic Queues
+------------------
+- Every TX and RX queue is assigned a notification block.
+
+- TX and RX buffers queues, which send descriptors to the device, use MMIO
+ doorbells to notify the device of new descriptors.
+
+- RX and TX completion queues, which receive descriptors from the device, use a
+ "generation bit" to know when a descriptor was populated by the device. The
+ driver initializes all bits with the "current generation". The device will
+ populate received descriptors with the "next generation" which is inverted
+ from the current generation. When the ring wraps, the current/next generation
+ are swapped.
+
+- It's the driver's responsibility to ensure that the RX and TX completion
+ queues are not overrun. This can be accomplished by limiting the number of
+ descriptors posted to HW.
+
+- TX packets have a 16 bit completion_tag and RX buffers have a 16 bit
+ buffer_id. These will be returned on the TX completion and RX queues
+ respectively to let the driver know which packet/buffer was completed.
+
+Transmit
+~~~~~~~~
+A packet's buffers are DMA mapped for the device to access before transmission.
+After the packet was successfully transmitted, the buffers are unmapped.
+
+Receive
+~~~~~~~
+The driver posts fixed sized buffers to HW on the RX buffer queue. The packet
+received on the associated RX queue may span multiple descriptors.
diff --git a/Documentation/networking/device_drivers/ethernet/intel/i40e.rst b/Documentation/networking/device_drivers/ethernet/intel/i40e.rst
index 2d3f6bd969a2..ac35bd472bdc 100644
--- a/Documentation/networking/device_drivers/ethernet/intel/i40e.rst
+++ b/Documentation/networking/device_drivers/ethernet/intel/i40e.rst
@@ -466,7 +466,7 @@ network. PTP support varies among Intel devices that support this driver. Use
"ethtool -T <netdev name>" to get a definitive list of PTP capabilities
supported by the device.
-IEEE 802.1ad (QinQ) Support
+IEEE 802.1ad (QinQ) Support
---------------------------
The IEEE 802.1ad standard, informally known as QinQ, allows for multiple VLAN
IDs within a single Ethernet frame. VLAN IDs are sometimes referred to as
@@ -523,8 +523,8 @@ of a port's bandwidth (should it be available). The sum of all the values for
Maximum Bandwidth is not restricted, because no more than 100% of a port's
bandwidth can ever be used.
-NOTE: X710/XXV710 devices fail to enable Max VFs (64) when Multiple Functions
-per Port (MFP) and SR-IOV are enabled. An error from i40e is logged that says
+NOTE: X710/XXV710 devices fail to enable Max VFs (64) when Multiple Functions
+per Port (MFP) and SR-IOV are enabled. An error from i40e is logged that says
"add vsi failed for VF N, aq_err 16". To workaround the issue, enable less than
64 virtual functions (VFs).
diff --git a/Documentation/networking/device_drivers/ethernet/intel/iavf.rst b/Documentation/networking/device_drivers/ethernet/intel/iavf.rst
index 25330b7b5168..151af0a8da9c 100644
--- a/Documentation/networking/device_drivers/ethernet/intel/iavf.rst
+++ b/Documentation/networking/device_drivers/ethernet/intel/iavf.rst
@@ -113,7 +113,7 @@ which the AVF is associated. The following are base mode features:
- AVF device ID
- HW mailbox is used for VF to PF communications (including on Windows)
-IEEE 802.1ad (QinQ) Support
+IEEE 802.1ad (QinQ) Support
---------------------------
The IEEE 802.1ad standard, informally known as QinQ, allows for multiple VLAN
IDs within a single Ethernet frame. VLAN IDs are sometimes referred to as
diff --git a/Documentation/networking/device_drivers/ethernet/intel/ice.rst b/Documentation/networking/device_drivers/ethernet/intel/ice.rst
index e7d9cbff771b..67b7a701ce9e 100644
--- a/Documentation/networking/device_drivers/ethernet/intel/ice.rst
+++ b/Documentation/networking/device_drivers/ethernet/intel/ice.rst
@@ -851,7 +851,7 @@ NOTES:
- 0x88A8 traffic will not be received unless VLAN stripping is disabled with
the following command::
- # ethool -K <ethX> rxvlan off
+ # ethtool -K <ethX> rxvlan off
- 0x88A8/0x8100 double VLANs cannot be used with 0x8100 or 0x8100/0x8100 VLANS
configured on the same port. 0x88a8/0x8100 traffic will not be received if
diff --git a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5.rst b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5.rst
index 936a10f1942c..4b59cf2c599f 100644
--- a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5.rst
+++ b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5.rst
@@ -12,6 +12,7 @@ Contents
- `Enabling the driver and kconfig options`_
- `Devlink info`_
- `Devlink parameters`_
+- `Bridge offload`_
- `mlx5 subfunction`_
- `mlx5 function attributes`_
- `Devlink health reporters`_
@@ -217,6 +218,37 @@ users try to enable them.
$ devlink dev eswitch set pci/0000:06:00.0 mode switchdev
+Bridge offload
+==============
+The mlx5 driver implements support for offloading bridge rules when in switchdev
+mode. Linux bridge FDBs are automatically offloaded when mlx5 switchdev
+representor is attached to bridge.
+
+- Change device to switchdev mode::
+
+ $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev
+
+- Attach mlx5 switchdev representor 'enp8s0f0' to bridge netdev 'bridge1'::
+
+ $ ip link set enp8s0f0 master bridge1
+
+VLANs
+-----
+Following bridge VLAN functions are supported by mlx5:
+
+- VLAN filtering (including multiple VLANs per port)::
+
+ $ ip link set bridge1 type bridge vlan_filtering 1
+ $ bridge vlan add dev enp8s0f0 vid 2-3
+
+- VLAN push on bridge ingress::
+
+ $ bridge vlan add dev enp8s0f0 vid 3 pvid
+
+- VLAN pop on bridge egress::
+
+ $ bridge vlan add dev enp8s0f0 vid 3 untagged
+
mlx5 subfunction
================
mlx5 supports subfunction management using devlink port (see :ref:`Documentation/networking/devlink/devlink-port.rst <devlink_port>`) interface.
@@ -568,3 +600,103 @@ tc and eswitch offloads tracepoints:
$ cat /sys/kernel/debug/tracing/trace
...
kworker/u48:7-2221 [009] ...1 1475.387435: mlx5e_rep_neigh_update: netdev: ens1f0 MAC: 24:8a:07:9a:17:9a IPv4: 1.1.1.10 IPv6: ::ffff:1.1.1.10 neigh_connected=1
+
+Bridge offloads tracepoints:
+
+- mlx5_esw_bridge_fdb_entry_init: trace bridge FDB entry offloaded to mlx5::
+
+ $ echo mlx5:mlx5_esw_bridge_fdb_entry_init >> set_event
+ $ cat /sys/kernel/debug/tracing/trace
+ ...
+ kworker/u20:9-2217 [003] ...1 318.582243: mlx5_esw_bridge_fdb_entry_init: net_device=enp8s0f0_0 addr=e4:fd:05:08:00:02 vid=0 flags=0 used=0
+
+- mlx5_esw_bridge_fdb_entry_cleanup: trace bridge FDB entry deleted from mlx5::
+
+ $ echo mlx5:mlx5_esw_bridge_fdb_entry_cleanup >> set_event
+ $ cat /sys/kernel/debug/tracing/trace
+ ...
+ ip-2581 [005] ...1 318.629871: mlx5_esw_bridge_fdb_entry_cleanup: net_device=enp8s0f0_1 addr=e4:fd:05:08:00:03 vid=0 flags=0 used=16
+
+- mlx5_esw_bridge_fdb_entry_refresh: trace bridge FDB entry offload refreshed in
+ mlx5::
+
+ $ echo mlx5:mlx5_esw_bridge_fdb_entry_refresh >> set_event
+ $ cat /sys/kernel/debug/tracing/trace
+ ...
+ kworker/u20:8-3849 [003] ...1 466716: mlx5_esw_bridge_fdb_entry_refresh: net_device=enp8s0f0_0 addr=e4:fd:05:08:00:02 vid=3 flags=0 used=0
+
+- mlx5_esw_bridge_vlan_create: trace bridge VLAN object add on mlx5
+ representor::
+
+ $ echo mlx5:mlx5_esw_bridge_vlan_create >> set_event
+ $ cat /sys/kernel/debug/tracing/trace
+ ...
+ ip-2560 [007] ...1 318.460258: mlx5_esw_bridge_vlan_create: vid=1 flags=6
+
+- mlx5_esw_bridge_vlan_cleanup: trace bridge VLAN object delete from mlx5
+ representor::
+
+ $ echo mlx5:mlx5_esw_bridge_vlan_cleanup >> set_event
+ $ cat /sys/kernel/debug/tracing/trace
+ ...
+ bridge-2582 [007] ...1 318.653496: mlx5_esw_bridge_vlan_cleanup: vid=2 flags=8
+
+- mlx5_esw_bridge_vport_init: trace mlx5 vport assigned with bridge upper
+ device::
+
+ $ echo mlx5:mlx5_esw_bridge_vport_init >> set_event
+ $ cat /sys/kernel/debug/tracing/trace
+ ...
+ ip-2560 [007] ...1 318.458915: mlx5_esw_bridge_vport_init: vport_num=1
+
+- mlx5_esw_bridge_vport_cleanup: trace mlx5 vport removed from bridge upper
+ device::
+
+ $ echo mlx5:mlx5_esw_bridge_vport_cleanup >> set_event
+ $ cat /sys/kernel/debug/tracing/trace
+ ...
+ ip-5387 [000] ...1 573713: mlx5_esw_bridge_vport_cleanup: vport_num=1
+
+Eswitch QoS tracepoints:
+
+- mlx5_esw_vport_qos_create: trace creation of transmit scheduler arbiter for vport::
+
+ $ echo mlx5:mlx5_esw_vport_qos_create >> /sys/kernel/debug/tracing/set_event
+ $ cat /sys/kernel/debug/tracing/trace
+ ...
+ <...>-23496 [018] .... 73136.838831: mlx5_esw_vport_qos_create: (0000:82:00.0) vport=2 tsar_ix=4 bw_share=0, max_rate=0 group=000000007b576bb3
+
+- mlx5_esw_vport_qos_config: trace configuration of transmit scheduler arbiter for vport::
+
+ $ echo mlx5:mlx5_esw_vport_qos_config >> /sys/kernel/debug/tracing/set_event
+ $ cat /sys/kernel/debug/tracing/trace
+ ...
+ <...>-26548 [023] .... 75754.223823: mlx5_esw_vport_qos_config: (0000:82:00.0) vport=1 tsar_ix=3 bw_share=34, max_rate=10000 group=000000007b576bb3
+
+- mlx5_esw_vport_qos_destroy: trace deletion of transmit scheduler arbiter for vport::
+
+ $ echo mlx5:mlx5_esw_vport_qos_destroy >> /sys/kernel/debug/tracing/set_event
+ $ cat /sys/kernel/debug/tracing/trace
+ ...
+ <...>-27418 [004] .... 76546.680901: mlx5_esw_vport_qos_destroy: (0000:82:00.0) vport=1 tsar_ix=3
+
+- mlx5_esw_group_qos_create: trace creation of transmit scheduler arbiter for rate group::
+
+ $ echo mlx5:mlx5_esw_group_qos_create >> /sys/kernel/debug/tracing/set_event
+ $ cat /sys/kernel/debug/tracing/trace
+ ...
+ <...>-26578 [008] .... 75776.022112: mlx5_esw_group_qos_create: (0000:82:00.0) group=000000008dac63ea tsar_ix=5
+
+- mlx5_esw_group_qos_config: trace configuration of transmit scheduler arbiter for rate group::
+
+ $ echo mlx5:mlx5_esw_group_qos_config >> /sys/kernel/debug/tracing/set_event
+ $ cat /sys/kernel/debug/tracing/trace
+ ...
+ <...>-27303 [020] .... 76461.455356: mlx5_esw_group_qos_config: (0000:82:00.0) group=000000008dac63ea tsar_ix=5 bw_share=100 max_rate=20000
+
+- mlx5_esw_group_qos_destroy: trace deletion of transmit scheduler arbiter for group::
+
+ $ echo mlx5:mlx5_esw_group_qos_destroy >> /sys/kernel/debug/tracing/set_event
+ $ cat /sys/kernel/debug/tracing/trace
+ ...
+ <...>-27418 [006] .... 76547.187258: mlx5_esw_group_qos_destroy: (0000:82:00.0) group=000000007b576bb3 tsar_ix=1
diff --git a/Documentation/networking/device_drivers/index.rst b/Documentation/networking/device_drivers/index.rst
index d8279de7bf25..3a5a1d46e77e 100644
--- a/Documentation/networking/device_drivers/index.rst
+++ b/Documentation/networking/device_drivers/index.rst
@@ -18,6 +18,7 @@ Contents:
qlogic/index
wan/index
wifi/index
+ wwan/index
.. only:: subproject and html
diff --git a/Documentation/networking/device_drivers/wwan/index.rst b/Documentation/networking/device_drivers/wwan/index.rst
new file mode 100644
index 000000000000..1cb8c7371401
--- /dev/null
+++ b/Documentation/networking/device_drivers/wwan/index.rst
@@ -0,0 +1,18 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+
+WWAN Device Drivers
+===================
+
+Contents:
+
+.. toctree::
+ :maxdepth: 2
+
+ iosm
+
+.. only:: subproject and html
+
+ Indices
+ =======
+
+ * :ref:`genindex`
diff --git a/Documentation/networking/device_drivers/wwan/iosm.rst b/Documentation/networking/device_drivers/wwan/iosm.rst
new file mode 100644
index 000000000000..aceb0223eb46
--- /dev/null
+++ b/Documentation/networking/device_drivers/wwan/iosm.rst
@@ -0,0 +1,96 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+
+.. Copyright (C) 2020-21 Intel Corporation
+
+.. _iosm_driver_doc:
+
+===========================================
+IOSM Driver for Intel M.2 PCIe based Modems
+===========================================
+The IOSM (IPC over Shared Memory) driver is a WWAN PCIe host driver developed
+for linux or chrome platform for data exchange over PCIe interface between
+Host platform & Intel M.2 Modem. The driver exposes interface conforming to the
+MBIM protocol [1]. Any front end application ( eg: Modem Manager) could easily
+manage the MBIM interface to enable data communication towards WWAN.
+
+Basic usage
+===========
+MBIM functions are inactive when unmanaged. The IOSM driver only provides a
+userspace interface MBIM "WWAN PORT" representing MBIM control channel and does
+not play any role in managing the functionality. It is the job of a userspace
+application to detect port enumeration and enable MBIM functionality.
+
+Examples of few such userspace application are:
+- mbimcli (included with the libmbim [2] library), and
+- Modem Manager [3]
+
+Management Applications to carry out below required actions for establishing
+MBIM IP session:
+- open the MBIM control channel
+- configure network connection settings
+- connect to network
+- configure IP network interface
+
+Management application development
+==================================
+The driver and userspace interfaces are described below. The MBIM protocol is
+described in [1] Mobile Broadband Interface Model v1.0 Errata-1.
+
+MBIM control channel userspace ABI
+----------------------------------
+
+/dev/wwan0mbim0 character device
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The driver exposes an MBIM interface to the MBIM function by implementing
+MBIM WWAN Port. The userspace end of the control channel pipe is a
+/dev/wwan0mbim0 character device. Application shall use this interface for
+MBIM protocol communication.
+
+Fragmentation
+~~~~~~~~~~~~~
+The userspace application is responsible for all control message fragmentation
+and defragmentation as per MBIM specification.
+
+/dev/wwan0mbim0 write()
+~~~~~~~~~~~~~~~~~~~~~~~
+The MBIM control messages from the management application must not exceed the
+negotiated control message size.
+
+/dev/wwan0mbim0 read()
+~~~~~~~~~~~~~~~~~~~~~~
+The management application must accept control messages of up the negotiated
+control message size.
+
+MBIM data channel userspace ABI
+-------------------------------
+
+wwan0-X network device
+~~~~~~~~~~~~~~~~~~~~~~
+The IOSM driver exposes IP link interface "wwan0-X" of type "wwan" for IP
+traffic. Iproute network utility is used for creating "wwan0-X" network
+interface and for associating it with MBIM IP session. The Driver supports
+upto 8 IP sessions for simultaneous IP communication.
+
+The userspace management application is responsible for creating new IP link
+prior to establishing MBIM IP session where the SessionId is greater than 0.
+
+For example, creating new IP link for a MBIM IP session with SessionId 1:
+
+ ip link add dev wwan0-1 parentdev-name wwan0 type wwan linkid 1
+
+The driver will automatically map the "wwan0-1" network device to MBIM IP
+session 1.
+
+References
+==========
+[1] "MBIM (Mobile Broadband Interface Model) Errata-1"
+ - https://www.usb.org/document-library/
+
+[2] libmbim - "a glib-based library for talking to WWAN modems and
+ devices which speak the Mobile Interface Broadband Model (MBIM)
+ protocol"
+ - http://www.freedesktop.org/wiki/Software/libmbim/
+
+[3] Modem Manager - "a DBus-activated daemon which controls mobile
+ broadband (2G/3G/4G) devices and connections"
+ - http://www.freedesktop.org/wiki/Software/ModemManager/
diff --git a/Documentation/networking/devlink/devlink-params.rst b/Documentation/networking/devlink/devlink-params.rst
index 54c9f107c4b0..4878907e9232 100644
--- a/Documentation/networking/devlink/devlink-params.rst
+++ b/Documentation/networking/devlink/devlink-params.rst
@@ -97,6 +97,18 @@ own name.
* - ``enable_roce``
- Boolean
- Enable handling of RoCE traffic in the device.
+ * - ``enable_eth``
+ - Boolean
+ - When enabled, the device driver will instantiate Ethernet specific
+ auxiliary device of the devlink device.
+ * - ``enable_rdma``
+ - Boolean
+ - When enabled, the device driver will instantiate RDMA specific
+ auxiliary device of the devlink device.
+ * - ``enable_vnet``
+ - Boolean
+ - When enabled, the device driver will instantiate VDPA networking
+ specific auxiliary device of the devlink device.
* - ``internal_err_reset``
- Boolean
- When enabled, the device driver will reset the device on internal
diff --git a/Documentation/networking/devlink/devlink-port.rst b/Documentation/networking/devlink/devlink-port.rst
index ab790e7980b8..7627b1da01f2 100644
--- a/Documentation/networking/devlink/devlink-port.rst
+++ b/Documentation/networking/devlink/devlink-port.rst
@@ -164,6 +164,41 @@ device to instantiate the subfunction device on particular PCI function.
A subfunction device is created on the :ref:`Documentation/driver-api/auxiliary_bus.rst <auxiliary_bus>`.
At this point a matching subfunction driver binds to the subfunction's auxiliary device.
+Rate object management
+======================
+
+Devlink provides API to manage tx rates of single devlink port or a group.
+This is done through rate objects, which can be one of the two types:
+
+``leaf``
+ Represents a single devlink port; created/destroyed by the driver. Since leaf
+ have 1to1 mapping to its devlink port, in user space it is referred as
+ ``pci/<bus_addr>/<port_index>``;
+
+``node``
+ Represents a group of rate objects (leafs and/or nodes); created/deleted by
+ request from the userspace; initially empty (no rate objects added). In
+ userspace it is referred as ``pci/<bus_addr>/<node_name>``, where
+ ``node_name`` can be any identifier, except decimal number, to avoid
+ collisions with leafs.
+
+API allows to configure following rate object's parameters:
+
+``tx_share``
+ Minimum TX rate value shared among all other rate objects, or rate objects
+ that parts of the parent group, if it is a part of the same group.
+
+``tx_max``
+ Maximum TX rate value.
+
+``parent``
+ Parent node name. Parent node rate limits are considered as additional limits
+ to all node children limits. ``tx_max`` is an upper limit for children.
+ ``tx_share`` is a total bandwidth distributed among children.
+
+Driver implementations are allowed to support both or either rate object types
+and setting methods of their parameters.
+
Terms and Definitions
=====================
diff --git a/Documentation/networking/devlink/devlink-region.rst b/Documentation/networking/devlink/devlink-region.rst
index 3654c3e9658f..58fe95e9a49d 100644
--- a/Documentation/networking/devlink/devlink-region.rst
+++ b/Documentation/networking/devlink/devlink-region.rst
@@ -22,7 +22,7 @@ The major benefit to creating a region is to provide access to internal
address regions that are otherwise inaccessible to the user.
Regions may also be used to provide an additional way to debug complex error
-states, but see also :doc:`devlink-health`
+states, but see also Documentation/networking/devlink/devlink-health.rst
Regions may optionally support capturing a snapshot on demand via the
``DEVLINK_CMD_REGION_NEW`` netlink message. A driver wishing to allow
diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst
index 935b6397e8cf..90d1381b88de 100644
--- a/Documentation/networking/devlink/devlink-trap.rst
+++ b/Documentation/networking/devlink/devlink-trap.rst
@@ -495,8 +495,9 @@ help debug packet drops caused by these exceptions. The following list includes
links to the description of driver-specific traps registered by various device
drivers:
- * :doc:`netdevsim`
- * :doc:`mlxsw`
+ * Documentation/networking/devlink/netdevsim.rst
+ * Documentation/networking/devlink/mlxsw.rst
+ * Documentation/networking/devlink/prestera.rst
.. _Generic-Packet-Trap-Groups:
diff --git a/Documentation/networking/devlink/hns3.rst b/Documentation/networking/devlink/hns3.rst
new file mode 100644
index 000000000000..4562a6e4782f
--- /dev/null
+++ b/Documentation/networking/devlink/hns3.rst
@@ -0,0 +1,25 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====================
+hns3 devlink support
+====================
+
+This document describes the devlink features implemented by the ``hns3``
+device driver.
+
+The ``hns3`` driver supports reloading via ``DEVLINK_CMD_RELOAD``.
+
+Info versions
+=============
+
+The ``hns3`` driver reports the following versions
+
+.. list-table:: devlink info versions implemented
+ :widths: 10 10 80
+
+ * - Name
+ - Type
+ - Description
+ * - ``fw``
+ - running
+ - Used to represent the firmware version.
diff --git a/Documentation/networking/devlink/ice.rst b/Documentation/networking/devlink/ice.rst
index a432dc419fa4..5d97cee9457b 100644
--- a/Documentation/networking/devlink/ice.rst
+++ b/Documentation/networking/devlink/ice.rst
@@ -30,10 +30,11 @@ The ``ice`` driver reports the following versions
PHY, link, etc.
* - ``fw.mgmt.api``
- running
- - 1.5
- - 2-digit version number of the API exported over the AdminQ by the
- management firmware. Used by the driver to identify what commands
- are supported.
+ - 1.5.1
+ - 3-digit version number (major.minor.patch) of the API exported over
+ the AdminQ by the management firmware. Used by the driver to
+ identify what commands are supported. Historical versions of the
+ kernel only displayed a 2-digit version number (major.minor).
* - ``fw.mgmt.build``
- running
- 0x305d955f
diff --git a/Documentation/networking/devlink/index.rst b/Documentation/networking/devlink/index.rst
index 8428a1220723..45b5f8b341df 100644
--- a/Documentation/networking/devlink/index.rst
+++ b/Documentation/networking/devlink/index.rst
@@ -34,6 +34,7 @@ parameters, info versions, and other features it supports.
:maxdepth: 1
bnxt
+ hns3
ionic
ice
mlx4
@@ -42,7 +43,7 @@ parameters, info versions, and other features it supports.
mv88e6xxx
netdevsim
nfp
- sja1105
qed
ti-cpsw-switch
am65-nuss-cpsw-switch
+ prestera
diff --git a/Documentation/networking/devlink/netdevsim.rst b/Documentation/networking/devlink/netdevsim.rst
index 02c2d20dc673..8a292fb5aaea 100644
--- a/Documentation/networking/devlink/netdevsim.rst
+++ b/Documentation/networking/devlink/netdevsim.rst
@@ -57,6 +57,32 @@ entries, FIB rule entries and nexthops that the driver will allow.
$ devlink resource set netdevsim/netdevsim0 path /nexthops size 16
$ devlink dev reload netdevsim/netdevsim0
+Rate objects
+============
+
+The ``netdevsim`` driver supports rate objects management, which includes:
+
+- registerging/unregistering leaf rate objects per VF devlink port;
+- creation/deletion node rate objects;
+- setting tx_share and tx_max rate values for any rate object type;
+- setting parent node for any rate object type.
+
+Rate nodes and it's parameters are exposed in ``netdevsim`` debugfs in RO mode.
+For example created rate node with name ``some_group``:
+
+.. code:: shell
+
+ $ ls /sys/kernel/debug/netdevsim/netdevsim0/rate_groups/some_group
+ rate_parent tx_max tx_share
+
+Same parameters are exposed for leaf objects in corresponding ports directories.
+For ex.:
+
+.. code:: shell
+
+ $ ls /sys/kernel/debug/netdevsim/netdevsim0/ports/1
+ dev ethtool rate_parent tx_max tx_share
+
Driver-specific Traps
=====================
diff --git a/Documentation/networking/devlink/prestera.rst b/Documentation/networking/devlink/prestera.rst
new file mode 100644
index 000000000000..49409d1d3081
--- /dev/null
+++ b/Documentation/networking/devlink/prestera.rst
@@ -0,0 +1,141 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+========================
+prestera devlink support
+========================
+
+This document describes the devlink features implemented by the ``prestera``
+device driver.
+
+Driver-specific Traps
+=====================
+
+.. list-table:: List of Driver-specific Traps Registered by ``prestera``
+ :widths: 5 5 90
+
+ * - Name
+ - Type
+ - Description
+.. list-table:: List of Driver-specific Traps Registered by ``prestera``
+ :widths: 5 5 90
+
+ * - Name
+ - Type
+ - Description
+ * - ``arp_bc``
+ - ``trap``
+ - Traps ARP broadcast packets (both requests/responses)
+ * - ``is_is``
+ - ``trap``
+ - Traps IS-IS packets
+ * - ``ospf``
+ - ``trap``
+ - Traps OSPF packets
+ * - ``ip_bc_mac``
+ - ``trap``
+ - Traps IPv4 packets with broadcast DA Mac address
+ * - ``stp``
+ - ``trap``
+ - Traps STP BPDU
+ * - ``lacp``
+ - ``trap``
+ - Traps LACP packets
+ * - ``lldp``
+ - ``trap``
+ - Traps LLDP packets
+ * - ``router_mc``
+ - ``trap``
+ - Traps multicast packets
+ * - ``vrrp``
+ - ``trap``
+ - Traps VRRP packets
+ * - ``dhcp``
+ - ``trap``
+ - Traps DHCP packets
+ * - ``mtu_error``
+ - ``trap``
+ - Traps (exception) packets that exceeded port's MTU
+ * - ``mac_to_me``
+ - ``trap``
+ - Traps packets with switch-port's DA Mac address
+ * - ``ttl_error``
+ - ``trap``
+ - Traps (exception) IPv4 packets whose TTL exceeded
+ * - ``ipv4_options``
+ - ``trap``
+ - Traps (exception) packets due to the malformed IPV4 header options
+ * - ``ip_default_route``
+ - ``trap``
+ - Traps packets that have no specific IP interface (IP to me) and no forwarding prefix
+ * - ``local_route``
+ - ``trap``
+ - Traps packets that have been send to one of switch IP interfaces addresses
+ * - ``ipv4_icmp_redirect``
+ - ``trap``
+ - Traps (exception) IPV4 ICMP redirect packets
+ * - ``arp_response``
+ - ``trap``
+ - Traps ARP replies packets that have switch-port's DA Mac address
+ * - ``acl_code_0``
+ - ``trap``
+ - Traps packets that have ACL priority set to 0 (tc pref 0)
+ * - ``acl_code_1``
+ - ``trap``
+ - Traps packets that have ACL priority set to 1 (tc pref 1)
+ * - ``acl_code_2``
+ - ``trap``
+ - Traps packets that have ACL priority set to 2 (tc pref 2)
+ * - ``acl_code_3``
+ - ``trap``
+ - Traps packets that have ACL priority set to 3 (tc pref 3)
+ * - ``acl_code_4``
+ - ``trap``
+ - Traps packets that have ACL priority set to 4 (tc pref 4)
+ * - ``acl_code_5``
+ - ``trap``
+ - Traps packets that have ACL priority set to 5 (tc pref 5)
+ * - ``acl_code_6``
+ - ``trap``
+ - Traps packets that have ACL priority set to 6 (tc pref 6)
+ * - ``acl_code_7``
+ - ``trap``
+ - Traps packets that have ACL priority set to 7 (tc pref 7)
+ * - ``ipv4_bgp``
+ - ``trap``
+ - Traps IPv4 BGP packets
+ * - ``ssh``
+ - ``trap``
+ - Traps SSH packets
+ * - ``telnet``
+ - ``trap``
+ - Traps Telnet packets
+ * - ``icmp``
+ - ``trap``
+ - Traps ICMP packets
+ * - ``rxdma_drop``
+ - ``drop``
+ - Drops packets (RxDMA) due to the lack of ingress buffers etc.
+ * - ``port_no_vlan``
+ - ``drop``
+ - Drops packets due to faulty-configured network or due to internal bug (config issue).
+ * - ``local_port``
+ - ``drop``
+ - Drops packets whose decision (FDB entry) is to bridge packet back to the incoming port/trunk.
+ * - ``invalid_sa``
+ - ``drop``
+ - Drops packets with multicast source MAC address.
+ * - ``illegal_ip_addr``
+ - ``drop``
+ - Drops packets with illegal SIP/DIP multicast/unicast addresses.
+ * - ``illegal_ipv4_hdr``
+ - ``drop``
+ - Drops packets with illegal IPV4 header.
+ * - ``ip_uc_dip_da_mismatch``
+ - ``drop``
+ - Drops packets with destination MAC being unicast, but destination IP address being multicast.
+ * - ``ip_sip_is_zero``
+ - ``drop``
+ - Drops packets with zero (0) IPV4 source address.
+ * - ``met_red``
+ - ``drop``
+ - Drops non-conforming packets (dropped by Ingress policer, metering drop), e.g. packet rate exceeded configured bandwith.
diff --git a/Documentation/networking/devlink/sja1105.rst b/Documentation/networking/devlink/sja1105.rst
deleted file mode 100644
index e2679c274085..000000000000
--- a/Documentation/networking/devlink/sja1105.rst
+++ /dev/null
@@ -1,49 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-=======================
-sja1105 devlink support
-=======================
-
-This document describes the devlink features implemented
-by the ``sja1105`` device driver.
-
-Parameters
-==========
-
-.. list-table:: Driver-specific parameters implemented
- :widths: 5 5 5 85
-
- * - Name
- - Type
- - Mode
- - Description
- * - ``best_effort_vlan_filtering``
- - Boolean
- - runtime
- - Allow plain ETH_P_8021Q headers to be used as DSA tags.
-
- Benefits:
-
- - Can terminate untagged traffic over switch net
- devices even when enslaved to a bridge with
- vlan_filtering=1.
- - Can terminate VLAN-tagged traffic over switch net
- devices even when enslaved to a bridge with
- vlan_filtering=1, with some constraints (no more than
- 7 non-pvid VLANs per user port).
- - Can do QoS based on VLAN PCP and VLAN membership
- admission control for autonomously forwarded frames
- (regardless of whether they can be terminated on the
- CPU or not).
-
- Drawbacks:
-
- - User cannot use VLANs in range 1024-3071. If the
- switch receives frames with such VIDs, it will
- misinterpret them as DSA tags.
- - Switch uses Shared VLAN Learning (FDB lookup uses
- only DMAC as key).
- - When VLANs span cross-chip topologies, the total
- number of permitted VLANs may be less than 7 per
- port, due to a maximum number of 32 VLAN retagging
- rules per switch.
diff --git a/Documentation/networking/dsa/configuration.rst b/Documentation/networking/dsa/configuration.rst
index 774f0e76c746..2b08f1a772d3 100644
--- a/Documentation/networking/dsa/configuration.rst
+++ b/Documentation/networking/dsa/configuration.rst
@@ -292,3 +292,71 @@ configuration.
# bring up the bridge devices
ip link set br0 up
+
+Forwarding database (FDB) management
+------------------------------------
+
+The existing DSA switches do not have the necessary hardware support to keep
+the software FDB of the bridge in sync with the hardware tables, so the two
+tables are managed separately (``bridge fdb show`` queries both, and depending
+on whether the ``self`` or ``master`` flags are being used, a ``bridge fdb
+add`` or ``bridge fdb del`` command acts upon entries from one or both tables).
+
+Up until kernel v4.14, DSA only supported user space management of bridge FDB
+entries using the bridge bypass operations (which do not update the software
+FDB, just the hardware one) using the ``self`` flag (which is optional and can
+be omitted).
+
+ .. code-block:: sh
+
+ bridge fdb add dev swp0 00:01:02:03:04:05 self static
+ # or shorthand
+ bridge fdb add dev swp0 00:01:02:03:04:05 static
+
+Due to a bug, the bridge bypass FDB implementation provided by DSA did not
+distinguish between ``static`` and ``local`` FDB entries (``static`` are meant
+to be forwarded, while ``local`` are meant to be locally terminated, i.e. sent
+to the host port). Instead, all FDB entries with the ``self`` flag (implicit or
+explicit) are treated by DSA as ``static`` even if they are ``local``.
+
+ .. code-block:: sh
+
+ # This command:
+ bridge fdb add dev swp0 00:01:02:03:04:05 static
+ # behaves the same for DSA as this command:
+ bridge fdb add dev swp0 00:01:02:03:04:05 local
+ # or shorthand, because the 'local' flag is implicit if 'static' is not
+ # specified, it also behaves the same as:
+ bridge fdb add dev swp0 00:01:02:03:04:05
+
+The last command is an incorrect way of adding a static bridge FDB entry to a
+DSA switch using the bridge bypass operations, and works by mistake. Other
+drivers will treat an FDB entry added by the same command as ``local`` and as
+such, will not forward it, as opposed to DSA.
+
+Between kernel v4.14 and v5.14, DSA has supported in parallel two modes of
+adding a bridge FDB entry to the switch: the bridge bypass discussed above, as
+well as a new mode using the ``master`` flag which installs FDB entries in the
+software bridge too.
+
+ .. code-block:: sh
+
+ bridge fdb add dev swp0 00:01:02:03:04:05 master static
+
+Since kernel v5.14, DSA has gained stronger integration with the bridge's
+software FDB, and the support for its bridge bypass FDB implementation (using
+the ``self`` flag) has been removed. This results in the following changes:
+
+ .. code-block:: sh
+
+ # This is the only valid way of adding an FDB entry that is supported,
+ # compatible with v4.14 kernels and later:
+ bridge fdb add dev swp0 00:01:02:03:04:05 master static
+ # This command is no longer buggy and the entry is properly treated as
+ # 'local' instead of being forwarded:
+ bridge fdb add dev swp0 00:01:02:03:04:05
+ # This command no longer installs a static FDB entry to hardware:
+ bridge fdb add dev swp0 00:01:02:03:04:05 static
+
+Script writers are therefore encouraged to use the ``master static`` set of
+flags when working with bridge FDB entries on DSA switch interfaces.
diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst
index 8688009514cc..89bb4fa4c362 100644
--- a/Documentation/networking/dsa/dsa.rst
+++ b/Documentation/networking/dsa/dsa.rst
@@ -93,14 +93,15 @@ A tagging protocol may tag all packets with switch tags of the same length, or
the tag length might vary (for example packets with PTP timestamps might
require an extended switch tag, or there might be one tag length on TX and a
different one on RX). Either way, the tagging protocol driver must populate the
-``struct dsa_device_ops::overhead`` with the length in octets of the longest
-switch frame header. The DSA framework will automatically adjust the MTU of the
-master interface to accomodate for this extra size in order for DSA user ports
-to support the standard MTU (L2 payload length) of 1500 octets. The ``overhead``
-is also used to request from the network stack, on a best-effort basis, the
-allocation of packets with a ``needed_headroom`` or ``needed_tailroom``
-sufficient such that the act of pushing the switch tag on transmission of a
-packet does not cause it to reallocate due to lack of memory.
+``struct dsa_device_ops::needed_headroom`` and/or ``struct dsa_device_ops::needed_tailroom``
+with the length in octets of the longest switch frame header/trailer. The DSA
+framework will automatically adjust the MTU of the master interface to
+accommodate for this extra size in order for DSA user ports to support the
+standard MTU (L2 payload length) of 1500 octets. The ``needed_headroom`` and
+``needed_tailroom`` properties are also used to request from the network stack,
+on a best-effort basis, the allocation of packets with enough extra space such
+that the act of pushing the switch tag on transmission of a packet does not
+cause it to reallocate due to lack of memory.
Even though applications are not expected to parse DSA-specific frame headers,
the format on the wire of the tagging protocol represents an Application Binary
@@ -169,8 +170,8 @@ The job of this method is to prepare the skb in a way that the switch will
understand what egress port the packet is for (and not deliver it towards other
ports). Typically this is fulfilled by pushing a frame header. Checking for
insufficient size in the skb headroom or tailroom is unnecessary provided that
-the ``overhead`` and ``tail_tag`` properties were filled out properly, because
-DSA ensures there is enough space before calling this method.
+the ``needed_headroom`` and ``needed_tailroom`` properties were filled out
+properly, because DSA ensures there is enough space before calling this method.
The reception of a packet goes through the tagger's ``rcv`` function. The
passed ``struct sk_buff *skb`` has ``skb->data`` pointing at
@@ -199,19 +200,6 @@ receive all frames regardless of the value of the MAC DA. This can be done by
setting the ``promisc_on_master`` property of the ``struct dsa_device_ops``.
Note that this assumes a DSA-unaware master driver, which is the norm.
-Hardware manufacturers are strongly discouraged to do this, but some tagging
-protocols might not provide source port information on RX for all packets, but
-e.g. only for control traffic (link-local PDUs). In this case, by implementing
-the ``filter`` method of ``struct dsa_device_ops``, the tagger might select
-which packets are to be redirected on RX towards the virtual DSA user network
-interfaces, and which are to be left in the DSA master's RX data path.
-
-It might also happen (although silicon vendors are strongly discouraged to
-produce hardware like this) that a tagging protocol splits the switch-specific
-information into a header portion and a tail portion, therefore not falling
-cleanly into any of the above 3 categories. DSA does not support this
-configuration.
-
Master network devices
----------------------
@@ -662,6 +650,22 @@ Bridge layer
CPU port, and flooding towards the CPU port should also be enabled, due to a
lack of an explicit address filtering mechanism in the DSA core.
+- ``port_bridge_tx_fwd_offload``: bridge layer function invoked after
+ ``port_bridge_join`` when a driver sets ``ds->num_fwd_offloading_bridges`` to
+ a non-zero value. Returning success in this function activates the TX
+ forwarding offload bridge feature for this port, which enables the tagging
+ protocol driver to inject data plane packets towards the bridging domain that
+ the port is a part of. Data plane packets are subject to FDB lookup, hardware
+ learning on the CPU port, and do not override the port STP state.
+ Additionally, replication of data plane packets (multicast, flooding) is
+ handled in hardware and the bridge driver will transmit a single skb for each
+ packet that needs replication. The method is provided as a configuration
+ point for drivers that need to configure the hardware for enabling this
+ feature.
+
+- ``port_bridge_tx_fwd_unoffload``: bridge layer function invoken when a driver
+ leaves a bridge port which had the TX forwarding offload feature enabled.
+
Bridge VLAN filtering
---------------------
diff --git a/Documentation/networking/dsa/sja1105.rst b/Documentation/networking/dsa/sja1105.rst
index 7395a33baaf9..29b1bae0cf00 100644
--- a/Documentation/networking/dsa/sja1105.rst
+++ b/Documentation/networking/dsa/sja1105.rst
@@ -5,7 +5,7 @@ NXP SJA1105 switch driver
Overview
========
-The NXP SJA1105 is a family of 6 devices:
+The NXP SJA1105 is a family of 10 SPI-managed automotive switches:
- SJA1105E: First generation, no TTEthernet
- SJA1105T: First generation, TTEthernet
@@ -13,9 +13,11 @@ The NXP SJA1105 is a family of 6 devices:
- SJA1105Q: Second generation, TTEthernet, no SGMII
- SJA1105R: Second generation, no TTEthernet, SGMII
- SJA1105S: Second generation, TTEthernet, SGMII
-
-These are SPI-managed automotive switches, with all ports being gigabit
-capable, and supporting MII/RMII/RGMII and optionally SGMII on one port.
+- SJA1110A: Third generation, TTEthernet, SGMII, integrated 100base-T1 and
+ 100base-TX PHYs
+- SJA1110B: Third generation, TTEthernet, SGMII, 100base-T1, 100base-TX
+- SJA1110C: Third generation, TTEthernet, SGMII, 100base-T1, 100base-TX
+- SJA1110D: Third generation, TTEthernet, SGMII, 100base-T1
Being automotive parts, their configuration interface is geared towards
set-and-forget use, with minimal dynamic interaction at runtime. They
@@ -63,199 +65,6 @@ If that changed setting can be transmitted to the switch through the dynamic
reconfiguration interface, it is; otherwise the switch is reset and
reprogrammed with the updated static configuration.
-Traffic support
-===============
-
-The switches do not have hardware support for DSA tags, except for "slow
-protocols" for switch control as STP and PTP. For these, the switches have two
-programmable filters for link-local destination MACs.
-These are used to trap BPDUs and PTP traffic to the master netdevice, and are
-further used to support STP and 1588 ordinary clock/boundary clock
-functionality. For frames trapped to the CPU, source port and switch ID
-information is encoded by the hardware into the frames.
-
-But by leveraging ``CONFIG_NET_DSA_TAG_8021Q`` (a software-defined DSA tagging
-format based on VLANs), general-purpose traffic termination through the network
-stack can be supported under certain circumstances.
-
-Depending on VLAN awareness state, the following operating modes are possible
-with the switch:
-
-- Mode 1 (VLAN-unaware): a port is in this mode when it is used as a standalone
- net device, or when it is enslaved to a bridge with ``vlan_filtering=0``.
-- Mode 2 (fully VLAN-aware): a port is in this mode when it is enslaved to a
- bridge with ``vlan_filtering=1``. Access to the entire VLAN range is given to
- the user through ``bridge vlan`` commands, but general-purpose (anything
- other than STP, PTP etc) traffic termination is not possible through the
- switch net devices. The other packets can be still by user space processed
- through the DSA master interface (similar to ``DSA_TAG_PROTO_NONE``).
-- Mode 3 (best-effort VLAN-aware): a port is in this mode when enslaved to a
- bridge with ``vlan_filtering=1``, and the devlink property of its parent
- switch named ``best_effort_vlan_filtering`` is set to ``true``. When
- configured like this, the range of usable VIDs is reduced (0 to 1023 and 3072
- to 4094), so is the number of usable VIDs (maximum of 7 non-pvid VLANs per
- port*), and shared VLAN learning is performed (FDB lookup is done only by
- DMAC, not also by VID).
-
-To summarize, in each mode, the following types of traffic are supported over
-the switch net devices:
-
-+-------------+-----------+--------------+------------+
-| | Mode 1 | Mode 2 | Mode 3 |
-+=============+===========+==============+============+
-| Regular | Yes | No | Yes |
-| traffic | | (use master) | |
-+-------------+-----------+--------------+------------+
-| Management | Yes | Yes | Yes |
-| traffic | | | |
-| (BPDU, PTP) | | | |
-+-------------+-----------+--------------+------------+
-
-To configure the switch to operate in Mode 3, the following steps can be
-followed::
-
- ip link add dev br0 type bridge
- # swp2 operates in Mode 1 now
- ip link set dev swp2 master br0
- # swp2 temporarily moves to Mode 2
- ip link set dev br0 type bridge vlan_filtering 1
- [ 61.204770] sja1105 spi0.1: Reset switch and programmed static config. Reason: VLAN filtering
- [ 61.239944] sja1105 spi0.1: Disabled switch tagging
- # swp3 now operates in Mode 3
- devlink dev param set spi/spi0.1 name best_effort_vlan_filtering value true cmode runtime
- [ 64.682927] sja1105 spi0.1: Reset switch and programmed static config. Reason: VLAN filtering
- [ 64.711925] sja1105 spi0.1: Enabled switch tagging
- # Cannot use VLANs in range 1024-3071 while in Mode 3.
- bridge vlan add dev swp2 vid 1025 untagged pvid
- RTNETLINK answers: Operation not permitted
- bridge vlan add dev swp2 vid 100
- bridge vlan add dev swp2 vid 101 untagged
- bridge vlan
- port vlan ids
- swp5 1 PVID Egress Untagged
-
- swp2 1 PVID Egress Untagged
- 100
- 101 Egress Untagged
-
- swp3 1 PVID Egress Untagged
-
- swp4 1 PVID Egress Untagged
-
- br0 1 PVID Egress Untagged
- bridge vlan add dev swp2 vid 102
- bridge vlan add dev swp2 vid 103
- bridge vlan add dev swp2 vid 104
- bridge vlan add dev swp2 vid 105
- bridge vlan add dev swp2 vid 106
- bridge vlan add dev swp2 vid 107
- # Cannot use mode than 7 VLANs per port while in Mode 3.
- [ 3885.216832] sja1105 spi0.1: No more free subvlans
-
-\* "maximum of 7 non-pvid VLANs per port": Decoding VLAN-tagged packets on the
-CPU in mode 3 is possible through VLAN retagging of packets that go from the
-switch to the CPU. In cross-chip topologies, the port that goes to the CPU
-might also go to other switches. In that case, those other switches will see
-only a retagged packet (which only has meaning for the CPU). So if they are
-interested in this VLAN, they need to apply retagging in the reverse direction,
-to recover the original value from it. This consumes extra hardware resources
-for this switch. There is a maximum of 32 entries in the Retagging Table of
-each switch device.
-
-As an example, consider this cross-chip topology::
-
- +-------------------------------------------------+
- | Host SoC |
- | +-------------------------+ |
- | | DSA master for embedded | |
- | | switch (non-sja1105) | |
- | +--------+-------------------------+--------+ |
- | | embedded L2 switch | |
- | | | |
- | | +--------------+ +--------------+ | |
- | | |DSA master for| |DSA master for| | |
- | | | SJA1105 1 | | SJA1105 2 | | |
- +--+---+--------------+-----+--------------+---+--+
-
- +-----------------------+ +-----------------------+
- | SJA1105 switch 1 | | SJA1105 switch 2 |
- +-----+-----+-----+-----+ +-----+-----+-----+-----+
- |sw1p0|sw1p1|sw1p2|sw1p3| |sw2p0|sw2p1|sw2p2|sw2p3|
- +-----+-----+-----+-----+ +-----+-----+-----+-----+
-
-To reach the CPU, SJA1105 switch 1 (spi/spi2.1) uses the same port as is uses
-to reach SJA1105 switch 2 (spi/spi2.2), which would be port 4 (not drawn).
-Similarly for SJA1105 switch 2.
-
-Also consider the following commands, that add VLAN 100 to every sja1105 user
-port::
-
- devlink dev param set spi/spi2.1 name best_effort_vlan_filtering value true cmode runtime
- devlink dev param set spi/spi2.2 name best_effort_vlan_filtering value true cmode runtime
- ip link add dev br0 type bridge
- for port in sw1p0 sw1p1 sw1p2 sw1p3 \
- sw2p0 sw2p1 sw2p2 sw2p3; do
- ip link set dev $port master br0
- done
- ip link set dev br0 type bridge vlan_filtering 1
- for port in sw1p0 sw1p1 sw1p2 sw1p3 \
- sw2p0 sw2p1 sw2p2; do
- bridge vlan add dev $port vid 100
- done
- ip link add link br0 name br0.100 type vlan id 100 && ip link set dev br0.100 up
- ip addr add 192.168.100.3/24 dev br0.100
- bridge vlan add dev br0 vid 100 self
-
- bridge vlan
- port vlan ids
- sw1p0 1 PVID Egress Untagged
- 100
-
- sw1p1 1 PVID Egress Untagged
- 100
-
- sw1p2 1 PVID Egress Untagged
- 100
-
- sw1p3 1 PVID Egress Untagged
- 100
-
- sw2p0 1 PVID Egress Untagged
- 100
-
- sw2p1 1 PVID Egress Untagged
- 100
-
- sw2p2 1 PVID Egress Untagged
- 100
-
- sw2p3 1 PVID Egress Untagged
-
- br0 1 PVID Egress Untagged
- 100
-
-SJA1105 switch 1 consumes 1 retagging entry for each VLAN on each user port
-towards the CPU. It also consumes 1 retagging entry for each non-pvid VLAN that
-it is also interested in, which is configured on any port of any neighbor
-switch.
-
-In this case, SJA1105 switch 1 consumes a total of 11 retagging entries, as
-follows:
-
-- 8 retagging entries for VLANs 1 and 100 installed on its user ports
- (``sw1p0`` - ``sw1p3``)
-- 3 retagging entries for VLAN 100 installed on the user ports of SJA1105
- switch 2 (``sw2p0`` - ``sw2p2``), because it also has ports that are
- interested in it. The VLAN 1 is a pvid on SJA1105 switch 2 and does not need
- reverse retagging.
-
-SJA1105 switch 2 also consumes 11 retagging entries, but organized as follows:
-
-- 7 retagging entries for the bridge VLANs on its user ports (``sw2p0`` -
- ``sw2p3``).
-- 4 retagging entries for VLAN 100 installed on the user ports of SJA1105
- switch 1 (``sw1p0`` - ``sw1p3``).
-
Switching features
==================
@@ -280,33 +89,10 @@ untagged), and therefore this mode is also supported.
Segregating the switch ports in multiple bridges is supported (e.g. 2 + 2), but
all bridges should have the same level of VLAN awareness (either both have
-``vlan_filtering`` 0, or both 1). Also an inevitable limitation of the fact
-that VLAN awareness is global at the switch level is that once a bridge with
-``vlan_filtering`` enslaves at least one switch port, the other un-bridged
-ports are no longer available for standalone traffic termination.
+``vlan_filtering`` 0, or both 1).
Topology and loop detection through STP is supported.
-L2 FDB manipulation (add/delete/dump) is currently possible for the first
-generation devices. Aging time of FDB entries, as well as enabling fully static
-management (no address learning and no flooding of unknown traffic) is not yet
-configurable in the driver.
-
-A special comment about bridging with other netdevices (illustrated with an
-example):
-
-A board has eth0, eth1, swp0@eth1, swp1@eth1, swp2@eth1, swp3@eth1.
-The switch ports (swp0-3) are under br0.
-It is desired that eth0 is turned into another switched port that communicates
-with swp0-3.
-
-If br0 has vlan_filtering 0, then eth0 can simply be added to br0 with the
-intended results.
-If br0 has vlan_filtering 1, then a new br1 interface needs to be created that
-enslaves eth0 and eth1 (the DSA master of the switch ports). This is because in
-this mode, the switch ports beneath br0 are not capable of regular traffic, and
-are only used as a conduit for switchdev operations.
-
Offloads
========
@@ -510,7 +296,7 @@ not available.
Device Tree bindings and board design
=====================================
-This section references ``Documentation/devicetree/bindings/net/dsa/sja1105.txt``
+This section references ``Documentation/devicetree/bindings/net/dsa/nxp,sja1105.yaml``
and aims to showcase some potential switch caveats.
RMII PHY role and out-of-band signaling
@@ -579,3 +365,54 @@ A board would need to hook up the PHYs connected to the switch to any other
MDIO bus available to Linux within the system (e.g. to the DSA master's MDIO
bus). Link state management then works by the driver manually keeping in sync
(over SPI commands) the MAC link speed with the settings negotiated by the PHY.
+
+By comparison, the SJA1110 supports an MDIO slave access point over which its
+internal 100base-T1 PHYs can be accessed from the host. This is, however, not
+used by the driver, instead the internal 100base-T1 and 100base-TX PHYs are
+accessed through SPI commands, modeled in Linux as virtual MDIO buses.
+
+The microcontroller attached to the SJA1110 port 0 also has an MDIO controller
+operating in master mode, however the driver does not support this either,
+since the microcontroller gets disabled when the Linux driver operates.
+Discrete PHYs connected to the switch ports should have their MDIO interface
+attached to an MDIO controller from the host system and not to the switch,
+similar to SJA1105.
+
+Port compatibility matrix
+-------------------------
+
+The SJA1105 port compatibility matrix is:
+
+===== ============== ============== ==============
+Port SJA1105E/T SJA1105P/Q SJA1105R/S
+===== ============== ============== ==============
+0 xMII xMII xMII
+1 xMII xMII xMII
+2 xMII xMII xMII
+3 xMII xMII xMII
+4 xMII xMII SGMII
+===== ============== ============== ==============
+
+
+The SJA1110 port compatibility matrix is:
+
+===== ============== ============== ============== ==============
+Port SJA1110A SJA1110B SJA1110C SJA1110D
+===== ============== ============== ============== ==============
+0 RevMII (uC) RevMII (uC) RevMII (uC) RevMII (uC)
+1 100base-TX 100base-TX 100base-TX
+ or SGMII SGMII
+2 xMII xMII xMII xMII
+ or SGMII or SGMII
+3 xMII xMII xMII
+ or SGMII or SGMII SGMII
+ or 2500base-X or 2500base-X or 2500base-X
+4 SGMII SGMII SGMII SGMII
+ or 2500base-X or 2500base-X or 2500base-X or 2500base-X
+5 100base-T1 100base-T1 100base-T1 100base-T1
+6 100base-T1 100base-T1 100base-T1 100base-T1
+7 100base-T1 100base-T1 100base-T1 100base-T1
+8 100base-T1 100base-T1 n/a n/a
+9 100base-T1 100base-T1 n/a n/a
+10 100base-T1 n/a n/a n/a
+===== ============== ============== ============== ==============
diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index 25131df3c2bd..d9b55b7a1a4d 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -212,6 +212,7 @@ Userspace to kernel:
``ETHTOOL_MSG_FEC_SET`` set FEC settings
``ETHTOOL_MSG_MODULE_EEPROM_GET`` read SFP module EEPROM
``ETHTOOL_MSG_STATS_GET`` get standard statistics
+ ``ETHTOOL_MSG_PHC_VCLOCKS_GET`` get PHC virtual clocks info
===================================== ================================
Kernel to userspace:
@@ -250,6 +251,7 @@ Kernel to userspace:
``ETHTOOL_MSG_FEC_NTF`` FEC settings
``ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY`` read SFP module EEPROM
``ETHTOOL_MSG_STATS_GET_REPLY`` standard statistics
+ ``ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY`` PHC virtual clocks info
======================================== =================================
``GET`` requests are sent by userspace applications to retrieve device
@@ -593,6 +595,14 @@ Link extended substates:
that is not formally
supported, which led to
signal integrity issues
+
+ ``ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_REFERENCE_CLOCK_LOST`` The external clock signal for
+ SerDes is too weak or
+ unavailable.
+
+ ``ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_ALOS`` The received signal for
+ SerDes is too weak because
+ analog loss of signal.
================================================================= =============================
Cable issue substates:
@@ -937,12 +947,25 @@ Kernel response contents:
``ETHTOOL_A_COALESCE_TX_USECS_HIGH`` u32 delay (us), high Tx
``ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH`` u32 max packets, high Tx
``ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL`` u32 rate sampling interval
+ ``ETHTOOL_A_COALESCE_USE_CQE_TX`` bool timer reset mode, Tx
+ ``ETHTOOL_A_COALESCE_USE_CQE_RX`` bool timer reset mode, Rx
=========================================== ====== =======================
Attributes are only included in reply if their value is not zero or the
corresponding bit in ``ethtool_ops::supported_coalesce_params`` is set (i.e.
they are declared as supported by driver).
+Timer reset mode (``ETHTOOL_A_COALESCE_USE_CQE_TX`` and
+``ETHTOOL_A_COALESCE_USE_CQE_RX``) controls the interaction between packet
+arrival and the various time based delay parameters. By default timers are
+expected to limit the max delay between any packet arrival/departure and a
+corresponding interrupt. In this mode timer should be started by packet
+arrival (sometimes delivery of previous interrupt) and reset when interrupt
+is delivered.
+Setting the appropriate attribute to 1 will enable ``CQE`` mode, where
+each packet event resets the timer. In this mode timer is used to force
+the interrupt if queue goes idle, while busy queues depend on the packet
+limit to trigger interrupts.
COALESCE_SET
============
@@ -975,6 +998,8 @@ Request contents:
``ETHTOOL_A_COALESCE_TX_USECS_HIGH`` u32 delay (us), high Tx
``ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH`` u32 max packets, high Tx
``ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL`` u32 rate sampling interval
+ ``ETHTOOL_A_COALESCE_USE_CQE_TX`` bool timer reset mode, Tx
+ ``ETHTOOL_A_COALESCE_USE_CQE_RX`` bool timer reset mode, Rx
=========================================== ====== =======================
Request is rejected if it attributes declared as unsupported by driver (i.e.
@@ -1363,8 +1388,8 @@ in an implementation specific way.
``ETHTOOL_A_FEC_AUTO`` requests the driver to choose FEC mode based on SFP
module parameters. This does not mean autonegotiation.
-MODULE_EEPROM
-=============
+MODULE_EEPROM_GET
+=================
Fetch module EEPROM data dump.
This interface is designed to allow dumps of at most 1/2 page at once. This
@@ -1383,12 +1408,14 @@ Request contents:
``ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS`` u8 page I2C address
======================================= ====== ==========================
+If ``ETHTOOL_A_MODULE_EEPROM_BANK`` is not specified, bank 0 is assumed.
+
Kernel response contents:
+---------------------------------------------+--------+---------------------+
| ``ETHTOOL_A_MODULE_EEPROM_HEADER`` | nested | reply header |
+---------------------------------------------+--------+---------------------+
- | ``ETHTOOL_A_MODULE_EEPROM_DATA`` | nested | array of bytes from |
+ | ``ETHTOOL_A_MODULE_EEPROM_DATA`` | binary | array of bytes from |
| | | module EEPROM |
+---------------------------------------------+--------+---------------------+
@@ -1475,6 +1502,25 @@ Low and high bounds are inclusive, for example:
etherStatsPkts512to1023Octets 512 1023
============================= ==== ====
+PHC_VCLOCKS_GET
+===============
+
+Query device PHC virtual clocks information.
+
+Request contents:
+
+ ==================================== ====== ==========================
+ ``ETHTOOL_A_PHC_VCLOCKS_HEADER`` nested request header
+ ==================================== ====== ==========================
+
+Kernel response contents:
+
+ ==================================== ====== ==========================
+ ``ETHTOOL_A_PHC_VCLOCKS_HEADER`` nested reply header
+ ``ETHTOOL_A_PHC_VCLOCKS_NUM`` u32 PHC virtual clocks number
+ ``ETHTOOL_A_PHC_VCLOCKS_INDEX`` s32 PHC index array
+ ==================================== ====== ==========================
+
Request translation
===================
@@ -1573,4 +1619,5 @@ are netlink only.
n/a ``ETHTOOL_MSG_CABLE_TEST_ACT``
n/a ``ETHTOOL_MSG_CABLE_TEST_TDR_ACT``
n/a ``ETHTOOL_MSG_TUNNEL_INFO_GET``
+ n/a ``ETHTOOL_MSG_PHC_VCLOCKS_GET``
=================================== =====================================
diff --git a/Documentation/networking/filter.rst b/Documentation/networking/filter.rst
index 3e2221f4abe4..ce2b8e8bb9ab 100644
--- a/Documentation/networking/filter.rst
+++ b/Documentation/networking/filter.rst
@@ -320,13 +320,6 @@ Examples for low-level BPF:
ret #-1
drop: ret #0
-**(Accelerated) VLAN w/ id 10**::
-
- ld vlan_tci
- jneq #10, drop
- ret #-1
- drop: ret #0
-
**icmp random packet sampling, 1 in 4**::
ldh [12]
@@ -358,6 +351,22 @@ Examples for low-level BPF:
bad: ret #0 /* SECCOMP_RET_KILL_THREAD */
good: ret #0x7fff0000 /* SECCOMP_RET_ALLOW */
+Examples for low-level BPF extension:
+
+**Packet for interface index 13**::
+
+ ld ifidx
+ jneq #13, drop
+ ret #-1
+ drop: ret #0
+
+**(Accelerated) VLAN w/ id 10**::
+
+ ld vlan_tci
+ jneq #10, drop
+ ret #-1
+ drop: ret #0
+
The above example code can be placed into a file (here called "foo"), and
then be passed to the bpf_asm tool for generating opcodes, output that xt_bpf
and cls_bpf understands and can directly be loaded with. Example with above
@@ -629,8 +638,8 @@ extension, PTP dissector/classifier, and much more. They are all internally
converted by the kernel into the new instruction set representation and run
in the eBPF interpreter. For in-kernel handlers, this all works transparently
by using bpf_prog_create() for setting up the filter, resp.
-bpf_prog_destroy() for destroying it. The macro
-BPF_PROG_RUN(filter, ctx) transparently invokes eBPF interpreter or JITed
+bpf_prog_destroy() for destroying it. The function
+bpf_prog_run(filter, ctx) transparently invokes eBPF interpreter or JITed
code to run the filter. 'filter' is a pointer to struct bpf_prog that we
got from bpf_prog_create(), and 'ctx' the given context (e.g.
skb pointer). All constraints and restrictions from bpf_check_classic() apply
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index e9ce55992aa9..58bc8cd367c6 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -57,6 +57,7 @@ Contents:
gen_stats
gtp
ila
+ ioam6-sysctl
ipddp
ip_dynaddr
ipsec
@@ -68,6 +69,7 @@ Contents:
l2tp
lapb-module
mac80211-injection
+ mctp
mpls-sysctl
mptcp-sysctl
multiqueue
diff --git a/Documentation/networking/ioam6-sysctl.rst b/Documentation/networking/ioam6-sysctl.rst
new file mode 100644
index 000000000000..c18cab2c481a
--- /dev/null
+++ b/Documentation/networking/ioam6-sysctl.rst
@@ -0,0 +1,26 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====================
+IOAM6 Sysfs variables
+=====================
+
+
+/proc/sys/net/conf/<iface>/ioam6_* variables:
+=============================================
+
+ioam6_enabled - BOOL
+ Accept (= enabled) or ignore (= disabled) IPv6 IOAM options on ingress
+ for this interface.
+
+ * 0 - disabled (default)
+ * 1 - enabled
+
+ioam6_id - SHORT INTEGER
+ Define the IOAM id of this interface.
+
+ Default is ~0.
+
+ioam6_id_wide - INTEGER
+ Define the wide IOAM id of this interface.
+
+ Default is ~0.
diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index c2ecc9894fd0..d91ab28718d4 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -99,6 +99,35 @@ fib_multipath_hash_policy - INTEGER
- 0 - Layer 3
- 1 - Layer 4
- 2 - Layer 3 or inner Layer 3 if present
+ - 3 - Custom multipath hash. Fields used for multipath hash calculation
+ are determined by fib_multipath_hash_fields sysctl
+
+fib_multipath_hash_fields - UNSIGNED INTEGER
+ When fib_multipath_hash_policy is set to 3 (custom multipath hash), the
+ fields used for multipath hash calculation are determined by this
+ sysctl.
+
+ This value is a bitmask which enables various fields for multipath hash
+ calculation.
+
+ Possible fields are:
+
+ ====== ============================
+ 0x0001 Source IP address
+ 0x0002 Destination IP address
+ 0x0004 IP protocol
+ 0x0008 Unused (Flow Label)
+ 0x0010 Source port
+ 0x0020 Destination port
+ 0x0040 Inner source IP address
+ 0x0080 Inner destination IP address
+ 0x0100 Inner IP protocol
+ 0x0200 Inner Flow Label
+ 0x0400 Inner source port
+ 0x0800 Inner destination port
+ ====== ============================
+
+ Default: 0x0007 (source IP, destination IP and IP protocol)
fib_sync_mem - UNSIGNED INTEGER
Amount of dirty memory from fib entries that can be backlogged before
@@ -732,6 +761,31 @@ tcp_syncookies - INTEGER
network connections you can set this knob to 2 to enable
unconditionally generation of syncookies.
+tcp_migrate_req - BOOLEAN
+ The incoming connection is tied to a specific listening socket when
+ the initial SYN packet is received during the three-way handshake.
+ When a listener is closed, in-flight request sockets during the
+ handshake and established sockets in the accept queue are aborted.
+
+ If the listener has SO_REUSEPORT enabled, other listeners on the
+ same port should have been able to accept such connections. This
+ option makes it possible to migrate such child sockets to another
+ listener after close() or shutdown().
+
+ The BPF_SK_REUSEPORT_SELECT_OR_MIGRATE type of eBPF program should
+ usually be used to define the policy to pick an alive listener.
+ Otherwise, the kernel will randomly pick an alive listener only if
+ this option is enabled.
+
+ Note that migration between listeners with different settings may
+ crash applications. Let's say migration happens from listener A to
+ B, and only B has TCP_SAVE_SYN enabled. B cannot read SYN data from
+ the requests migrated from A. To avoid such a situation, cancel
+ migration by returning SK_DROP in the type of eBPF program, or
+ disable this option.
+
+ Default: 0
+
tcp_fastopen - INTEGER
Enable TCP Fast Open (RFC7413) to send and accept data in the opening
SYN packet.
@@ -772,7 +826,7 @@ tcp_fastopen_blackhole_timeout_sec - INTEGER
initial value when the blackhole issue goes away.
0 to disable the blackhole detection.
- By default, it is set to 1hr.
+ By default, it is set to 0 (feature is disabled).
tcp_fastopen_key - list of comma separated 32-digit hexadecimal INTEGERs
The list consists of a primary key and an optional backup key. The
@@ -1743,6 +1797,35 @@ fib_multipath_hash_policy - INTEGER
- 0 - Layer 3 (source and destination addresses plus flow label)
- 1 - Layer 4 (standard 5-tuple)
- 2 - Layer 3 or inner Layer 3 if present
+ - 3 - Custom multipath hash. Fields used for multipath hash calculation
+ are determined by fib_multipath_hash_fields sysctl
+
+fib_multipath_hash_fields - UNSIGNED INTEGER
+ When fib_multipath_hash_policy is set to 3 (custom multipath hash), the
+ fields used for multipath hash calculation are determined by this
+ sysctl.
+
+ This value is a bitmask which enables various fields for multipath hash
+ calculation.
+
+ Possible fields are:
+
+ ====== ============================
+ 0x0001 Source IP address
+ 0x0002 Destination IP address
+ 0x0004 IP protocol
+ 0x0008 Flow Label
+ 0x0010 Source port
+ 0x0020 Destination port
+ 0x0040 Inner source IP address
+ 0x0080 Inner destination IP address
+ 0x0100 Inner IP protocol
+ 0x0200 Inner Flow Label
+ 0x0400 Inner source port
+ 0x0800 Inner destination port
+ ====== ============================
+
+ Default: 0x0007 (source IP, destination IP and IP protocol)
anycast_src_echo_reply - BOOLEAN
Controls the use of anycast addresses as source addresses for ICMPv6
@@ -1843,6 +1926,23 @@ fib_notify_on_flag_change - INTEGER
- 1 - Emit notifications.
- 2 - Emit notifications only for RTM_F_OFFLOAD_FAILED flag change.
+ioam6_id - INTEGER
+ Define the IOAM id of this node. Uses only 24 bits out of 32 in total.
+
+ Min: 0
+ Max: 0xFFFFFF
+
+ Default: 0xFFFFFF
+
+ioam6_id_wide - LONG INTEGER
+ Define the wide IOAM id of this node. Uses only 56 bits out of 64 in
+ total. Can be different from ioam6_id.
+
+ Min: 0
+ Max: 0xFFFFFFFFFFFFFF
+
+ Default: 0xFFFFFFFFFFFFFF
+
IPv6 Fragmentation:
ip6frag_high_thresh - INTEGER
@@ -2751,6 +2851,18 @@ encap_port - INTEGER
Default: 0
+plpmtud_probe_interval - INTEGER
+ The time interval (in milliseconds) for the PLPMTUD probe timer,
+ which is configured to expire after this period to receive an
+ acknowledgment to a probe packet. This is also the time interval
+ between the probes for the current pmtu when the probe search
+ is done.
+
+ PLPMTUD will be disabled when 0 is set, and other values for it
+ must be >= 5000.
+
+ Default: 0
+
``/proc/sys/net/core/*``
========================
diff --git a/Documentation/networking/mctp.rst b/Documentation/networking/mctp.rst
new file mode 100644
index 000000000000..fa7730dbf7b9
--- /dev/null
+++ b/Documentation/networking/mctp.rst
@@ -0,0 +1,213 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============================================
+Management Component Transport Protocol (MCTP)
+==============================================
+
+net/mctp/ contains protocol support for MCTP, as defined by DMTF standard
+DSP0236. Physical interface drivers ("bindings" in the specification) are
+provided in drivers/net/mctp/.
+
+The core code provides a socket-based interface to send and receive MCTP
+messages, through an AF_MCTP, SOCK_DGRAM socket.
+
+Structure: interfaces & networks
+================================
+
+The kernel models the local MCTP topology through two items: interfaces and
+networks.
+
+An interface (or "link") is an instance of an MCTP physical transport binding
+(as defined by DSP0236, section 3.2.47), likely connected to a specific hardware
+device. This is represented as a ``struct netdevice``.
+
+A network defines a unique address space for MCTP endpoints by endpoint-ID
+(described by DSP0236, section 3.2.31). A network has a user-visible identifier
+to allow references from userspace. Route definitions are specific to one
+network.
+
+Interfaces are associated with one network. A network may be associated with one
+or more interfaces.
+
+If multiple networks are present, each may contain endpoint IDs (EIDs) that are
+also present on other networks.
+
+Sockets API
+===========
+
+Protocol definitions
+--------------------
+
+MCTP uses ``AF_MCTP`` / ``PF_MCTP`` for the address- and protocol- families.
+Since MCTP is message-based, only ``SOCK_DGRAM`` sockets are supported.
+
+.. code-block:: C
+
+ int sd = socket(AF_MCTP, SOCK_DGRAM, 0);
+
+The only (current) value for the ``protocol`` argument is 0.
+
+As with all socket address families, source and destination addresses are
+specified with a ``sockaddr`` type, with a single-byte endpoint address:
+
+.. code-block:: C
+
+ typedef __u8 mctp_eid_t;
+
+ struct mctp_addr {
+ mctp_eid_t s_addr;
+ };
+
+ struct sockaddr_mctp {
+ __kernel_sa_family_t smctp_family;
+ unsigned int smctp_network;
+ struct mctp_addr smctp_addr;
+ __u8 smctp_type;
+ __u8 smctp_tag;
+ };
+
+ #define MCTP_NET_ANY 0x0
+ #define MCTP_ADDR_ANY 0xff
+
+
+Syscall behaviour
+-----------------
+
+The following sections describe the MCTP-specific behaviours of the standard
+socket system calls. These behaviours have been chosen to map closely to the
+existing sockets APIs.
+
+``bind()`` : set local socket address
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Sockets that receive incoming request packets will bind to a local address,
+using the ``bind()`` syscall.
+
+.. code-block:: C
+
+ struct sockaddr_mctp addr;
+
+ addr.smctp_family = AF_MCTP;
+ addr.smctp_network = MCTP_NET_ANY;
+ addr.smctp_addr.s_addr = MCTP_ADDR_ANY;
+ addr.smctp_type = MCTP_TYPE_PLDM;
+ addr.smctp_tag = MCTP_TAG_OWNER;
+
+ int rc = bind(sd, (struct sockaddr *)&addr, sizeof(addr));
+
+This establishes the local address of the socket. Incoming MCTP messages that
+match the network, address, and message type will be received by this socket.
+The reference to 'incoming' is important here; a bound socket will only receive
+messages with the TO bit set, to indicate an incoming request message, rather
+than a response.
+
+The ``smctp_tag`` value will configure the tags accepted from the remote side of
+this socket. Given the above, the only valid value is ``MCTP_TAG_OWNER``, which
+will result in remotely "owned" tags being routed to this socket. Since
+``MCTP_TAG_OWNER`` is set, the 3 least-significant bits of ``smctp_tag`` are not
+used; callers must set them to zero.
+
+A ``smctp_network`` value of ``MCTP_NET_ANY`` will configure the socket to
+receive incoming packets from any locally-connected network. A specific network
+value will cause the socket to only receive incoming messages from that network.
+
+The ``smctp_addr`` field specifies a local address to bind to. A value of
+``MCTP_ADDR_ANY`` configures the socket to receive messages addressed to any
+local destination EID.
+
+The ``smctp_type`` field specifies which message types to receive. Only the
+lower 7 bits of the type is matched on incoming messages (ie., the
+most-significant IC bit is not part of the match). This results in the socket
+receiving packets with and without a message integrity check footer.
+
+``sendto()``, ``sendmsg()``, ``send()`` : transmit an MCTP message
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+An MCTP message is transmitted using one of the ``sendto()``, ``sendmsg()`` or
+``send()`` syscalls. Using ``sendto()`` as the primary example:
+
+.. code-block:: C
+
+ struct sockaddr_mctp addr;
+ char buf[14];
+ ssize_t len;
+
+ /* set message destination */
+ addr.smctp_family = AF_MCTP;
+ addr.smctp_network = 0;
+ addr.smctp_addr.s_addr = 8;
+ addr.smctp_tag = MCTP_TAG_OWNER;
+ addr.smctp_type = MCTP_TYPE_ECHO;
+
+ /* arbitrary message to send, with message-type header */
+ buf[0] = MCTP_TYPE_ECHO;
+ memcpy(buf + 1, "hello, world!", sizeof(buf) - 1);
+
+ len = sendto(sd, buf, sizeof(buf), 0,
+ (struct sockaddr_mctp *)&addr, sizeof(addr));
+
+The network and address fields of ``addr`` define the remote address to send to.
+If ``smctp_tag`` has the ``MCTP_TAG_OWNER``, the kernel will ignore any bits set
+in ``MCTP_TAG_VALUE``, and generate a tag value suitable for the destination
+EID. If ``MCTP_TAG_OWNER`` is not set, the message will be sent with the tag
+value as specified. If a tag value cannot be allocated, the system call will
+report an errno of ``EAGAIN``.
+
+The application must provide the message type byte as the first byte of the
+message buffer passed to ``sendto()``. If a message integrity check is to be
+included in the transmitted message, it must also be provided in the message
+buffer, and the most-significant bit of the message type byte must be 1.
+
+The ``sendmsg()`` system call allows a more compact argument interface, and the
+message buffer to be specified as a scatter-gather list. At present no ancillary
+message types (used for the ``msg_control`` data passed to ``sendmsg()``) are
+defined.
+
+Transmitting a message on an unconnected socket with ``MCTP_TAG_OWNER``
+specified will cause an allocation of a tag, if no valid tag is already
+allocated for that destination. The (destination-eid,tag) tuple acts as an
+implicit local socket address, to allow the socket to receive responses to this
+outgoing message. If any previous allocation has been performed (to for a
+different remote EID), that allocation is lost.
+
+Sockets will only receive responses to requests they have sent (with TO=1) and
+may only respond (with TO=0) to requests they have received.
+
+``recvfrom()``, ``recvmsg()``, ``recv()`` : receive an MCTP message
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+An MCTP message can be received by an application using one of the
+``recvfrom()``, ``recvmsg()``, or ``recv()`` system calls. Using ``recvfrom()``
+as the primary example:
+
+.. code-block:: C
+
+ struct sockaddr_mctp addr;
+ socklen_t addrlen;
+ char buf[14];
+ ssize_t len;
+
+ addrlen = sizeof(addr);
+
+ len = recvfrom(sd, buf, sizeof(buf), 0,
+ (struct sockaddr_mctp *)&addr, &addrlen);
+
+ /* We can expect addr to describe an MCTP address */
+ assert(addrlen >= sizeof(buf));
+ assert(addr.smctp_family == AF_MCTP);
+
+ printf("received %zd bytes from remote EID %d\n", rc, addr.smctp_addr);
+
+The address argument to ``recvfrom`` and ``recvmsg`` is populated with the
+remote address of the incoming message, including tag value (this will be needed
+in order to reply to the message).
+
+The first byte of the message buffer will contain the message type byte. If an
+integrity check follows the message, it will be included in the received buffer.
+
+The ``recv()`` system call behaves in a similar way, but does not provide a
+remote address to the application. Therefore, these are only useful if the
+remote address is already known, or the message does not require a reply.
+
+Like the send calls, sockets will only receive responses to requests they have
+sent (TO=1) and may only respond (TO=0) to requests they have received.
diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/networking/mptcp-sysctl.rst
index 6af0196c4297..b0d4da71e68e 100644
--- a/Documentation/networking/mptcp-sysctl.rst
+++ b/Documentation/networking/mptcp-sysctl.rst
@@ -7,13 +7,13 @@ MPTCP Sysfs variables
/proc/sys/net/mptcp/* Variables
===============================
-enabled - INTEGER
+enabled - BOOLEAN
Control whether MPTCP sockets can be created.
- MPTCP sockets can be created if the value is nonzero. This is
- a per-namespace sysctl.
+ MPTCP sockets can be created if the value is 1. This is a
+ per-namespace sysctl.
- Default: 1
+ Default: 1 (enabled)
add_addr_timeout - INTEGER (seconds)
Set the timeout after which an ADD_ADDR control message will be
@@ -24,3 +24,36 @@ add_addr_timeout - INTEGER (seconds)
sysctl.
Default: 120
+
+checksum_enabled - BOOLEAN
+ Control whether DSS checksum can be enabled.
+
+ DSS checksum can be enabled if the value is nonzero. This is a
+ per-namespace sysctl.
+
+ Default: 0
+
+allow_join_initial_addr_port - BOOLEAN
+ Allow peers to send join requests to the IP address and port number used
+ by the initial subflow if the value is 1. This controls a flag that is
+ sent to the peer at connection time, and whether such join requests are
+ accepted or denied.
+
+ Joins to addresses advertised with ADD_ADDR are not affected by this
+ value.
+
+ This is a per-namespace sysctl.
+
+ Default: 1
+
+stale_loss_cnt - INTEGER
+ The number of MPTCP-level retransmission intervals with no traffic and
+ pending outstanding data on a given subflow required to declare it stale.
+ The packet scheduler ignores stale subflows.
+ A low stale_loss_cnt value allows for fast active-backup switch-over,
+ an high value maximize links utilization on edge scenarios e.g. lossy
+ link with high BER or peer pausing the data processing.
+
+ This is a per-namespace sysctl.
+
+ Default: 4
diff --git a/Documentation/networking/netdev-FAQ.rst b/Documentation/networking/netdev-FAQ.rst
index 91b2cf712801..e26532f49760 100644
--- a/Documentation/networking/netdev-FAQ.rst
+++ b/Documentation/networking/netdev-FAQ.rst
@@ -228,6 +228,23 @@ before posting to the mailing list. The patchwork build bot instance
gets overloaded very easily and netdev@vger really doesn't need more
traffic if we can help it.
+netdevsim is great, can I extend it for my out-of-tree tests?
+-------------------------------------------------------------
+
+No, `netdevsim` is a test vehicle solely for upstream tests.
+(Please add your tests under tools/testing/selftests/.)
+
+We also give no guarantees that `netdevsim` won't change in the future
+in a way which would break what would normally be considered uAPI.
+
+Is netdevsim considered a "user" of an API?
+-------------------------------------------
+
+Linux kernel has a long standing rule that no API should be added unless
+it has a real, in-tree user. Mock-ups and tests based on `netdevsim` are
+strongly encouraged when adding new APIs, but `netdevsim` in itself
+is **not** considered a use case/user.
+
Any other tips to help ensure my net/net-next patch gets OK'd?
--------------------------------------------------------------
Attention to detail. Re-read your own work as if you were the
diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst
index 17bdcb746dcf..9e4cccb90b87 100644
--- a/Documentation/networking/netdevices.rst
+++ b/Documentation/networking/netdevices.rst
@@ -222,6 +222,35 @@ ndo_do_ioctl:
Synchronization: rtnl_lock() semaphore.
Context: process
+ This is only called by network subsystems internally,
+ not by user space calling ioctl as it was in before
+ linux-5.14.
+
+ndo_siocbond:
+ Synchronization: rtnl_lock() semaphore.
+ Context: process
+
+ Used by the bonding driver for the SIOCBOND family of
+ ioctl commands.
+
+ndo_siocwandev:
+ Synchronization: rtnl_lock() semaphore.
+ Context: process
+
+ Used by the drivers/net/wan framework to handle
+ the SIOCWANDEV ioctl with the if_settings structure.
+
+ndo_siocdevprivate:
+ Synchronization: rtnl_lock() semaphore.
+ Context: process
+
+ This is used to implement SIOCDEVPRIVATE ioctl helpers.
+ These should not be added to new drivers, so don't use.
+
+ndo_eth_ioctl:
+ Synchronization: rtnl_lock() semaphore.
+ Context: process
+
ndo_get_stats:
Synchronization: rtnl_lock() semaphore, dev_base_lock rwlock, or RCU.
Context: atomic (can't sleep under rwlock or RCU)
diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst
index 11a9b76786cb..311128abb768 100644
--- a/Documentation/networking/nf_conntrack-sysctl.rst
+++ b/Documentation/networking/nf_conntrack-sysctl.rst
@@ -17,9 +17,8 @@ nf_conntrack_acct - BOOLEAN
nf_conntrack_buckets - INTEGER
Size of hash table. If not specified as parameter during module
loading, the default size is calculated by dividing total memory
- by 16384 to determine the number of buckets but the hash table will
- never have fewer than 32 and limited to 16384 buckets. For systems
- with more than 4GB of memory it will be 65536 buckets.
+ by 16384 to determine the number of buckets. The hash table will
+ never have fewer than 1024 and never more than 262144 buckets.
This sysctl is only writeable in the initial net namespace.
nf_conntrack_checksum - BOOLEAN
@@ -100,8 +99,12 @@ nf_conntrack_log_invalid - INTEGER
Log invalid packets of a type specified by value.
nf_conntrack_max - INTEGER
- Size of connection tracking table. Default value is
- nf_conntrack_buckets value * 4.
+ Maximum number of allowed connection tracking entries. This value is set
+ to nf_conntrack_buckets by default.
+ Note that connection tracking entries are added to the table twice -- once
+ for the original direction and once for the reply direction (i.e., with
+ the reversed address). This means that with default settings a maxed-out
+ table will have a average hash chain length of 2, not 1.
nf_conntrack_tcp_be_liberal - BOOLEAN
- 0 - disabled (default)
@@ -110,6 +113,12 @@ nf_conntrack_tcp_be_liberal - BOOLEAN
Be conservative in what you do, be liberal in what you accept from others.
If it's non-zero, we mark only out of window RST segments as INVALID.
+nf_conntrack_tcp_ignore_invalid_rst - BOOLEAN
+ - 0 - disabled (default)
+ - 1 - enabled
+
+ If it's 1, we don't mark out of window RST segments as INVALID.
+
nf_conntrack_tcp_loose - BOOLEAN
- 0 - disabled
- not 0 - enabled (default)
@@ -177,3 +186,24 @@ nf_conntrack_gre_timeout_stream - INTEGER (seconds)
This extended timeout will be used in case there is an GRE stream
detected.
+
+nf_hooks_lwtunnel - BOOLEAN
+ - 0 - disabled (default)
+ - not 0 - enabled
+
+ If this option is enabled, the lightweight tunnel netfilter hooks are
+ enabled. This option cannot be disabled once it is enabled.
+
+nf_flowtable_tcp_timeout - INTEGER (seconds)
+ default 30
+
+ Control offload timeout for tcp connections.
+ TCP connections may be offloaded from nf conntrack to nf flow table.
+ Once aged, the connection is returned to nf conntrack with tcp pickup timeout.
+
+nf_flowtable_udp_timeout - INTEGER (seconds)
+ default 30
+
+ Control offload timeout for udp connections.
+ UDP connections may be offloaded from nf conntrack to nf flow table.
+ Once aged, the connection is returned to nf conntrack with udp pickup timeout.
diff --git a/Documentation/networking/operstates.rst b/Documentation/networking/operstates.rst
index 9c918f7cb0e8..1ee2141e8ef1 100644
--- a/Documentation/networking/operstates.rst
+++ b/Documentation/networking/operstates.rst
@@ -73,7 +73,9 @@ IF_OPER_LOWERLAYERDOWN (3):
state (f.e. VLAN).
IF_OPER_TESTING (4):
- Unused in current kernel.
+ Interface is in testing mode, for example executing driver self-tests
+ or media (cable) test. It can't be used for normal traffic until tests
+ complete.
IF_OPER_DORMANT (5):
Interface is L1 up, but waiting for an external event, f.e. for a
@@ -111,7 +113,7 @@ it as lower layer.
Note that for certain kind of soft-devices, which are not managing any
real hardware, it is possible to set this bit from userspace. One
-should use TVL IFLA_CARRIER to do so.
+should use TLV IFLA_CARRIER to do so.
netif_carrier_ok() can be used to query that bit.
diff --git a/Documentation/networking/packet_mmap.rst b/Documentation/networking/packet_mmap.rst
index 500ef60b1b82..c5da1a5d93de 100644
--- a/Documentation/networking/packet_mmap.rst
+++ b/Documentation/networking/packet_mmap.rst
@@ -153,7 +153,7 @@ As capture, each frame contains two parts::
struct ifreq s_ifr;
...
- strncpy (s_ifr.ifr_name, "eth0", sizeof(s_ifr.ifr_name));
+ strscpy_pad (s_ifr.ifr_name, "eth0", sizeof(s_ifr.ifr_name));
/* get interface index of eth0 */
ioctl(this->socket, SIOCGIFINDEX, &s_ifr);
diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst
index 3f05d50ecd6e..571ba08386e7 100644
--- a/Documentation/networking/phy.rst
+++ b/Documentation/networking/phy.rst
@@ -292,6 +292,12 @@ Some of the interface modes are described below:
Note: due to legacy usage, some 10GBASE-R usage incorrectly makes
use of this definition.
+``PHY_INTERFACE_MODE_25GBASER``
+ This is the IEEE 802.3 PCS Clause 107 defined 25GBASE-R protocol.
+ The PCS is identical to 10GBASE-R, i.e. 64B/66B encoded
+ running 2.5 as fast, giving a fixed bit rate of 25.78125 Gbaud.
+ Please refer to the IEEE standard for further information.
+
``PHY_INTERFACE_MODE_100BASEX``
This defines IEEE 802.3 Clause 24. The link operates at a fixed data
rate of 125Mpbs using a 4B/5B encoding scheme, resulting in an underlying
diff --git a/Documentation/networking/pktgen.rst b/Documentation/networking/pktgen.rst
index 7afa1c9f1183..1225f0f63ff0 100644
--- a/Documentation/networking/pktgen.rst
+++ b/Documentation/networking/pktgen.rst
@@ -248,26 +248,24 @@ Usage:::
-i : ($DEV) output interface/device (required)
-s : ($PKT_SIZE) packet size
- -d : ($DEST_IP) destination IP
+ -d : ($DEST_IP) destination IP. CIDR (e.g. 198.18.0.0/15) is also allowed
-m : ($DST_MAC) destination MAC-addr
+ -p : ($DST_PORT) destination PORT range (e.g. 433-444) is also allowed
-t : ($THREADS) threads to start
+ -f : ($F_THREAD) index of first thread (zero indexed CPU number)
-c : ($SKB_CLONE) SKB clones send before alloc new SKB
+ -n : ($COUNT) num messages to send per thread, 0 means indefinitely
-b : ($BURST) HW level bursting of SKBs
-v : ($VERBOSE) verbose
-x : ($DEBUG) debug
+ -6 : ($IP6) IPv6
+ -w : ($DELAY) Tx Delay value (ns)
+ -a : ($APPEND) Script will not reset generator's state, but will append its config
The global variables being set are also listed. E.g. the required
interface/device parameter "-i" sets variable $DEV. Copy the
pktgen_sampleXX scripts and modify them to fit your own needs.
-The old scripts::
-
- pktgen.conf-1-2 # 1 CPU 2 dev
- pktgen.conf-1-1-rdos # 1 CPU 1 dev w. route DoS
- pktgen.conf-1-1-ip6 # 1 CPU 1 dev ipv6
- pktgen.conf-1-1-ip6-rdos # 1 CPU 1 dev ipv6 w. route DoS
- pktgen.conf-1-1-flows # 1 CPU 1 dev multiple flows.
-
Interrupt affinity
===================
@@ -398,7 +396,7 @@ Current commands and configuration options
References:
- ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/
-- tp://robur.slu.se/pub/Linux/net-development/pktgen-testing/examples/
+- ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/examples/
Paper from Linux-Kongress in Erlangen 2004.
- ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/pktgen_paper.pdf
diff --git a/Documentation/networking/timestamping.rst b/Documentation/networking/timestamping.rst
index 7db3985359bc..a722eb30e014 100644
--- a/Documentation/networking/timestamping.rst
+++ b/Documentation/networking/timestamping.rst
@@ -625,7 +625,7 @@ interfaces of a DSA switch to share the same PHC.
By design, PTP timestamping with a DSA switch does not need any special
handling in the driver for the host port it is attached to. However, when the
host port also supports PTP timestamping, DSA will take care of intercepting
-the ``.ndo_do_ioctl`` calls towards the host port, and block attempts to enable
+the ``.ndo_eth_ioctl`` calls towards the host port, and block attempts to enable
hardware timestamping on it. This is because the SO_TIMESTAMPING API does not
allow the delivery of multiple hardware timestamps for the same packet, so
anybody else except for the DSA switch port must be prevented from doing so.
@@ -688,7 +688,7 @@ ethtool ioctl operations for them need to be mediated by their respective MAC
driver. Therefore, as opposed to DSA switches, modifications need to be done
to each individual MAC driver for PHY timestamping support. This entails:
-- Checking, in ``.ndo_do_ioctl``, whether ``phy_has_hwtstamp(netdev->phydev)``
+- Checking, in ``.ndo_eth_ioctl``, whether ``phy_has_hwtstamp(netdev->phydev)``
is true or not. If it is, then the MAC driver should not process this request
but instead pass it on to the PHY using ``phy_mii_ioctl()``.
@@ -747,7 +747,7 @@ For example, a typical driver design for TX timestamping might be to split the
transmission part into 2 portions:
1. "TX": checks whether PTP timestamping has been previously enabled through
- the ``.ndo_do_ioctl`` ("``priv->hwtstamp_tx_enabled == true``") and the
+ the ``.ndo_eth_ioctl`` ("``priv->hwtstamp_tx_enabled == true``") and the
current skb requires a TX timestamp ("``skb_shinfo(skb)->tx_flags &
SKBTX_HW_TSTAMP``"). If this is true, it sets the
"``skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS``" flag. Note: as
diff --git a/Documentation/networking/tipc.rst b/Documentation/networking/tipc.rst
index 76775f24cdc8..ab63d298cca2 100644
--- a/Documentation/networking/tipc.rst
+++ b/Documentation/networking/tipc.rst
@@ -4,10 +4,125 @@
Linux Kernel TIPC
=================
-TIPC (Transparent Inter Process Communication) is a protocol that is
-specially designed for intra-cluster communication.
+Introduction
+============
-For more information about TIPC, see http://tipc.sourceforge.net.
+TIPC (Transparent Inter Process Communication) is a protocol that is specially
+designed for intra-cluster communication. It can be configured to transmit
+messages either on UDP or directly across Ethernet. Message delivery is
+sequence guaranteed, loss free and flow controlled. Latency times are shorter
+than with any other known protocol, while maximal throughput is comparable to
+that of TCP.
+
+TIPC Features
+-------------
+
+- Cluster wide IPC service
+
+ Have you ever wished you had the convenience of Unix Domain Sockets even when
+ transmitting data between cluster nodes? Where you yourself determine the
+ addresses you want to bind to and use? Where you don't have to perform DNS
+ lookups and worry about IP addresses? Where you don't have to start timers
+ to monitor the continuous existence of peer sockets? And yet without the
+ downsides of that socket type, such as the risk of lingering inodes?
+
+ Welcome to the Transparent Inter Process Communication service, TIPC in short,
+ which gives you all of this, and a lot more.
+
+- Service Addressing
+
+ A fundamental concept in TIPC is that of Service Addressing which makes it
+ possible for a programmer to chose his own address, bind it to a server
+ socket and let client programs use only that address for sending messages.
+
+- Service Tracking
+
+ A client wanting to wait for the availability of a server, uses the Service
+ Tracking mechanism to subscribe for binding and unbinding/close events for
+ sockets with the associated service address.
+
+ The service tracking mechanism can also be used for Cluster Topology Tracking,
+ i.e., subscribing for availability/non-availability of cluster nodes.
+
+ Likewise, the service tracking mechanism can be used for Cluster Connectivity
+ Tracking, i.e., subscribing for up/down events for individual links between
+ cluster nodes.
+
+- Transmission Modes
+
+ Using a service address, a client can send datagram messages to a server socket.
+
+ Using the same address type, it can establish a connection towards an accepting
+ server socket.
+
+ It can also use a service address to create and join a Communication Group,
+ which is the TIPC manifestation of a brokerless message bus.
+
+ Multicast with very good performance and scalability is available both in
+ datagram mode and in communication group mode.
+
+- Inter Node Links
+
+ Communication between any two nodes in a cluster is maintained by one or two
+ Inter Node Links, which both guarantee data traffic integrity and monitor
+ the peer node's availability.
+
+- Cluster Scalability
+
+ By applying the Overlapping Ring Monitoring algorithm on the inter node links
+ it is possible to scale TIPC clusters up to 1000 nodes with a maintained
+ neighbor failure discovery time of 1-2 seconds. For smaller clusters this
+ time can be made much shorter.
+
+- Neighbor Discovery
+
+ Neighbor Node Discovery in the cluster is done by Ethernet broadcast or UDP
+ multicast, when any of those services are available. If not, configured peer
+ IP addresses can be used.
+
+- Configuration
+
+ When running TIPC in single node mode no configuration whatsoever is needed.
+ When running in cluster mode TIPC must as a minimum be given a node address
+ (before Linux 4.17) and told which interface to attach to. The "tipc"
+ configuration tool makes is possible to add and maintain many more
+ configuration parameters.
+
+- Performance
+
+ TIPC message transfer latency times are better than in any other known protocol.
+ Maximal byte throughput for inter-node connections is still somewhat lower than
+ for TCP, while they are superior for intra-node and inter-container throughput
+ on the same host.
+
+- Language Support
+
+ The TIPC user API has support for C, Python, Perl, Ruby, D and Go.
+
+More Information
+----------------
+
+- How to set up TIPC:
+
+ http://tipc.io/getting_started.html
+
+- How to program with TIPC:
+
+ http://tipc.io/programming.html
+
+- How to contribute to TIPC:
+
+- http://tipc.io/contacts.html
+
+- More details about TIPC specification:
+
+ http://tipc.io/protocol.html
+
+
+Implementation
+==============
+
+TIPC is implemented as a kernel module in net/tipc/ directory.
TIPC Base Types
---------------
diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst
index a59d1dd6fdcc..4d7087f727be 100644
--- a/Documentation/networking/tuntap.rst
+++ b/Documentation/networking/tuntap.rst
@@ -107,7 +107,7 @@ Note that the character pointer becomes overwritten with the real device name
*/
ifr.ifr_flags = IFF_TUN;
if( *dev )
- strncpy(ifr.ifr_name, dev, IFNAMSIZ);
+ strscpy_pad(ifr.ifr_name, dev, IFNAMSIZ);
if( (err = ioctl(fd, TUNSETIFF, (void *) &ifr)) < 0 ){
close(fd);
diff --git a/Documentation/networking/vrf.rst b/Documentation/networking/vrf.rst
index 0dde145043bc..0a9a6f968cb9 100644
--- a/Documentation/networking/vrf.rst
+++ b/Documentation/networking/vrf.rst
@@ -144,6 +144,19 @@ default VRF are only handled by a socket not bound to any VRF::
netfilter rules on the VRF device can be used to limit access to services
running in the default VRF context as well.
+Using VRF-aware applications (applications which simultaneously create sockets
+outside and inside VRFs) in conjunction with ``net.ipv4.tcp_l3mdev_accept=1``
+is possible but may lead to problems in some situations. With that sysctl
+value, it is unspecified which listening socket will be selected to handle
+connections for VRF traffic; ie. either a socket bound to the VRF or an unbound
+socket may be used to accept new connections from a VRF. This somewhat
+unexpected behavior can lead to problems if sockets are configured with extra
+options (ex. TCP MD5 keys) with the expectation that VRF traffic will
+exclusively be handled by sockets bound to VRFs, as would be the case with
+``net.ipv4.tcp_l3mdev_accept=0``. Finally and as a reminder, regardless of
+which listening socket is selected, established sockets will be created in the
+VRF based on the ingress interface, as documented earlier.
+
--------------------------------------------------------------------------------
Using iproute2 for VRFs