summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/networking/devlink/mlx5.rst5
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/health.h1
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c22
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c65
5 files changed, 94 insertions, 1 deletions
diff --git a/Documentation/networking/devlink/mlx5.rst b/Documentation/networking/devlink/mlx5.rst
index 196a4bb28df1..702f204a3dbd 100644
--- a/Documentation/networking/devlink/mlx5.rst
+++ b/Documentation/networking/devlink/mlx5.rst
@@ -135,7 +135,7 @@ Health reporters
tx reporter
-----------
-The tx reporter is responsible for reporting and recovering of the following two error scenarios:
+The tx reporter is responsible for reporting and recovering of the following three error scenarios:
- tx timeout
Report on kernel tx timeout detection.
@@ -143,6 +143,9 @@ The tx reporter is responsible for reporting and recovering of the following two
- tx error completion
Report on error tx completion.
Recover by flushing the tx queue and reset it.
+- tx PTP port timestamping CQ unhealthy
+ Report too many CQEs never delivered on port ts CQ.
+ Recover by flushing and re-creating all PTP channels.
tx reporter also support on demand diagnose callback, on which it provides
real time information of its send queues status.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/health.h b/drivers/net/ethernet/mellanox/mlx5/core/en/health.h
index 0107e4e73bb0..415840c3ef84 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/health.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/health.h
@@ -18,6 +18,7 @@ void mlx5e_reporter_tx_create(struct mlx5e_priv *priv);
void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv);
void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq);
int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq);
+void mlx5e_reporter_tx_ptpsq_unhealthy(struct mlx5e_ptpsq *ptpsq);
int mlx5e_health_cq_diag_fmsg(struct mlx5e_cq *cq, struct devlink_fmsg *fmsg);
int mlx5e_health_cq_common_diag_fmsg(struct mlx5e_cq *cq, struct devlink_fmsg *fmsg);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
index 8680d21f3e7b..bb11e644d24f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
@@ -2,6 +2,7 @@
// Copyright (c) 2020 Mellanox Technologies
#include "en/ptp.h"
+#include "en/health.h"
#include "en/txrx.h"
#include "en/params.h"
#include "en/fs_tt_redirect.h"
@@ -140,6 +141,12 @@ mlx5e_ptp_metadata_map_remove(struct mlx5e_ptp_metadata_map *map, u16 metadata)
return skb;
}
+static bool mlx5e_ptp_metadata_map_unhealthy(struct mlx5e_ptp_metadata_map *map)
+{
+ /* Considered beginning unhealthy state if size * 15 / 2^4 cannot be reclaimed. */
+ return map->undelivered_counter > (map->capacity >> 4) * 15;
+}
+
static void mlx5e_ptpsq_mark_ts_cqes_undelivered(struct mlx5e_ptpsq *ptpsq,
ktime_t port_tstamp)
{
@@ -205,6 +212,9 @@ static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq,
out:
napi_consume_skb(skb, budget);
mlx5e_ptp_metadata_fifo_push(&ptpsq->metadata_freelist, metadata_id);
+ if (unlikely(mlx5e_ptp_metadata_map_unhealthy(&ptpsq->metadata_map)) &&
+ !test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
+ queue_work(ptpsq->txqsq.priv->wq, &ptpsq->report_unhealthy_work);
}
static bool mlx5e_ptp_poll_ts_cq(struct mlx5e_cq *cq, int budget)
@@ -422,6 +432,14 @@ static void mlx5e_ptp_free_traffic_db(struct mlx5e_ptpsq *ptpsq)
kvfree(ptpsq->ts_cqe_pending_list);
}
+static void mlx5e_ptpsq_unhealthy_work(struct work_struct *work)
+{
+ struct mlx5e_ptpsq *ptpsq =
+ container_of(work, struct mlx5e_ptpsq, report_unhealthy_work);
+
+ mlx5e_reporter_tx_ptpsq_unhealthy(ptpsq);
+}
+
static int mlx5e_ptp_open_txqsq(struct mlx5e_ptp *c, u32 tisn,
int txq_ix, struct mlx5e_ptp_params *cparams,
int tc, struct mlx5e_ptpsq *ptpsq)
@@ -451,6 +469,8 @@ static int mlx5e_ptp_open_txqsq(struct mlx5e_ptp *c, u32 tisn,
if (err)
goto err_free_txqsq;
+ INIT_WORK(&ptpsq->report_unhealthy_work, mlx5e_ptpsq_unhealthy_work);
+
return 0;
err_free_txqsq:
@@ -464,6 +484,8 @@ static void mlx5e_ptp_close_txqsq(struct mlx5e_ptpsq *ptpsq)
struct mlx5e_txqsq *sq = &ptpsq->txqsq;
struct mlx5_core_dev *mdev = sq->mdev;
+ if (current_work() != &ptpsq->report_unhealthy_work)
+ cancel_work_sync(&ptpsq->report_unhealthy_work);
mlx5e_ptp_free_traffic_db(ptpsq);
cancel_work_sync(&sq->recover_work);
mlx5e_ptp_destroy_sq(mdev, sq->sqn);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h
index 7c5597d4589d..7b700d0f956a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h
@@ -10,6 +10,7 @@
#include <linux/ktime.h>
#include <linux/ptp_classify.h>
#include <linux/time64.h>
+#include <linux/workqueue.h>
#define MLX5E_PTP_CHANNEL_IX 0
#define MLX5E_PTP_MAX_LOG_SQ_SIZE (8U)
@@ -34,6 +35,7 @@ struct mlx5e_ptpsq {
struct mlx5e_ptp_cq_stats *cq_stats;
u16 ts_cqe_ctr_mask;
+ struct work_struct report_unhealthy_work;
struct mlx5e_ptp_port_ts_cqe_list *ts_cqe_pending_list;
struct mlx5e_ptp_metadata_fifo metadata_freelist;
struct mlx5e_ptp_metadata_map metadata_map;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
index b35ff289af49..ff8242f67c54 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
@@ -164,6 +164,43 @@ static int mlx5e_tx_reporter_timeout_recover(void *ctx)
return err;
}
+static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx)
+{
+ struct mlx5e_ptpsq *ptpsq = ctx;
+ struct mlx5e_channels *chs;
+ struct net_device *netdev;
+ struct mlx5e_priv *priv;
+ int carrier_ok;
+ int err;
+
+ if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &ptpsq->txqsq.state))
+ return 0;
+
+ priv = ptpsq->txqsq.priv;
+
+ mutex_lock(&priv->state_lock);
+ chs = &priv->channels;
+ netdev = priv->netdev;
+
+ carrier_ok = netif_carrier_ok(netdev);
+ netif_carrier_off(netdev);
+
+ mlx5e_deactivate_priv_channels(priv);
+
+ mlx5e_ptp_close(chs->ptp);
+ err = mlx5e_ptp_open(priv, &chs->params, chs->c[0]->lag_port, &chs->ptp);
+
+ mlx5e_activate_priv_channels(priv);
+
+ /* return carrier back if needed */
+ if (carrier_ok)
+ netif_carrier_on(netdev);
+
+ mutex_unlock(&priv->state_lock);
+
+ return err;
+}
+
/* state lock cannot be grabbed within this function.
* It can cause a dead lock or a read-after-free.
*/
@@ -516,6 +553,15 @@ static int mlx5e_tx_reporter_timeout_dump(struct mlx5e_priv *priv, struct devlin
return mlx5e_tx_reporter_dump_sq(priv, fmsg, to_ctx->sq);
}
+static int mlx5e_tx_reporter_ptpsq_unhealthy_dump(struct mlx5e_priv *priv,
+ struct devlink_fmsg *fmsg,
+ void *ctx)
+{
+ struct mlx5e_ptpsq *ptpsq = ctx;
+
+ return mlx5e_tx_reporter_dump_sq(priv, fmsg, &ptpsq->txqsq);
+}
+
static int mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv *priv,
struct devlink_fmsg *fmsg)
{
@@ -621,6 +667,25 @@ int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq)
return to_ctx.status;
}
+void mlx5e_reporter_tx_ptpsq_unhealthy(struct mlx5e_ptpsq *ptpsq)
+{
+ struct mlx5e_ptp_metadata_map *map = &ptpsq->metadata_map;
+ char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
+ struct mlx5e_txqsq *txqsq = &ptpsq->txqsq;
+ struct mlx5e_cq *ts_cq = &ptpsq->ts_cq;
+ struct mlx5e_priv *priv = txqsq->priv;
+ struct mlx5e_err_ctx err_ctx = {};
+
+ err_ctx.ctx = ptpsq;
+ err_ctx.recover = mlx5e_tx_reporter_ptpsq_unhealthy_recover;
+ err_ctx.dump = mlx5e_tx_reporter_ptpsq_unhealthy_dump;
+ snprintf(err_str, sizeof(err_str),
+ "Unhealthy TX port TS queue: %d, SQ: 0x%x, CQ: 0x%x, Undelivered CQEs: %u Map Capacity: %u",
+ txqsq->ch_ix, txqsq->sqn, ts_cq->mcq.cqn, map->undelivered_counter, map->capacity);
+
+ mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
+}
+
static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
.name = "tx",
.recover = mlx5e_tx_reporter_recover,