diff options
author | Álvaro Fernández Rojas <noltari@gmail.com> | 2024-05-10 13:19:19 +0200 |
---|---|---|
committer | Álvaro Fernández Rojas <noltari@gmail.com> | 2024-06-18 18:52:49 +0200 |
commit | 8c405cdcccadc824cfdd189550cb0fcedacd8245 (patch) | |
tree | aa90918a4153c6894c2fd591d8906ea09fca49b7 /target/linux/bcm27xx/patches-6.6/950-0464-drm-v3d-New-debugfs-end-points-to-query-GPU-usage-st.patch | |
parent | e8067fa108c5dff4c85cd6978b3b2cb3a1719641 (diff) | |
download | openwrt-8c405cdcccadc824cfdd189550cb0fcedacd8245.tar.gz openwrt-8c405cdcccadc824cfdd189550cb0fcedacd8245.tar.bz2 openwrt-8c405cdcccadc824cfdd189550cb0fcedacd8245.zip |
bcm27xx: add 6.6 kernel patches
The patches were generated from the RPi repo with the following command:
git format-patch v6.6.34..rpi-6.6.y
Some patches needed rebasing and, as usual, the already-applied and reverted
patches, as well as the wireless driver, GitHub workflow, README and defconfig
patches, were removed.
Signed-off-by: Álvaro Fernández Rojas <noltari@gmail.com>
Diffstat (limited to 'target/linux/bcm27xx/patches-6.6/950-0464-drm-v3d-New-debugfs-end-points-to-query-GPU-usage-st.patch')
-rw-r--r-- | target/linux/bcm27xx/patches-6.6/950-0464-drm-v3d-New-debugfs-end-points-to-query-GPU-usage-st.patch | 493 |
1 files changed, 493 insertions, 0 deletions
diff --git a/target/linux/bcm27xx/patches-6.6/950-0464-drm-v3d-New-debugfs-end-points-to-query-GPU-usage-st.patch b/target/linux/bcm27xx/patches-6.6/950-0464-drm-v3d-New-debugfs-end-points-to-query-GPU-usage-st.patch new file mode 100644 index 0000000000..aef90564b8 --- /dev/null +++ b/target/linux/bcm27xx/patches-6.6/950-0464-drm-v3d-New-debugfs-end-points-to-query-GPU-usage-st.patch @@ -0,0 +1,493 @@ +From f0b0156b38d07a45e1b5309181efc645e6fe8393 Mon Sep 17 00:00:00 2001 +From: Jose Maria Casanova Crespo <jmcasanova@igalia.com> +Date: Tue, 7 Feb 2023 13:54:02 +0100 +Subject: [PATCH 0464/1085] drm/v3d: New debugfs end-points to query GPU usage + stats. + +Two new debugfs interfaces are implemented: + +- gpu_usage: exposes the total runtime since boot of each +of the 5 scheduling queues available at V3D (BIN, RENDER, +CSD, TFU, CACHE_CLEAN). So if the interface is queried at +two different points of time the usage percentage of each +of the queues can be calculated. + +- gpu_pid_usage: exposes the same information but to the +level of detail of each process using the V3D driver. The +runtime for process using the driver is stored. So the +percentages of usage by PID can be calculated with +measures at different timestamps. + +The storage of gpu_pid_usage stats is only done if +the debugfs interface is polled during the last 70 seconds. +If a process does not submit a GPU job during last 70 +seconds its stats will also be purged. 
+ +Signed-off-by: Jose Maria Casanova Crespo <jmcasanova@igalia.com> +--- + drivers/gpu/drm/v3d/v3d_debugfs.c | 79 +++++++++++++++++ + drivers/gpu/drm/v3d/v3d_drv.h | 59 +++++++++++++ + drivers/gpu/drm/v3d/v3d_gem.c | 1 + + drivers/gpu/drm/v3d/v3d_irq.c | 5 ++ + drivers/gpu/drm/v3d/v3d_sched.c | 139 +++++++++++++++++++++++++++++- + 5 files changed, 282 insertions(+), 1 deletion(-) + +--- a/drivers/gpu/drm/v3d/v3d_debugfs.c ++++ b/drivers/gpu/drm/v3d/v3d_debugfs.c +@@ -6,6 +6,7 @@ + #include <linux/debugfs.h> + #include <linux/seq_file.h> + #include <linux/string_helpers.h> ++#include <linux/sched/clock.h> + + #include <drm/drm_debugfs.h> + +@@ -202,6 +203,82 @@ static int v3d_debugfs_bo_stats(struct s + return 0; + } + ++static int v3d_debugfs_gpu_usage(struct seq_file *m, void *unused) ++{ ++ struct drm_debugfs_entry *entry = m->private; ++ struct drm_device *dev = entry->dev; ++ struct v3d_dev *v3d = to_v3d_dev(dev); ++ struct v3d_queue_stats *queue_stats; ++ enum v3d_queue queue; ++ u64 timestamp = local_clock(); ++ u64 active_runtime; ++ ++ seq_printf(m, "timestamp;%llu;\n", local_clock()); ++ seq_printf(m, "\"QUEUE\";\"JOBS\";\"RUNTIME\";\"ACTIVE\";\n"); ++ for (queue = 0; queue < V3D_MAX_QUEUES; queue++) { ++ if (!v3d->queue[queue].sched.ready) ++ continue; ++ ++ queue_stats = &v3d->gpu_queue_stats[queue]; ++ mutex_lock(&queue_stats->lock); ++ v3d_sched_stats_update(queue_stats); ++ if (queue_stats->last_pid) ++ active_runtime = timestamp - queue_stats->last_exec_start; ++ else ++ active_runtime = 0; ++ ++ seq_printf(m, "%s;%d;%llu;%c;\n", ++ v3d_queue_to_string(queue), ++ queue_stats->jobs_sent, ++ queue_stats->runtime + active_runtime, ++ queue_stats->last_pid?'1':'0'); ++ mutex_unlock(&queue_stats->lock); ++ } ++ ++ return 0; ++} ++ ++static int v3d_debugfs_gpu_pid_usage(struct seq_file *m, void *unused) ++{ ++ struct drm_debugfs_entry *entry = m->private; ++ struct drm_device *dev = entry->dev; ++ struct v3d_dev *v3d = to_v3d_dev(dev); ++ struct 
v3d_queue_stats *queue_stats; ++ struct v3d_queue_pid_stats *cur; ++ enum v3d_queue queue; ++ u64 active_runtime; ++ u64 timestamp = local_clock(); ++ ++ seq_printf(m, "timestamp;%llu;\n", timestamp); ++ seq_printf(m, "\"QUEUE\";\"PID\",\"JOBS\";\"RUNTIME\";\"ACTIVE\";\n"); ++ for (queue = 0; queue < V3D_MAX_QUEUES; queue++) { ++ ++ if (!v3d->queue[queue].sched.ready) ++ continue; ++ ++ queue_stats = &v3d->gpu_queue_stats[queue]; ++ mutex_lock(&queue_stats->lock); ++ queue_stats->gpu_pid_stats_timeout = jiffies + V3D_QUEUE_STATS_TIMEOUT; ++ v3d_sched_stats_update(queue_stats); ++ list_for_each_entry(cur, &queue_stats->pid_stats_list, list) { ++ ++ if (cur->pid == queue_stats->last_pid) ++ active_runtime = timestamp - queue_stats->last_exec_start; ++ else ++ active_runtime = 0; ++ ++ seq_printf(m, "%s;%d;%d;%llu;%c;\n", ++ v3d_queue_to_string(queue), ++ cur->pid, cur->jobs_sent, ++ cur->runtime + active_runtime, ++ cur->pid == queue_stats->last_pid ? '1' : '0'); ++ } ++ mutex_unlock(&queue_stats->lock); ++ } ++ ++ return 0; ++} ++ + static int v3d_measure_clock(struct seq_file *m, void *unused) + { + struct drm_debugfs_entry *entry = m->private; +@@ -241,6 +318,8 @@ static const struct drm_debugfs_info v3d + {"v3d_regs", v3d_v3d_debugfs_regs, 0}, + {"measure_clock", v3d_measure_clock, 0}, + {"bo_stats", v3d_debugfs_bo_stats, 0}, ++ {"gpu_usage", v3d_debugfs_gpu_usage, 0}, ++ {"gpu_pid_usage", v3d_debugfs_gpu_pid_usage, 0}, + }; + + void +--- a/drivers/gpu/drm/v3d/v3d_drv.h ++++ b/drivers/gpu/drm/v3d/v3d_drv.h +@@ -21,6 +21,19 @@ struct reset_control; + + #define V3D_MAX_QUEUES (V3D_CACHE_CLEAN + 1) + ++static inline char * ++v3d_queue_to_string(enum v3d_queue queue) ++{ ++ switch (queue) { ++ case V3D_BIN: return "v3d_bin"; ++ case V3D_RENDER: return "v3d_render"; ++ case V3D_TFU: return "v3d_tfu"; ++ case V3D_CSD: return "v3d_csd"; ++ case V3D_CACHE_CLEAN: return "v3d_cache_clean"; ++ } ++ return "UNKNOWN"; ++} ++ + struct v3d_queue_state { + struct 
drm_gpu_scheduler sched; + +@@ -28,6 +41,44 @@ struct v3d_queue_state { + u64 emit_seqno; + }; + ++struct v3d_queue_pid_stats { ++ struct list_head list; ++ u64 runtime; ++ /* Time in jiffes.to purge the stats of this process. Every time a ++ * process sends a new job to the queue, this timeout is delayed by ++ * V3D_QUEUE_STATS_TIMEOUT while the gpu_pid_stats_timeout of the ++ * queue is not reached. ++ */ ++ unsigned long timeout_purge; ++ u32 jobs_sent; ++ pid_t pid; ++}; ++ ++struct v3d_queue_stats { ++ struct mutex lock; ++ u64 last_exec_start; ++ u64 last_exec_end; ++ u64 runtime; ++ u32 jobs_sent; ++ /* Time in jiffes to stop collecting gpu stats by process. This is ++ * increased by every access to*the debugfs interface gpu_pid_usage. ++ * If the debugfs is not used stats are not collected. ++ */ ++ unsigned long gpu_pid_stats_timeout; ++ pid_t last_pid; ++ struct list_head pid_stats_list; ++}; ++ ++/* pid_stats by process (v3d_queue_pid_stats) are recorded if there is an ++ * access to the gpu_pid_usageare debugfs interface for the last ++ * V3D_QUEUE_STATS_TIMEOUT (70s). ++ * ++ * The same timeout is used to purge the stats by process for those process ++ * that have not sent jobs this period. ++ */ ++#define V3D_QUEUE_STATS_TIMEOUT (70 * HZ) ++ ++ + /* Performance monitor object. The perform lifetime is controlled by userspace + * using perfmon related ioctls. A perfmon can be attached to a submit_cl + * request, and when this is the case, HW perf counters will be activated just +@@ -147,6 +198,8 @@ struct v3d_dev { + u32 num_allocated; + u32 pages_allocated; + } bo_stats; ++ ++ struct v3d_queue_stats gpu_queue_stats[V3D_MAX_QUEUES]; + }; + + static inline struct v3d_dev * +@@ -244,6 +297,11 @@ struct v3d_job { + */ + struct v3d_perfmon *perfmon; + ++ /* PID of the process that submitted the job that could be used to ++ * for collecting stats by process of gpu usage. 
++ */ ++ pid_t client_pid; ++ + /* Callback for the freeing of the job on refcount going to 0. */ + void (*free)(struct kref *ref); + }; +@@ -408,6 +466,7 @@ void v3d_mmu_remove_ptes(struct v3d_bo * + /* v3d_sched.c */ + int v3d_sched_init(struct v3d_dev *v3d); + void v3d_sched_fini(struct v3d_dev *v3d); ++void v3d_sched_stats_update(struct v3d_queue_stats *queue_stats); + + /* v3d_perfmon.c */ + void v3d_perfmon_get(struct v3d_perfmon *perfmon); +--- a/drivers/gpu/drm/v3d/v3d_gem.c ++++ b/drivers/gpu/drm/v3d/v3d_gem.c +@@ -460,6 +460,7 @@ v3d_job_init(struct v3d_dev *v3d, struct + job = *container; + job->v3d = v3d; + job->free = free; ++ job->client_pid = current->pid; + + ret = drm_sched_job_init(&job->base, &v3d_priv->sched_entity[queue], + v3d_priv); +--- a/drivers/gpu/drm/v3d/v3d_irq.c ++++ b/drivers/gpu/drm/v3d/v3d_irq.c +@@ -14,6 +14,7 @@ + */ + + #include <linux/platform_device.h> ++#include <linux/sched/clock.h> + + #include "v3d_drv.h" + #include "v3d_regs.h" +@@ -100,6 +101,7 @@ v3d_irq(int irq, void *arg) + if (intsts & V3D_INT_FLDONE) { + struct v3d_fence *fence = + to_v3d_fence(v3d->bin_job->base.irq_fence); ++ v3d->gpu_queue_stats[V3D_BIN].last_exec_end = local_clock(); + + trace_v3d_bcl_irq(&v3d->drm, fence->seqno); + dma_fence_signal(&fence->base); +@@ -109,6 +111,7 @@ v3d_irq(int irq, void *arg) + if (intsts & V3D_INT_FRDONE) { + struct v3d_fence *fence = + to_v3d_fence(v3d->render_job->base.irq_fence); ++ v3d->gpu_queue_stats[V3D_RENDER].last_exec_end = local_clock(); + + trace_v3d_rcl_irq(&v3d->drm, fence->seqno); + dma_fence_signal(&fence->base); +@@ -118,6 +121,7 @@ v3d_irq(int irq, void *arg) + if (intsts & V3D_INT_CSDDONE) { + struct v3d_fence *fence = + to_v3d_fence(v3d->csd_job->base.irq_fence); ++ v3d->gpu_queue_stats[V3D_CSD].last_exec_end = local_clock(); + + trace_v3d_csd_irq(&v3d->drm, fence->seqno); + dma_fence_signal(&fence->base); +@@ -154,6 +158,7 @@ v3d_hub_irq(int irq, void *arg) + if (intsts & V3D_HUB_INT_TFUC) { + struct 
v3d_fence *fence = + to_v3d_fence(v3d->tfu_job->base.irq_fence); ++ v3d->gpu_queue_stats[V3D_TFU].last_exec_end = local_clock(); + + trace_v3d_tfu_irq(&v3d->drm, fence->seqno); + dma_fence_signal(&fence->base); +--- a/drivers/gpu/drm/v3d/v3d_sched.c ++++ b/drivers/gpu/drm/v3d/v3d_sched.c +@@ -19,6 +19,7 @@ + */ + + #include <linux/kthread.h> ++#include <linux/sched/clock.h> + + #include "v3d_drv.h" + #include "v3d_regs.h" +@@ -72,6 +73,114 @@ v3d_switch_perfmon(struct v3d_dev *v3d, + v3d_perfmon_start(v3d, job->perfmon); + } + ++/* ++ * Updates the scheduling stats of the gpu queues runtime for completed jobs. ++ * ++ * It should be called before any new job submission to the queue or before ++ * accessing the stats from the debugfs interface. ++ * ++ * It is expected that calls to this function are done with queue_stats->lock ++ * locked. ++ */ ++void ++v3d_sched_stats_update(struct v3d_queue_stats *queue_stats) ++{ ++ struct list_head *pid_stats_list = &queue_stats->pid_stats_list; ++ struct v3d_queue_pid_stats *cur, *tmp; ++ u64 runtime = 0; ++ bool store_pid_stats = ++ time_is_after_jiffies(queue_stats->gpu_pid_stats_timeout); ++ ++ /* If debugfs stats gpu_pid_usage has not been polled for a period, ++ * the pid stats collection is stopped and we purge any existing ++ * pid_stats. ++ * ++ * pid_stats are also purged for clients that have reached the ++ * timeout_purge because the process probably does not exist anymore. ++ */ ++ list_for_each_entry_safe_reverse(cur, tmp, pid_stats_list, list) { ++ if (!store_pid_stats || time_is_before_jiffies(cur->timeout_purge)) { ++ list_del(&cur->list); ++ kfree(cur); ++ } else { ++ break; ++ } ++ } ++ /* If a job has finished its stats are updated. 
*/ ++ if (queue_stats->last_pid && queue_stats->last_exec_end) { ++ runtime = queue_stats->last_exec_end - ++ queue_stats->last_exec_start; ++ queue_stats->runtime += runtime; ++ ++ if (store_pid_stats) { ++ struct v3d_queue_pid_stats *pid_stats; ++ /* Last job info is always at the head of the list */ ++ pid_stats = list_first_entry_or_null(pid_stats_list, ++ struct v3d_queue_pid_stats, list); ++ if (pid_stats && ++ pid_stats->pid == queue_stats->last_pid) { ++ pid_stats->runtime += runtime; ++ } ++ } ++ queue_stats->last_pid = 0; ++ } ++} ++ ++/* ++ * Updates the queue usage adding the information of a new job that is ++ * about to be sent to the GPU to be executed. ++ */ ++int ++v3d_sched_stats_add_job(struct v3d_queue_stats *queue_stats, ++ struct drm_sched_job *sched_job) ++{ ++ ++ struct v3d_queue_pid_stats *pid_stats = NULL; ++ struct v3d_job *job = sched_job?to_v3d_job(sched_job):NULL; ++ struct v3d_queue_pid_stats *cur; ++ struct list_head *pid_stats_list = &queue_stats->pid_stats_list; ++ int ret = 0; ++ ++ mutex_lock(&queue_stats->lock); ++ ++ /* Completion of previous job requires an update of its runtime stats */ ++ v3d_sched_stats_update(queue_stats); ++ ++ queue_stats->last_exec_start = local_clock(); ++ queue_stats->last_exec_end = 0; ++ queue_stats->jobs_sent++; ++ queue_stats->last_pid = job->client_pid; ++ ++ /* gpu usage stats by process are being collected */ ++ if (time_is_after_jiffies(queue_stats->gpu_pid_stats_timeout)) { ++ list_for_each_entry(cur, pid_stats_list, list) { ++ if (cur->pid == job->client_pid) { ++ pid_stats = cur; ++ break; ++ } ++ } ++ /* pid_stats of this client is moved to the head of the list. 
*/ ++ if (pid_stats) { ++ list_move(&pid_stats->list, pid_stats_list); ++ } else { ++ pid_stats = kzalloc(sizeof(struct v3d_queue_pid_stats), ++ GFP_KERNEL); ++ if (!pid_stats) { ++ ret = -ENOMEM; ++ goto err_mem; ++ } ++ pid_stats->pid = job->client_pid; ++ list_add(&pid_stats->list, pid_stats_list); ++ } ++ pid_stats->jobs_sent++; ++ pid_stats->timeout_purge = jiffies + V3D_QUEUE_STATS_TIMEOUT; ++ } ++ ++err_mem: ++ mutex_unlock(&queue_stats->lock); ++ return ret; ++} ++ + static struct dma_fence *v3d_bin_job_run(struct drm_sched_job *sched_job) + { + struct v3d_bin_job *job = to_bin_job(sched_job); +@@ -107,6 +216,7 @@ static struct dma_fence *v3d_bin_job_run + trace_v3d_submit_cl(dev, false, to_v3d_fence(fence)->seqno, + job->start, job->end); + ++ v3d_sched_stats_add_job(&v3d->gpu_queue_stats[V3D_BIN], sched_job); + v3d_switch_perfmon(v3d, &job->base); + + /* Set the current and end address of the control list. +@@ -158,6 +268,7 @@ static struct dma_fence *v3d_render_job_ + trace_v3d_submit_cl(dev, true, to_v3d_fence(fence)->seqno, + job->start, job->end); + ++ v3d_sched_stats_add_job(&v3d->gpu_queue_stats[V3D_RENDER], sched_job); + v3d_switch_perfmon(v3d, &job->base); + + /* XXX: Set the QCFG */ +@@ -190,6 +301,7 @@ v3d_tfu_job_run(struct drm_sched_job *sc + + trace_v3d_submit_tfu(dev, to_v3d_fence(fence)->seqno); + ++ v3d_sched_stats_add_job(&v3d->gpu_queue_stats[V3D_TFU], sched_job); + V3D_WRITE(V3D_TFU_IIA, job->args.iia); + V3D_WRITE(V3D_TFU_IIS, job->args.iis); + V3D_WRITE(V3D_TFU_ICA, job->args.ica); +@@ -231,6 +343,7 @@ v3d_csd_job_run(struct drm_sched_job *sc + + trace_v3d_submit_csd(dev, to_v3d_fence(fence)->seqno); + ++ v3d_sched_stats_add_job(&v3d->gpu_queue_stats[V3D_CSD], sched_job); + v3d_switch_perfmon(v3d, &job->base); + + for (i = 1; i <= 6; i++) +@@ -247,7 +360,10 @@ v3d_cache_clean_job_run(struct drm_sched + struct v3d_job *job = to_v3d_job(sched_job); + struct v3d_dev *v3d = job->v3d; + ++ 
v3d_sched_stats_add_job(&v3d->gpu_queue_stats[V3D_CACHE_CLEAN], ++ sched_job); + v3d_clean_caches(v3d); ++ v3d->gpu_queue_stats[V3D_CACHE_CLEAN].last_exec_end = local_clock(); + + return NULL; + } +@@ -385,8 +501,18 @@ v3d_sched_init(struct v3d_dev *v3d) + int hw_jobs_limit = 1; + int job_hang_limit = 0; + int hang_limit_ms = 500; ++ enum v3d_queue q; + int ret; + ++ for (q = 0; q < V3D_MAX_QUEUES; q++) { ++ INIT_LIST_HEAD(&v3d->gpu_queue_stats[q].pid_stats_list); ++ /* Setting timeout before current jiffies disables collecting ++ * pid_stats on scheduling init. ++ */ ++ v3d->gpu_queue_stats[q].gpu_pid_stats_timeout = jiffies - 1; ++ mutex_init(&v3d->gpu_queue_stats[q].lock); ++ } ++ + ret = drm_sched_init(&v3d->queue[V3D_BIN].sched, + &v3d_bin_sched_ops, + hw_jobs_limit, job_hang_limit, +@@ -440,9 +566,20 @@ void + v3d_sched_fini(struct v3d_dev *v3d) + { + enum v3d_queue q; ++ struct v3d_queue_stats *queue_stats; + + for (q = 0; q < V3D_MAX_QUEUES; q++) { +- if (v3d->queue[q].sched.ready) ++ if (v3d->queue[q].sched.ready) { ++ queue_stats = &v3d->gpu_queue_stats[q]; ++ mutex_lock(&queue_stats->lock); ++ /* Setting gpu_pid_stats_timeout to jiffies-1 will ++ * make v3d_sched_stats_update to purge all ++ * allocated pid_stats. ++ */ ++ queue_stats->gpu_pid_stats_timeout = jiffies - 1; ++ v3d_sched_stats_update(queue_stats); ++ mutex_unlock(&queue_stats->lock); + drm_sched_fini(&v3d->queue[q].sched); ++ } + } + } |