From f0b0156b38d07a45e1b5309181efc645e6fe8393 Mon Sep 17 00:00:00 2001
From: Jose Maria Casanova Crespo <jmcasanova@igalia.com>
Date: Tue, 7 Feb 2023 13:54:02 +0100
Subject: [PATCH 0464/1085] drm/v3d: New debugfs end-points to query GPU usage
 stats.

Two new debugfs interfaces are implemented:

- gpu_usage: exposes the total runtime since boot of each
of the 5 scheduling queues available in V3D (BIN, RENDER,
CSD, TFU, CACHE_CLEAN). If the interface is queried at two
different points in time, the usage percentage of each
queue can be calculated (see the parsing sketch after this
list).

- gpu_pid_usage: exposes the same information, broken down
per process using the V3D driver. The runtime of every
process that uses the driver is stored, so the usage
percentage per PID can be calculated from measurements
taken at different timestamps.
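
As a reference, here is a minimal userspace sketch (not part
of this patch) that samples gpu_usage twice and derives the
utilization of each queue. The debugfs path and the one-second
interval are assumptions; the parsing follows the
semicolon-separated format emitted by the new debugfs entry,
where timestamps and runtimes are both local_clock() values in
nanoseconds, so the ratio of the deltas gives the utilization.

#include <stdio.h>
#include <string.h>
#include <inttypes.h>
#include <unistd.h>

#define GPU_USAGE "/sys/kernel/debug/dri/0/gpu_usage"	/* assumed path */

struct sample {
	uint64_t ts;
	char name[8][32];
	uint64_t runtime[8];
	int n;
};

static int read_sample(struct sample *s)
{
	char line[256], name[32], active;
	uint64_t runtime;
	int jobs;
	FILE *f = fopen(GPU_USAGE, "r");

	if (!f)
		return -1;
	s->n = 0;
	while (fgets(line, sizeof(line), f)) {
		/* "timestamp;<ns>;" header line */
		if (sscanf(line, "timestamp;%" SCNu64 ";", &s->ts) == 1)
			continue;
		/* "<queue>;<jobs>;<runtime>;<active>;" data lines */
		if (sscanf(line, "%31[^;];%d;%" SCNu64 ";%c;",
			   name, &jobs, &runtime, &active) == 4 && s->n < 8) {
			strcpy(s->name[s->n], name);
			s->runtime[s->n++] = runtime;
		}
	}
	fclose(f);
	return 0;
}

int main(void)
{
	struct sample a, b;

	if (read_sample(&a))
		return 1;
	sleep(1);
	if (read_sample(&b))
		return 1;
	for (int i = 0; i < b.n; i++)
		printf("%s: %.1f%%\n", b.name[i],
		       100.0 * (b.runtime[i] - a.runtime[i]) / (b.ts - a.ts));
	return 0;
}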

Per-process stats are only stored if the gpu_pid_usage
debugfs interface has been polled within the last 70
seconds. If a process has not submitted any GPU job within
the last 70 seconds, its stats are purged as well (a
minimal polling sketch follows).
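
A monitor therefore only needs to re-read gpu_pid_usage more
often than every 70 seconds to keep per-PID accounting
enabled. The sketch below (again not part of this patch)
illustrates this; the debugfs path and the 10-second period
are assumptions.

#include <stdio.h>
#include <unistd.h>

#define GPU_PID_USAGE "/sys/kernel/debug/dri/0/gpu_pid_usage"	/* assumed */

int main(void)
{
	char line[256];

	for (;;) {
		FILE *f = fopen(GPU_PID_USAGE, "r");

		if (!f)
			return 1;
		/* Each read refreshes gpu_pid_stats_timeout in the driver. */
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
		sleep(10);	/* well below the 70 s collection window */
	}
	return 0;
}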

Signed-off-by: Jose Maria Casanova Crespo <jmcasanova@igalia.com>
---
 drivers/gpu/drm/v3d/v3d_debugfs.c |  79 +++++++++++++++++
 drivers/gpu/drm/v3d/v3d_drv.h     |  59 +++++++++++++
 drivers/gpu/drm/v3d/v3d_gem.c     |   1 +
 drivers/gpu/drm/v3d/v3d_irq.c     |   5 ++
 drivers/gpu/drm/v3d/v3d_sched.c   | 139 +++++++++++++++++++++++++++++-
 5 files changed, 282 insertions(+), 1 deletion(-)

--- a/drivers/gpu/drm/v3d/v3d_debugfs.c
+++ b/drivers/gpu/drm/v3d/v3d_debugfs.c
@@ -6,6 +6,7 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/string_helpers.h>
+#include <linux/sched/clock.h>
 
 #include <drm/drm_debugfs.h>
 
@@ -202,6 +203,82 @@ static int v3d_debugfs_bo_stats(struct s
 	return 0;
 }
 
+static int v3d_debugfs_gpu_usage(struct seq_file *m, void *unused)
+{
+	struct drm_debugfs_entry *entry = m->private;
+	struct drm_device *dev = entry->dev;
+	struct v3d_dev *v3d = to_v3d_dev(dev);
+	struct v3d_queue_stats *queue_stats;
+	enum v3d_queue queue;
+	u64 timestamp = local_clock();
+	u64 active_runtime;
+
+	seq_printf(m, "timestamp;%llu;\n", timestamp);
+	seq_printf(m, "\"QUEUE\";\"JOBS\";\"RUNTIME\";\"ACTIVE\";\n");
+	for (queue = 0; queue < V3D_MAX_QUEUES; queue++) {
+		if (!v3d->queue[queue].sched.ready)
+			continue;
+
+		queue_stats = &v3d->gpu_queue_stats[queue];
+		mutex_lock(&queue_stats->lock);
+		v3d_sched_stats_update(queue_stats);
+		if (queue_stats->last_pid)
+			active_runtime = timestamp - queue_stats->last_exec_start;
+		else
+			active_runtime = 0;
+
+		seq_printf(m, "%s;%d;%llu;%c;\n",
+			   v3d_queue_to_string(queue),
+			   queue_stats->jobs_sent,
+			   queue_stats->runtime + active_runtime,
+			   queue_stats->last_pid ? '1' : '0');
+		mutex_unlock(&queue_stats->lock);
+	}
+
+	return 0;
+}
+
+static int v3d_debugfs_gpu_pid_usage(struct seq_file *m, void *unused)
+{
+	struct drm_debugfs_entry *entry = m->private;
+	struct drm_device *dev = entry->dev;
+	struct v3d_dev *v3d = to_v3d_dev(dev);
+	struct v3d_queue_stats *queue_stats;
+	struct v3d_queue_pid_stats *cur;
+	enum v3d_queue queue;
+	u64 active_runtime;
+	u64 timestamp = local_clock();
+
+	seq_printf(m, "timestamp;%llu;\n", timestamp);
+	seq_printf(m, "\"QUEUE\";\"PID\";\"JOBS\";\"RUNTIME\";\"ACTIVE\";\n");
+	for (queue = 0; queue < V3D_MAX_QUEUES; queue++) {
+
+		if (!v3d->queue[queue].sched.ready)
+			continue;
+
+		queue_stats = &v3d->gpu_queue_stats[queue];
+		mutex_lock(&queue_stats->lock);
+		queue_stats->gpu_pid_stats_timeout = jiffies + V3D_QUEUE_STATS_TIMEOUT;
+		v3d_sched_stats_update(queue_stats);
+		list_for_each_entry(cur, &queue_stats->pid_stats_list, list) {
+
+			if (cur->pid == queue_stats->last_pid)
+				active_runtime = timestamp - queue_stats->last_exec_start;
+			else
+				active_runtime = 0;
+
+			seq_printf(m, "%s;%d;%d;%llu;%c;\n",
+				   v3d_queue_to_string(queue),
+				   cur->pid, cur->jobs_sent,
+				   cur->runtime + active_runtime,
+				   cur->pid == queue_stats->last_pid ? '1' : '0');
+		}
+		mutex_unlock(&queue_stats->lock);
+	}
+
+	return 0;
+}
+
 static int v3d_measure_clock(struct seq_file *m, void *unused)
 {
 	struct drm_debugfs_entry *entry = m->private;
@@ -241,6 +318,8 @@ static const struct drm_debugfs_info v3d
 	{"v3d_regs", v3d_v3d_debugfs_regs, 0},
 	{"measure_clock", v3d_measure_clock, 0},
 	{"bo_stats", v3d_debugfs_bo_stats, 0},
+	{"gpu_usage", v3d_debugfs_gpu_usage, 0},
+	{"gpu_pid_usage", v3d_debugfs_gpu_pid_usage, 0},
 };
 
 void
--- a/drivers/gpu/drm/v3d/v3d_drv.h
+++ b/drivers/gpu/drm/v3d/v3d_drv.h
@@ -21,6 +21,19 @@ struct reset_control;
 
 #define V3D_MAX_QUEUES (V3D_CACHE_CLEAN + 1)
 
+static inline char *
+v3d_queue_to_string(enum v3d_queue queue)
+{
+	switch (queue) {
+	case V3D_BIN: return "v3d_bin";
+	case V3D_RENDER: return "v3d_render";
+	case V3D_TFU: return "v3d_tfu";
+	case V3D_CSD: return "v3d_csd";
+	case V3D_CACHE_CLEAN: return "v3d_cache_clean";
+	}
+	return "UNKNOWN";
+}
+
 struct v3d_queue_state {
 	struct drm_gpu_scheduler sched;
 
@@ -28,6 +41,44 @@ struct v3d_queue_state {
 	u64 emit_seqno;
 };
 
+struct v3d_queue_pid_stats {
+	struct	list_head list;
+	u64	runtime;
+	/* Time in jiffies to purge the stats of this process. Every time a
+	 * process sends a new job to the queue, this timeout is extended by
+	 * V3D_QUEUE_STATS_TIMEOUT, as long as the gpu_pid_stats_timeout of
+	 * the queue has not been reached.
+	 */
+	unsigned long timeout_purge;
+	u32	jobs_sent;
+	pid_t	pid;
+};
+
+struct v3d_queue_stats {
+	struct mutex lock;
+	u64	last_exec_start;
+	u64	last_exec_end;
+	u64	runtime;
+	u32	jobs_sent;
+	/* Time in jiffies to stop collecting per-process gpu stats. This is
+	 * extended by every access to the debugfs interface gpu_pid_usage.
+	 * If the debugfs interface is not used, stats are not collected.
+	 */
+	unsigned long gpu_pid_stats_timeout;
+	pid_t	last_pid;
+	struct list_head pid_stats_list;
+};
+
+/* pid_stats by process (v3d_queue_pid_stats) are recorded if there has been
+ * an access to the gpu_pid_usage debugfs interface within the last
+ * V3D_QUEUE_STATS_TIMEOUT (70s).
+ *
+ * The same timeout is used to purge the per-process stats of those processes
+ * that have not sent jobs during this period.
+ */
+#define V3D_QUEUE_STATS_TIMEOUT (70 * HZ)
+
+
 /* Performance monitor object. The perform lifetime is controlled by userspace
  * using perfmon related ioctls. A perfmon can be attached to a submit_cl
  * request, and when this is the case, HW perf counters will be activated just
@@ -147,6 +198,8 @@ struct v3d_dev {
 		u32 num_allocated;
 		u32 pages_allocated;
 	} bo_stats;
+
+	struct v3d_queue_stats gpu_queue_stats[V3D_MAX_QUEUES];
 };
 
 static inline struct v3d_dev *
@@ -244,6 +297,11 @@ struct v3d_job {
 	 */
 	struct v3d_perfmon *perfmon;
 
+	/* PID of the process that submitted the job, which can be used
+	 * for collecting per-process gpu usage stats.
+	 */
+	pid_t client_pid;
+
 	/* Callback for the freeing of the job on refcount going to 0. */
 	void (*free)(struct kref *ref);
 };
@@ -408,6 +466,7 @@ void v3d_mmu_remove_ptes(struct v3d_bo *
 /* v3d_sched.c */
 int v3d_sched_init(struct v3d_dev *v3d);
 void v3d_sched_fini(struct v3d_dev *v3d);
+void v3d_sched_stats_update(struct v3d_queue_stats *queue_stats);
 
 /* v3d_perfmon.c */
 void v3d_perfmon_get(struct v3d_perfmon *perfmon);
--- a/drivers/gpu/drm/v3d/v3d_gem.c
+++ b/drivers/gpu/drm/v3d/v3d_gem.c
@@ -460,6 +460,7 @@ v3d_job_init(struct v3d_dev *v3d, struct
 	job = *container;
 	job->v3d = v3d;
 	job->free = free;
+	job->client_pid = current->pid;
 
 	ret = drm_sched_job_init(&job->base, &v3d_priv->sched_entity[queue],
 				 v3d_priv);
--- a/drivers/gpu/drm/v3d/v3d_irq.c
+++ b/drivers/gpu/drm/v3d/v3d_irq.c
@@ -14,6 +14,7 @@
  */
 
 #include <linux/platform_device.h>
+#include <linux/sched/clock.h>
 
 #include "v3d_drv.h"
 #include "v3d_regs.h"
@@ -100,6 +101,7 @@ v3d_irq(int irq, void *arg)
 	if (intsts & V3D_INT_FLDONE) {
 		struct v3d_fence *fence =
 			to_v3d_fence(v3d->bin_job->base.irq_fence);
+		v3d->gpu_queue_stats[V3D_BIN].last_exec_end = local_clock();
 
 		trace_v3d_bcl_irq(&v3d->drm, fence->seqno);
 		dma_fence_signal(&fence->base);
@@ -109,6 +111,7 @@ v3d_irq(int irq, void *arg)
 	if (intsts & V3D_INT_FRDONE) {
 		struct v3d_fence *fence =
 			to_v3d_fence(v3d->render_job->base.irq_fence);
+		v3d->gpu_queue_stats[V3D_RENDER].last_exec_end = local_clock();
 
 		trace_v3d_rcl_irq(&v3d->drm, fence->seqno);
 		dma_fence_signal(&fence->base);
@@ -118,6 +121,7 @@ v3d_irq(int irq, void *arg)
 	if (intsts & V3D_INT_CSDDONE) {
 		struct v3d_fence *fence =
 			to_v3d_fence(v3d->csd_job->base.irq_fence);
+		v3d->gpu_queue_stats[V3D_CSD].last_exec_end = local_clock();
 
 		trace_v3d_csd_irq(&v3d->drm, fence->seqno);
 		dma_fence_signal(&fence->base);
@@ -154,6 +158,7 @@ v3d_hub_irq(int irq, void *arg)
 	if (intsts & V3D_HUB_INT_TFUC) {
 		struct v3d_fence *fence =
 			to_v3d_fence(v3d->tfu_job->base.irq_fence);
+		v3d->gpu_queue_stats[V3D_TFU].last_exec_end = local_clock();
 
 		trace_v3d_tfu_irq(&v3d->drm, fence->seqno);
 		dma_fence_signal(&fence->base);
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -19,6 +19,7 @@
  */
 
 #include <linux/kthread.h>
+#include <linux/sched/clock.h>
 
 #include "v3d_drv.h"
 #include "v3d_regs.h"
@@ -72,6 +73,114 @@ v3d_switch_perfmon(struct v3d_dev *v3d,
 		v3d_perfmon_start(v3d, job->perfmon);
 }
 
+/*
+ * Updates the runtime stats of the gpu scheduling queues for completed jobs.
+ *
+ * It should be called before any new job submission to the queue or before
+ * accessing the stats from the debugfs interface.
+ *
+ * It is expected that calls to this function are done with queue_stats->lock
+ * locked.
+ */
+void
+v3d_sched_stats_update(struct v3d_queue_stats *queue_stats)
+{
+	struct list_head *pid_stats_list = &queue_stats->pid_stats_list;
+	struct v3d_queue_pid_stats *cur, *tmp;
+	u64 runtime = 0;
+	bool store_pid_stats =
+		time_is_after_jiffies(queue_stats->gpu_pid_stats_timeout);
+
+	/* If the gpu_pid_usage debugfs file has not been polled for a period,
+	 * the pid stats collection is stopped and we purge any existing
+	 * pid_stats.
+	 *
+	 * pid_stats are also purged for clients that have reached the
+	 * timeout_purge because the process probably does not exist anymore.
+	 */
+	list_for_each_entry_safe_reverse(cur, tmp, pid_stats_list, list) {
+		if (!store_pid_stats || time_is_before_jiffies(cur->timeout_purge)) {
+			list_del(&cur->list);
+			kfree(cur);
+		} else {
+			break;
+		}
+	}
+	/* If a job has finished, its stats are updated. */
+	if (queue_stats->last_pid && queue_stats->last_exec_end) {
+		runtime = queue_stats->last_exec_end -
+			  queue_stats->last_exec_start;
+		queue_stats->runtime += runtime;
+
+		if (store_pid_stats) {
+			struct v3d_queue_pid_stats *pid_stats;
+			/* Last job info is always at the head of the list */
+			pid_stats = list_first_entry_or_null(pid_stats_list,
+				struct v3d_queue_pid_stats, list);
+			if (pid_stats &&
+			    pid_stats->pid == queue_stats->last_pid) {
+				pid_stats->runtime += runtime;
+			}
+		}
+		queue_stats->last_pid = 0;
+	}
+}
+
+/*
+ * Updates the queue usage, adding the information of a new job that is
+ * about to be sent to the GPU for execution.
+ */
+int
+v3d_sched_stats_add_job(struct v3d_queue_stats *queue_stats,
+			struct drm_sched_job *sched_job)
+{
+
+	struct v3d_queue_pid_stats *pid_stats = NULL;
+	struct v3d_job *job = sched_job ? to_v3d_job(sched_job) : NULL;
+	struct v3d_queue_pid_stats *cur;
+	struct list_head *pid_stats_list = &queue_stats->pid_stats_list;
+	int ret = 0;
+
+	mutex_lock(&queue_stats->lock);
+
+	/* Completion of previous job requires an update of its runtime stats */
+	v3d_sched_stats_update(queue_stats);
+
+	queue_stats->last_exec_start = local_clock();
+	queue_stats->last_exec_end = 0;
+	queue_stats->jobs_sent++;
+	queue_stats->last_pid = job->client_pid;
+
+	/* gpu usage stats by process are being collected */
+	if (time_is_after_jiffies(queue_stats->gpu_pid_stats_timeout)) {
+		list_for_each_entry(cur, pid_stats_list, list) {
+			if (cur->pid == job->client_pid) {
+				pid_stats = cur;
+				break;
+			}
+		}
+		/* pid_stats of this client is moved to the head of the list. */
+		if (pid_stats) {
+			list_move(&pid_stats->list, pid_stats_list);
+		} else {
+			pid_stats = kzalloc(sizeof(struct v3d_queue_pid_stats),
+					    GFP_KERNEL);
+			if (!pid_stats) {
+				ret = -ENOMEM;
+				goto err_mem;
+			}
+			pid_stats->pid = job->client_pid;
+			list_add(&pid_stats->list, pid_stats_list);
+		}
+		pid_stats->jobs_sent++;
+		pid_stats->timeout_purge = jiffies + V3D_QUEUE_STATS_TIMEOUT;
+	}
+
+err_mem:
+	mutex_unlock(&queue_stats->lock);
+	return ret;
+}
+
 static struct dma_fence *v3d_bin_job_run(struct drm_sched_job *sched_job)
 {
 	struct v3d_bin_job *job = to_bin_job(sched_job);
@@ -107,6 +216,7 @@ static struct dma_fence *v3d_bin_job_run
 	trace_v3d_submit_cl(dev, false, to_v3d_fence(fence)->seqno,
 			    job->start, job->end);
 
+	v3d_sched_stats_add_job(&v3d->gpu_queue_stats[V3D_BIN], sched_job);
 	v3d_switch_perfmon(v3d, &job->base);
 
 	/* Set the current and end address of the control list.
@@ -158,6 +268,7 @@ static struct dma_fence *v3d_render_job_
 	trace_v3d_submit_cl(dev, true, to_v3d_fence(fence)->seqno,
 			    job->start, job->end);
 
+	v3d_sched_stats_add_job(&v3d->gpu_queue_stats[V3D_RENDER], sched_job);
 	v3d_switch_perfmon(v3d, &job->base);
 
 	/* XXX: Set the QCFG */
@@ -190,6 +301,7 @@ v3d_tfu_job_run(struct drm_sched_job *sc
 
 	trace_v3d_submit_tfu(dev, to_v3d_fence(fence)->seqno);
 
+	v3d_sched_stats_add_job(&v3d->gpu_queue_stats[V3D_TFU], sched_job);
 	V3D_WRITE(V3D_TFU_IIA, job->args.iia);
 	V3D_WRITE(V3D_TFU_IIS, job->args.iis);
 	V3D_WRITE(V3D_TFU_ICA, job->args.ica);
@@ -231,6 +343,7 @@ v3d_csd_job_run(struct drm_sched_job *sc
 
 	trace_v3d_submit_csd(dev, to_v3d_fence(fence)->seqno);
 
+	v3d_sched_stats_add_job(&v3d->gpu_queue_stats[V3D_CSD], sched_job);
 	v3d_switch_perfmon(v3d, &job->base);
 
 	for (i = 1; i <= 6; i++)
@@ -247,7 +360,10 @@ v3d_cache_clean_job_run(struct drm_sched
 	struct v3d_job *job = to_v3d_job(sched_job);
 	struct v3d_dev *v3d = job->v3d;
 
+	v3d_sched_stats_add_job(&v3d->gpu_queue_stats[V3D_CACHE_CLEAN],
+				sched_job);
 	v3d_clean_caches(v3d);
+	v3d->gpu_queue_stats[V3D_CACHE_CLEAN].last_exec_end = local_clock();
 
 	return NULL;
 }
@@ -385,8 +501,18 @@ v3d_sched_init(struct v3d_dev *v3d)
 	int hw_jobs_limit = 1;
 	int job_hang_limit = 0;
 	int hang_limit_ms = 500;
+	enum v3d_queue q;
 	int ret;
 
+	for (q = 0; q < V3D_MAX_QUEUES; q++) {
+		INIT_LIST_HEAD(&v3d->gpu_queue_stats[q].pid_stats_list);
+		/* Setting timeout before current jiffies disables collecting
+		 * pid_stats on scheduling init.
+		 */
+		v3d->gpu_queue_stats[q].gpu_pid_stats_timeout = jiffies - 1;
+		mutex_init(&v3d->gpu_queue_stats[q].lock);
+	}
+
 	ret = drm_sched_init(&v3d->queue[V3D_BIN].sched,
 			     &v3d_bin_sched_ops,
 			     hw_jobs_limit, job_hang_limit,
@@ -440,9 +566,20 @@ void
 v3d_sched_fini(struct v3d_dev *v3d)
 {
 	enum v3d_queue q;
+	struct v3d_queue_stats *queue_stats;
 
 	for (q = 0; q < V3D_MAX_QUEUES; q++) {
-		if (v3d->queue[q].sched.ready)
+		if (v3d->queue[q].sched.ready) {
+			queue_stats = &v3d->gpu_queue_stats[q];
+			mutex_lock(&queue_stats->lock);
+			/* Setting gpu_pid_stats_timeout to jiffies-1 will
+			 * make v3d_sched_stats_update() purge all
+			 * allocated pid_stats.
+			 */
+			queue_stats->gpu_pid_stats_timeout = jiffies - 1;
+			v3d_sched_stats_update(queue_stats);
+			mutex_unlock(&queue_stats->lock);
 			drm_sched_fini(&v3d->queue[q].sched);
+		}
 	}
 }