[RFC] drm/msm: Add GPU memory traces
Commit Message
From: Rob Clark <robdclark@chromium.org>
Perfetto can use these traces to track global and per-process GPU memory
usage.
Signed-off-by: Rob Clark <robdclark@chromium.org>
---
I realized the tracepoint that perfetto uses to show GPU memory usage
globally and per-process was already upstream, but with no users.
This overlaps a bit with fdinfo, but ftrace is a lighter weight
mechanism and fits better with perfetto (plus is already supported in
trace_processor and perfetto UI, whereas something fdinfo based would
require new code to be added in perfetto.
We could probably do this more globally (ie. drm_gem_get/put_pages() and
drm_gem_handle_create_tail()/drm_gem_object_release_handle() if folks
prefer. Not sure where that leaves the TTM drivers.
drivers/gpu/drm/msm/Kconfig | 1 +
drivers/gpu/drm/msm/msm_drv.h | 5 +++++
drivers/gpu/drm/msm/msm_gem.c | 37 +++++++++++++++++++++++++++++++++++
drivers/gpu/drm/msm/msm_gpu.h | 8 ++++++++
4 files changed, 51 insertions(+)
Comments
On Fri, Mar 1, 2024 at 10:53 AM Rob Clark <robdclark@gmail.com> wrote:
>
> From: Rob Clark <robdclark@chromium.org>
>
> Perfetto can use these traces to track global and per-process GPU memory
> usage.
>
> Signed-off-by: Rob Clark <robdclark@chromium.org>
> ---
> I realized the tracepoint that perfetto uses to show GPU memory usage
> globally and per-process was already upstream, but with no users.
>
> This overlaps a bit with fdinfo, but ftrace is a lighter weight
> mechanism and fits better with perfetto (plus is already supported in
> trace_processor and perfetto UI, whereas something fdinfo based would
> require new code to be added in perfetto.
Side-note, I'm also investigating mesa based perfetto memory traces,
which can give a more granular view (ie. breakdown of memory used for
image/buffer/cmdstream/cache/etc), but not a global view. And the
userspace based traces have the unfortunate design decision to trace
incremental rather than absolute total values, so results can be
incorrect if traces are dropped. So neither userspace based nor
kernel based gpu memory traces are an adequate replacement for the
other.
BR,
-R
> We could probably do this more globally (ie. drm_gem_get/put_pages() and
> drm_gem_handle_create_tail()/drm_gem_object_release_handle() if folks
> prefer. Not sure where that leaves the TTM drivers.
>
> drivers/gpu/drm/msm/Kconfig | 1 +
> drivers/gpu/drm/msm/msm_drv.h | 5 +++++
> drivers/gpu/drm/msm/msm_gem.c | 37 +++++++++++++++++++++++++++++++++++
> drivers/gpu/drm/msm/msm_gpu.h | 8 ++++++++
> 4 files changed, 51 insertions(+)
>
> diff --git a/drivers/gpu/drm/msm/Kconfig b/drivers/gpu/drm/msm/Kconfig
> index f202f26adab2..e4c912fcaf22 100644
> --- a/drivers/gpu/drm/msm/Kconfig
> +++ b/drivers/gpu/drm/msm/Kconfig
> @@ -33,6 +33,7 @@ config DRM_MSM
> select PM_OPP
> select NVMEM
> select PM_GENERIC_DOMAINS
> + select TRACE_GPU_MEM
> help
> DRM/KMS driver for MSM/snapdragon.
>
> diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
> index 16a7cbc0b7dd..cb8f7e804b5b 100644
> --- a/drivers/gpu/drm/msm/msm_drv.h
> +++ b/drivers/gpu/drm/msm/msm_drv.h
> @@ -137,6 +137,11 @@ struct msm_drm_private {
> struct msm_rd_state *hangrd; /* debugfs to dump hanging submits */
> struct msm_perf_state *perf;
>
> + /**
> + * total_mem: Total/global amount of memory backing GEM objects.
> + */
> + atomic64_t total_mem;
> +
> /**
> * List of all GEM objects (mainly for debugfs, protected by obj_lock
> * (acquire before per GEM object lock)
> diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
> index 175ee4ab8a6f..e04c4af5d154 100644
> --- a/drivers/gpu/drm/msm/msm_gem.c
> +++ b/drivers/gpu/drm/msm/msm_gem.c
> @@ -12,6 +12,9 @@
> #include <linux/pfn_t.h>
>
> #include <drm/drm_prime.h>
> +#include <drm/drm_file.h>
> +
> +#include <trace/events/gpu_mem.h>
>
> #include "msm_drv.h"
> #include "msm_fence.h"
> @@ -33,6 +36,34 @@ static bool use_pages(struct drm_gem_object *obj)
> return !msm_obj->vram_node;
> }
>
> +static void update_device_mem(struct msm_drm_private *priv, ssize_t size)
> +{
> + uint64_t total_mem = atomic64_add_return(size, &priv->total_mem);
> + trace_gpu_mem_total(0, 0, total_mem);
> +}
> +
> +static void update_ctx_mem(struct drm_file *file, ssize_t size)
> +{
> + struct msm_file_private *ctx = file->driver_priv;
> + uint64_t ctx_mem = atomic64_add_return(size, &ctx->ctx_mem);
> +
> + rcu_read_lock(); /* Locks file->pid! */
> + trace_gpu_mem_total(0, pid_nr(file->pid), ctx_mem);
> + rcu_read_unlock();
> +
> +}
> +
> +static int msm_gem_open(struct drm_gem_object *obj, struct drm_file *file)
> +{
> + update_ctx_mem(file, obj->size);
> + return 0;
> +}
> +
> +static void msm_gem_close(struct drm_gem_object *obj, struct drm_file *file)
> +{
> + update_ctx_mem(file, -obj->size);
> +}
> +
> /*
> * Cache sync.. this is a bit over-complicated, to fit dma-mapping
> * API. Really GPU cache is out of scope here (handled on cmdstream)
> @@ -156,6 +187,8 @@ static struct page **get_pages(struct drm_gem_object *obj)
> return p;
> }
>
> + update_device_mem(dev->dev_private, obj->size);
> +
> msm_obj->pages = p;
>
> msm_obj->sgt = drm_prime_pages_to_sg(obj->dev, p, npages);
> @@ -209,6 +242,8 @@ static void put_pages(struct drm_gem_object *obj)
> msm_obj->sgt = NULL;
> }
>
> + update_device_mem(obj->dev->dev_private, -obj->size);
> +
> if (use_pages(obj))
> drm_gem_put_pages(obj, msm_obj->pages, true, false);
> else
> @@ -1118,6 +1153,8 @@ static const struct vm_operations_struct vm_ops = {
>
> static const struct drm_gem_object_funcs msm_gem_object_funcs = {
> .free = msm_gem_free_object,
> + .open = msm_gem_open,
> + .close = msm_gem_close,
> .pin = msm_gem_prime_pin,
> .unpin = msm_gem_prime_unpin,
> .get_sg_table = msm_gem_prime_get_sg_table,
> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
> index 2bfcb222e353..f7d2a7d6f8cc 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.h
> +++ b/drivers/gpu/drm/msm/msm_gpu.h
> @@ -428,6 +428,14 @@ struct msm_file_private {
> * level.
> */
> struct drm_sched_entity *entities[NR_SCHED_PRIORITIES * MSM_GPU_MAX_RINGS];
> +
> + /**
> + * ctx_mem:
> + *
> + * Total amount of memory of GEM buffers with handles attached for
> + * this context.
> + */
> + atomic64_t ctx_mem;
> };
>
> /**
> --
> 2.44.0
>
@@ -33,6 +33,7 @@ config DRM_MSM
select PM_OPP
select NVMEM
select PM_GENERIC_DOMAINS
+ select TRACE_GPU_MEM
help
DRM/KMS driver for MSM/snapdragon.
@@ -137,6 +137,11 @@ struct msm_drm_private {
struct msm_rd_state *hangrd; /* debugfs to dump hanging submits */
struct msm_perf_state *perf;
+ /**
+ * total_mem: Total/global amount of memory backing GEM objects.
+ */
+ atomic64_t total_mem;
+
/**
* List of all GEM objects (mainly for debugfs, protected by obj_lock
* (acquire before per GEM object lock)
@@ -12,6 +12,9 @@
#include <linux/pfn_t.h>
#include <drm/drm_prime.h>
+#include <drm/drm_file.h>
+
+#include <trace/events/gpu_mem.h>
#include "msm_drv.h"
#include "msm_fence.h"
@@ -33,6 +36,34 @@ static bool use_pages(struct drm_gem_object *obj)
return !msm_obj->vram_node;
}
+static void update_device_mem(struct msm_drm_private *priv, ssize_t size)
+{
+ uint64_t total_mem = atomic64_add_return(size, &priv->total_mem);
+ trace_gpu_mem_total(0, 0, total_mem);
+}
+
+static void update_ctx_mem(struct drm_file *file, ssize_t size)
+{
+ struct msm_file_private *ctx = file->driver_priv;
+ uint64_t ctx_mem = atomic64_add_return(size, &ctx->ctx_mem);
+
+ rcu_read_lock(); /* Locks file->pid! */
+ trace_gpu_mem_total(0, pid_nr(file->pid), ctx_mem);
+ rcu_read_unlock();
+
+}
+
+static int msm_gem_open(struct drm_gem_object *obj, struct drm_file *file)
+{
+ update_ctx_mem(file, obj->size);
+ return 0;
+}
+
+static void msm_gem_close(struct drm_gem_object *obj, struct drm_file *file)
+{
+ update_ctx_mem(file, -obj->size);
+}
+
/*
* Cache sync.. this is a bit over-complicated, to fit dma-mapping
* API. Really GPU cache is out of scope here (handled on cmdstream)
@@ -156,6 +187,8 @@ static struct page **get_pages(struct drm_gem_object *obj)
return p;
}
+ update_device_mem(dev->dev_private, obj->size);
+
msm_obj->pages = p;
msm_obj->sgt = drm_prime_pages_to_sg(obj->dev, p, npages);
@@ -209,6 +242,8 @@ static void put_pages(struct drm_gem_object *obj)
msm_obj->sgt = NULL;
}
+ update_device_mem(obj->dev->dev_private, -obj->size);
+
if (use_pages(obj))
drm_gem_put_pages(obj, msm_obj->pages, true, false);
else
@@ -1118,6 +1153,8 @@ static const struct vm_operations_struct vm_ops = {
static const struct drm_gem_object_funcs msm_gem_object_funcs = {
.free = msm_gem_free_object,
+ .open = msm_gem_open,
+ .close = msm_gem_close,
.pin = msm_gem_prime_pin,
.unpin = msm_gem_prime_unpin,
.get_sg_table = msm_gem_prime_get_sg_table,
@@ -428,6 +428,14 @@ struct msm_file_private {
* level.
*/
struct drm_sched_entity *entities[NR_SCHED_PRIORITIES * MSM_GPU_MAX_RINGS];
+
+ /**
+ * ctx_mem:
+ *
+ * Total amount of memory of GEM buffers with handles attached for
+ * this context.
+ */
+ atomic64_t ctx_mem;
};
/**