[5/6] drm/amdgpu: Log IBs and ring name at coredump
Commit Message
Log the IB addresses used by the hung job along with the stuck ring
name. Note that due to nested IBs, the one that caused the reset itself
may be at an address that is not listed.
Signed-off-by: André Almeida <andrealmeid@igalia.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 +++
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 31 +++++++++++++++++++++-
2 files changed, 33 insertions(+), 1 deletion(-)
@@ -1086,6 +1086,9 @@ struct amdgpu_coredump_info {
struct amdgpu_task_info reset_task_info;
struct timespec64 reset_time;
bool reset_vram_lost;
+ u64 *ibs;
+ u32 num_ibs;
+ char ring_name[16];
};
#endif
@@ -5008,12 +5008,24 @@ static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
coredump->adev->reset_dump_reg_value[i]);
}
+ if (coredump->num_ibs) {
+ drm_printf(&p, "IBs:\n");
+ for (i = 0; i < coredump->num_ibs; i++)
+ drm_printf(&p, "\t[%d] 0x%llx\n", i, coredump->ibs[i]);
+ }
+
+ if (coredump->ring_name[0] != '\0')
+ drm_printf(&p, "ring name: %s\n", coredump->ring_name);
+
return count - iter.remain;
}
static void amdgpu_devcoredump_free(void *data)
{
- kfree(data);
+ struct amdgpu_coredump_info *coredump = data;
+
+ kfree(coredump->ibs);
+ kfree(coredump);
}
static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
@@ -5021,6 +5033,8 @@ static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
{
struct amdgpu_coredump_info *coredump;
struct drm_device *dev = adev_to_drm(adev);
+ struct amdgpu_job *job = reset_context->job;
+ int i;
coredump = kmalloc(sizeof(*coredump), GFP_KERNEL);
@@ -5038,6 +5052,21 @@ static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
coredump->adev = adev;
+ if (job && job->num_ibs) {
+ struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched);
+ u32 num_ibs = job->num_ibs;
+
+ coredump->ibs = kmalloc_array(num_ibs, sizeof(coredump->ibs), GFP_KERNEL);
+ if (coredump->ibs)
+ coredump->num_ibs = num_ibs;
+
+ for (i = 0; i < coredump->num_ibs; i++)
+ coredump->ibs[i] = job->ibs[i].gpu_addr;
+
+ if (ring)
+ strncpy(coredump->ring_name, ring->name, 16);
+ }
+
ktime_get_ts64(&coredump->reset_time);
dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_KERNEL,