[RFC,v3,3/4] drm/amdgpu: Implement DRM_IOCTL_GET_RESET
Commit Message
Implement get_reset ioctl for amdgpu
Signed-off-by: André Almeida <andrealmeid@igalia.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 ++-
drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 35 +++++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h | 5 ++++
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 12 +++++++--
drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 2 ++
6 files changed, 56 insertions(+), 3 deletions(-)
Comments
Am 21.06.23 um 02:57 schrieb André Almeida:
> Implement get_reset ioctl for amdgpu
Well that pretty much won't work since the jobs are destroyed much later
than the contexts.
Christian.
>
> Signed-off-by: André Almeida <andrealmeid@igalia.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 ++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 35 +++++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h | 5 ++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 12 +++++++--
> drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 2 ++
> 6 files changed, 56 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 2eb2c66843a8..0ba26b4b039c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -1262,8 +1262,10 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
> uint64_t seq;
> int r;
>
> - for (i = 0; i < p->gang_size; ++i)
> + for (i = 0; i < p->gang_size; ++i) {
> + p->jobs[i]->ctx = p->ctx;
> drm_sched_job_arm(&p->jobs[i]->base);
> + }
>
> for (i = 0; i < p->gang_size; ++i) {
> struct dma_fence *fence;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> index d2139ac12159..d3e292382d4a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> @@ -322,6 +322,9 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, int32_t priority,
> ctx->init_priority = priority;
> ctx->override_priority = AMDGPU_CTX_PRIORITY_UNSET;
>
> + ctx->global_reset_counter = atomic_read(&mgr->adev->gpu_reset_counter);
> + ctx->local_reset_counter = 0;
> +
> r = amdgpu_ctx_get_stable_pstate(ctx, &current_stable_pstate);
> if (r)
> return r;
> @@ -963,3 +966,35 @@ void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr,
> }
> mutex_unlock(&mgr->lock);
> }
> +
> +int amdgpu_get_reset(struct drm_file *filp, struct drm_device *dev,
> + struct drm_get_reset *reset)
> +{
> + struct amdgpu_device *adev = drm_to_adev(dev);
> + struct amdgpu_ctx *ctx;
> + struct amdgpu_ctx_mgr *mgr;
> + unsigned int id = reset->ctx_id;
> + struct amdgpu_fpriv *fpriv = filp->driver_priv;
> +
> + mgr = &fpriv->ctx_mgr;
> + mutex_lock(&mgr->lock);
> + ctx = idr_find(&mgr->ctx_handles, id);
> + if (!ctx) {
> + mutex_unlock(&mgr->lock);
> + return -EINVAL;
> + }
> +
> + reset->dev_reset_count =
> + atomic_read(&adev->gpu_reset_counter) - ctx->global_reset_counter;
> +
> + reset->ctx_reset_count = ctx->local_reset_counter;
> +
> + if (amdgpu_in_reset(adev))
> + reset->flags |= DRM_RESET_IN_PROGRESS;
> +
> + if (ctx->vram_lost_counter != atomic_read(&adev->vram_lost_counter))
> + reset->flags |= DRM_RESET_VRAM_LOST;
> +
> + mutex_unlock(&mgr->lock);
> + return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> index 0fa0e56daf67..0c9815695884 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> @@ -57,6 +57,9 @@ struct amdgpu_ctx {
> unsigned long ras_counter_ce;
> unsigned long ras_counter_ue;
> uint32_t stable_pstate;
> +
> + uint64_t global_reset_counter;
> + uint64_t local_reset_counter;
> };
>
> struct amdgpu_ctx_mgr {
> @@ -97,4 +100,6 @@ void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr);
> void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr,
> ktime_t usage[AMDGPU_HW_IP_NUM]);
>
> +int amdgpu_get_reset(struct drm_file *file_priv, struct drm_device *dev,
> + struct drm_get_reset *reset);
> #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index c9a41c997c6c..431791b2c3cb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -2805,6 +2805,7 @@ static const struct drm_driver amdgpu_kms_driver = {
> #ifdef CONFIG_PROC_FS
> .show_fdinfo = amdgpu_show_fdinfo,
> #endif
> + .get_reset = amdgpu_get_reset,
>
> .prime_handle_to_fd = drm_gem_prime_handle_to_fd,
> .prime_fd_to_handle = drm_gem_prime_fd_to_handle,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index c3d9d75143f4..1553a2633d46 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -35,11 +35,20 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
> {
> struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
> struct amdgpu_job *job = to_amdgpu_job(s_job);
> + struct drm_sched_entity *entity = job->base.entity;
> struct amdgpu_task_info ti;
> struct amdgpu_device *adev = ring->adev;
> int idx;
> int r;
>
> + memset(&ti, 0, sizeof(struct amdgpu_task_info));
> + amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
> +
> + if (job->ctx) {
> + DRM_INFO("Increasing ctx reset count for %s (%d)\n", ti.process_name, ti.pid);
> + job->ctx->local_reset_counter++;
> + }
> +
> if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
> DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
> __func__, s_job->sched->name);
> @@ -48,7 +57,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
> return DRM_GPU_SCHED_STAT_ENODEV;
> }
>
> - memset(&ti, 0, sizeof(struct amdgpu_task_info));
> adev->job_hang = true;
>
> if (amdgpu_gpu_recovery &&
> @@ -58,7 +66,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
> goto exit;
> }
>
> - amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
> DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
> job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
> ring->fence_drv.sync_seq);
> @@ -105,6 +112,7 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> */
> (*job)->base.sched = &adev->rings[0]->sched;
> (*job)->vm = vm;
> + (*job)->ctx = NULL;
>
> amdgpu_sync_create(&(*job)->explicit_sync);
> (*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index 52f2e313ea17..0d463babaa60 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -63,6 +63,8 @@ struct amdgpu_job {
> uint32_t oa_base, oa_size;
> uint32_t vram_lost_counter;
>
> + struct amdgpu_ctx *ctx;
> +
> /* user fence handling */
> uint64_t uf_addr;
> uint64_t uf_sequence;
Em 21/06/2023 04:40, Christian König escreveu:
> Am 21.06.23 um 02:57 schrieb André Almeida:
>> Implement get_reset ioctl for amdgpu
>
> Well that pretty much won't work since the jobs are destroyed much later
> than the contexts.
>
Why does this prevent the code from working? If the context is destroyed, it
can't be queried anyway.
> Christian.
>
>>
>> Signed-off-by: André Almeida <andrealmeid@igalia.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 ++-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 35 +++++++++++++++++++++++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h | 5 ++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 +
>> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 12 +++++++--
>> drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 2 ++
>> 6 files changed, 56 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index 2eb2c66843a8..0ba26b4b039c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -1262,8 +1262,10 @@ static int amdgpu_cs_submit(struct
>> amdgpu_cs_parser *p,
>> uint64_t seq;
>> int r;
>> - for (i = 0; i < p->gang_size; ++i)
>> + for (i = 0; i < p->gang_size; ++i) {
>> + p->jobs[i]->ctx = p->ctx;
>> drm_sched_job_arm(&p->jobs[i]->base);
>> + }
>> for (i = 0; i < p->gang_size; ++i) {
>> struct dma_fence *fence;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> index d2139ac12159..d3e292382d4a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> @@ -322,6 +322,9 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr
>> *mgr, int32_t priority,
>> ctx->init_priority = priority;
>> ctx->override_priority = AMDGPU_CTX_PRIORITY_UNSET;
>> + ctx->global_reset_counter =
>> atomic_read(&mgr->adev->gpu_reset_counter);
>> + ctx->local_reset_counter = 0;
>> +
>> r = amdgpu_ctx_get_stable_pstate(ctx, &current_stable_pstate);
>> if (r)
>> return r;
>> @@ -963,3 +966,35 @@ void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr
>> *mgr,
>> }
>> mutex_unlock(&mgr->lock);
>> }
>> +
>> +int amdgpu_get_reset(struct drm_file *filp, struct drm_device *dev,
>> + struct drm_get_reset *reset)
>> +{
>> + struct amdgpu_device *adev = drm_to_adev(dev);
>> + struct amdgpu_ctx *ctx;
>> + struct amdgpu_ctx_mgr *mgr;
>> + unsigned int id = reset->ctx_id;
>> + struct amdgpu_fpriv *fpriv = filp->driver_priv;
>> +
>> + mgr = &fpriv->ctx_mgr;
>> + mutex_lock(&mgr->lock);
>> + ctx = idr_find(&mgr->ctx_handles, id);
>> + if (!ctx) {
>> + mutex_unlock(&mgr->lock);
>> + return -EINVAL;
>> + }
>> +
>> + reset->dev_reset_count =
>> + atomic_read(&adev->gpu_reset_counter) -
>> ctx->global_reset_counter;
>> +
>> + reset->ctx_reset_count = ctx->local_reset_counter;
>> +
>> + if (amdgpu_in_reset(adev))
>> + reset->flags |= DRM_RESET_IN_PROGRESS;
>> +
>> + if (ctx->vram_lost_counter != atomic_read(&adev->vram_lost_counter))
>> + reset->flags |= DRM_RESET_VRAM_LOST;
>> +
>> + mutex_unlock(&mgr->lock);
>> + return 0;
>> +}
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
>> index 0fa0e56daf67..0c9815695884 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
>> @@ -57,6 +57,9 @@ struct amdgpu_ctx {
>> unsigned long ras_counter_ce;
>> unsigned long ras_counter_ue;
>> uint32_t stable_pstate;
>> +
>> + uint64_t global_reset_counter;
>> + uint64_t local_reset_counter;
>> };
>> struct amdgpu_ctx_mgr {
>> @@ -97,4 +100,6 @@ void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr);
>> void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr,
>> ktime_t usage[AMDGPU_HW_IP_NUM]);
>> +int amdgpu_get_reset(struct drm_file *file_priv, struct drm_device *dev,
>> + struct drm_get_reset *reset);
>> #endif
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index c9a41c997c6c..431791b2c3cb 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -2805,6 +2805,7 @@ static const struct drm_driver amdgpu_kms_driver
>> = {
>> #ifdef CONFIG_PROC_FS
>> .show_fdinfo = amdgpu_show_fdinfo,
>> #endif
>> + .get_reset = amdgpu_get_reset,
>> .prime_handle_to_fd = drm_gem_prime_handle_to_fd,
>> .prime_fd_to_handle = drm_gem_prime_fd_to_handle,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index c3d9d75143f4..1553a2633d46 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -35,11 +35,20 @@ static enum drm_gpu_sched_stat
>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>> {
>> struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
>> struct amdgpu_job *job = to_amdgpu_job(s_job);
>> + struct drm_sched_entity *entity = job->base.entity;
>> struct amdgpu_task_info ti;
>> struct amdgpu_device *adev = ring->adev;
>> int idx;
>> int r;
>> + memset(&ti, 0, sizeof(struct amdgpu_task_info));
>> + amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
>> +
>> + if (job->ctx) {
>> + DRM_INFO("Increasing ctx reset count for %s (%d)\n",
>> ti.process_name, ti.pid);
>> + job->ctx->local_reset_counter++;
>> + }
>> +
>> if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
>> DRM_INFO("%s - device unplugged skipping recovery on
>> scheduler:%s",
>> __func__, s_job->sched->name);
>> @@ -48,7 +57,6 @@ static enum drm_gpu_sched_stat
>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>> return DRM_GPU_SCHED_STAT_ENODEV;
>> }
>> - memset(&ti, 0, sizeof(struct amdgpu_task_info));
>> adev->job_hang = true;
>> if (amdgpu_gpu_recovery &&
>> @@ -58,7 +66,6 @@ static enum drm_gpu_sched_stat
>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>> goto exit;
>> }
>> - amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
>> DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
>> job->base.sched->name,
>> atomic_read(&ring->fence_drv.last_seq),
>> ring->fence_drv.sync_seq);
>> @@ -105,6 +112,7 @@ int amdgpu_job_alloc(struct amdgpu_device *adev,
>> struct amdgpu_vm *vm,
>> */
>> (*job)->base.sched = &adev->rings[0]->sched;
>> (*job)->vm = vm;
>> + (*job)->ctx = NULL;
>> amdgpu_sync_create(&(*job)->explicit_sync);
>> (*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> index 52f2e313ea17..0d463babaa60 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> @@ -63,6 +63,8 @@ struct amdgpu_job {
>> uint32_t oa_base, oa_size;
>> uint32_t vram_lost_counter;
>> + struct amdgpu_ctx *ctx;
>> +
>> /* user fence handling */
>> uint64_t uf_addr;
>> uint64_t uf_sequence;
>
Am 21.06.23 um 18:38 schrieb André Almeida:
> Em 21/06/2023 04:40, Christian König escreveu:
>> Am 21.06.23 um 02:57 schrieb André Almeida:
>>> Implement get_reset ioctl for amdgpu
>>
>> Well that pretty much won't work since the jobs are destroyed much
>> later than the contexts.
>>
>
> Why does this prevent the code from working? If the context is destroyed,
> it can't be queried anyway.
Yeah, but you cause use after free issues with that!
The references are ctx->entity->job->fence, so that ctx and entity can be
destroyed first without destroying the job or fence.
If the job has a back reference, that whole scheme doesn't work any more
and the pointer is potentially dangling.
Christian.
>
>> Christian.
>>
>>>
>>> Signed-off-by: André Almeida <andrealmeid@igalia.com>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 ++-
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 35
>>> +++++++++++++++++++++++++
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h | 5 ++++
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 +
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 12 +++++++--
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 2 ++
>>> 6 files changed, 56 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> index 2eb2c66843a8..0ba26b4b039c 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> @@ -1262,8 +1262,10 @@ static int amdgpu_cs_submit(struct
>>> amdgpu_cs_parser *p,
>>> uint64_t seq;
>>> int r;
>>> - for (i = 0; i < p->gang_size; ++i)
>>> + for (i = 0; i < p->gang_size; ++i) {
>>> + p->jobs[i]->ctx = p->ctx;
>>> drm_sched_job_arm(&p->jobs[i]->base);
>>> + }
>>> for (i = 0; i < p->gang_size; ++i) {
>>> struct dma_fence *fence;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>>> index d2139ac12159..d3e292382d4a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>>> @@ -322,6 +322,9 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr
>>> *mgr, int32_t priority,
>>> ctx->init_priority = priority;
>>> ctx->override_priority = AMDGPU_CTX_PRIORITY_UNSET;
>>> + ctx->global_reset_counter =
>>> atomic_read(&mgr->adev->gpu_reset_counter);
>>> + ctx->local_reset_counter = 0;
>>> +
>>> r = amdgpu_ctx_get_stable_pstate(ctx, &current_stable_pstate);
>>> if (r)
>>> return r;
>>> @@ -963,3 +966,35 @@ void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr
>>> *mgr,
>>> }
>>> mutex_unlock(&mgr->lock);
>>> }
>>> +
>>> +int amdgpu_get_reset(struct drm_file *filp, struct drm_device *dev,
>>> + struct drm_get_reset *reset)
>>> +{
>>> + struct amdgpu_device *adev = drm_to_adev(dev);
>>> + struct amdgpu_ctx *ctx;
>>> + struct amdgpu_ctx_mgr *mgr;
>>> + unsigned int id = reset->ctx_id;
>>> + struct amdgpu_fpriv *fpriv = filp->driver_priv;
>>> +
>>> + mgr = &fpriv->ctx_mgr;
>>> + mutex_lock(&mgr->lock);
>>> + ctx = idr_find(&mgr->ctx_handles, id);
>>> + if (!ctx) {
>>> + mutex_unlock(&mgr->lock);
>>> + return -EINVAL;
>>> + }
>>> +
>>> + reset->dev_reset_count =
>>> + atomic_read(&adev->gpu_reset_counter) -
>>> ctx->global_reset_counter;
>>> +
>>> + reset->ctx_reset_count = ctx->local_reset_counter;
>>> +
>>> + if (amdgpu_in_reset(adev))
>>> + reset->flags |= DRM_RESET_IN_PROGRESS;
>>> +
>>> + if (ctx->vram_lost_counter !=
>>> atomic_read(&adev->vram_lost_counter))
>>> + reset->flags |= DRM_RESET_VRAM_LOST;
>>> +
>>> + mutex_unlock(&mgr->lock);
>>> + return 0;
>>> +}
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
>>> index 0fa0e56daf67..0c9815695884 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
>>> @@ -57,6 +57,9 @@ struct amdgpu_ctx {
>>> unsigned long ras_counter_ce;
>>> unsigned long ras_counter_ue;
>>> uint32_t stable_pstate;
>>> +
>>> + uint64_t global_reset_counter;
>>> + uint64_t local_reset_counter;
>>> };
>>> struct amdgpu_ctx_mgr {
>>> @@ -97,4 +100,6 @@ void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr
>>> *mgr);
>>> void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr,
>>> ktime_t usage[AMDGPU_HW_IP_NUM]);
>>> +int amdgpu_get_reset(struct drm_file *file_priv, struct drm_device
>>> *dev,
>>> + struct drm_get_reset *reset);
>>> #endif
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index c9a41c997c6c..431791b2c3cb 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -2805,6 +2805,7 @@ static const struct drm_driver
>>> amdgpu_kms_driver = {
>>> #ifdef CONFIG_PROC_FS
>>> .show_fdinfo = amdgpu_show_fdinfo,
>>> #endif
>>> + .get_reset = amdgpu_get_reset,
>>> .prime_handle_to_fd = drm_gem_prime_handle_to_fd,
>>> .prime_fd_to_handle = drm_gem_prime_fd_to_handle,
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> index c3d9d75143f4..1553a2633d46 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> @@ -35,11 +35,20 @@ static enum drm_gpu_sched_stat
>>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>>> {
>>> struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
>>> struct amdgpu_job *job = to_amdgpu_job(s_job);
>>> + struct drm_sched_entity *entity = job->base.entity;
>>> struct amdgpu_task_info ti;
>>> struct amdgpu_device *adev = ring->adev;
>>> int idx;
>>> int r;
>>> + memset(&ti, 0, sizeof(struct amdgpu_task_info));
>>> + amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
>>> +
>>> + if (job->ctx) {
>>> + DRM_INFO("Increasing ctx reset count for %s (%d)\n",
>>> ti.process_name, ti.pid);
>>> + job->ctx->local_reset_counter++;
>>> + }
>>> +
>>> if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
>>> DRM_INFO("%s - device unplugged skipping recovery on
>>> scheduler:%s",
>>> __func__, s_job->sched->name);
>>> @@ -48,7 +57,6 @@ static enum drm_gpu_sched_stat
>>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>>> return DRM_GPU_SCHED_STAT_ENODEV;
>>> }
>>> - memset(&ti, 0, sizeof(struct amdgpu_task_info));
>>> adev->job_hang = true;
>>> if (amdgpu_gpu_recovery &&
>>> @@ -58,7 +66,6 @@ static enum drm_gpu_sched_stat
>>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>>> goto exit;
>>> }
>>> - amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
>>> DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
>>> job->base.sched->name,
>>> atomic_read(&ring->fence_drv.last_seq),
>>> ring->fence_drv.sync_seq);
>>> @@ -105,6 +112,7 @@ int amdgpu_job_alloc(struct amdgpu_device *adev,
>>> struct amdgpu_vm *vm,
>>> */
>>> (*job)->base.sched = &adev->rings[0]->sched;
>>> (*job)->vm = vm;
>>> + (*job)->ctx = NULL;
>>> amdgpu_sync_create(&(*job)->explicit_sync);
>>> (*job)->vram_lost_counter =
>>> atomic_read(&adev->vram_lost_counter);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> index 52f2e313ea17..0d463babaa60 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> @@ -63,6 +63,8 @@ struct amdgpu_job {
>>> uint32_t oa_base, oa_size;
>>> uint32_t vram_lost_counter;
>>> + struct amdgpu_ctx *ctx;
>>> +
>>> /* user fence handling */
>>> uint64_t uf_addr;
>>> uint64_t uf_sequence;
>>
@@ -1262,8 +1262,10 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
uint64_t seq;
int r;
- for (i = 0; i < p->gang_size; ++i)
+ for (i = 0; i < p->gang_size; ++i) {
+ p->jobs[i]->ctx = p->ctx;
drm_sched_job_arm(&p->jobs[i]->base);
+ }
for (i = 0; i < p->gang_size; ++i) {
struct dma_fence *fence;
@@ -322,6 +322,9 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, int32_t priority,
ctx->init_priority = priority;
ctx->override_priority = AMDGPU_CTX_PRIORITY_UNSET;
+ ctx->global_reset_counter = atomic_read(&mgr->adev->gpu_reset_counter);
+ ctx->local_reset_counter = 0;
+
r = amdgpu_ctx_get_stable_pstate(ctx, &current_stable_pstate);
if (r)
return r;
@@ -963,3 +966,35 @@ void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr,
}
mutex_unlock(&mgr->lock);
}
+
+int amdgpu_get_reset(struct drm_file *filp, struct drm_device *dev,
+ struct drm_get_reset *reset)
+{
+ struct amdgpu_device *adev = drm_to_adev(dev);
+ struct amdgpu_ctx *ctx;
+ struct amdgpu_ctx_mgr *mgr;
+ unsigned int id = reset->ctx_id;
+ struct amdgpu_fpriv *fpriv = filp->driver_priv;
+
+ mgr = &fpriv->ctx_mgr;
+ mutex_lock(&mgr->lock);
+ ctx = idr_find(&mgr->ctx_handles, id);
+ if (!ctx) {
+ mutex_unlock(&mgr->lock);
+ return -EINVAL;
+ }
+
+ reset->dev_reset_count =
+ atomic_read(&adev->gpu_reset_counter) - ctx->global_reset_counter;
+
+ reset->ctx_reset_count = ctx->local_reset_counter;
+
+ if (amdgpu_in_reset(adev))
+ reset->flags |= DRM_RESET_IN_PROGRESS;
+
+ if (ctx->vram_lost_counter != atomic_read(&adev->vram_lost_counter))
+ reset->flags |= DRM_RESET_VRAM_LOST;
+
+ mutex_unlock(&mgr->lock);
+ return 0;
+}
@@ -57,6 +57,9 @@ struct amdgpu_ctx {
unsigned long ras_counter_ce;
unsigned long ras_counter_ue;
uint32_t stable_pstate;
+
+ uint64_t global_reset_counter;
+ uint64_t local_reset_counter;
};
struct amdgpu_ctx_mgr {
@@ -97,4 +100,6 @@ void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr);
void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr,
ktime_t usage[AMDGPU_HW_IP_NUM]);
+int amdgpu_get_reset(struct drm_file *file_priv, struct drm_device *dev,
+ struct drm_get_reset *reset);
#endif
@@ -2805,6 +2805,7 @@ static const struct drm_driver amdgpu_kms_driver = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = amdgpu_show_fdinfo,
#endif
+ .get_reset = amdgpu_get_reset,
.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
@@ -35,11 +35,20 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
{
struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
struct amdgpu_job *job = to_amdgpu_job(s_job);
+ struct drm_sched_entity *entity = job->base.entity;
struct amdgpu_task_info ti;
struct amdgpu_device *adev = ring->adev;
int idx;
int r;
+ memset(&ti, 0, sizeof(struct amdgpu_task_info));
+ amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
+
+ if (job->ctx) {
+ DRM_INFO("Increasing ctx reset count for %s (%d)\n", ti.process_name, ti.pid);
+ job->ctx->local_reset_counter++;
+ }
+
if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
__func__, s_job->sched->name);
@@ -48,7 +57,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
return DRM_GPU_SCHED_STAT_ENODEV;
}
- memset(&ti, 0, sizeof(struct amdgpu_task_info));
adev->job_hang = true;
if (amdgpu_gpu_recovery &&
@@ -58,7 +66,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
goto exit;
}
- amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
ring->fence_drv.sync_seq);
@@ -105,6 +112,7 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
*/
(*job)->base.sched = &adev->rings[0]->sched;
(*job)->vm = vm;
+ (*job)->ctx = NULL;
amdgpu_sync_create(&(*job)->explicit_sync);
(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
@@ -63,6 +63,8 @@ struct amdgpu_job {
uint32_t oa_base, oa_size;
uint32_t vram_lost_counter;
+ struct amdgpu_ctx *ctx;
+
/* user fence handling */
uint64_t uf_addr;
uint64_t uf_sequence;