public inbox for drm-ai-reviews@public-inbox.freedesktop.org
 help / color / mirror / Atom feed
* [PATCH] drm/panthor: Fix the "done_fence is initialized" detection logic
@ 2026-03-09 10:30 Boris Brezillon
  2026-03-09 10:50 ` Christian König
                   ` (4 more replies)
  0 siblings, 5 replies; 12+ messages in thread
From: Boris Brezillon @ 2026-03-09 10:30 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Adrián Larumbe
  Cc: dri-devel, kernel, Nicolas Frattaroli, Tvrtko Ursulin,
	Philipp Stanner, Christian König

After commit 541c8f2468b9 ("dma-buf: detach fence ops on signal v3"),
dma_fence::ops == NULL can't be used to check if the fence is initialized
or not. We could turn this into an "is_signaled() || ops == NULL" test,
but that's fragile, since it's still subject to dma_fence internal
changes. So let's have the "is_initialized" state encoded directly in
the pointer through the lowest bit which is guaranteed to be unused
because of the dma_fence alignment constraint.

Cc: Nicolas Frattaroli <nicolas.frattaroli@collabora.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
Cc: Philipp Stanner <phasta@kernel.org>
Cc: Christian König <christian.koenig@amd.com>
Reported-by: Steven Price <steven.price@arm.com>
Reported-by: Nicolas Frattaroli <nicolas.frattaroli@collabora.com>
Fixes: 541c8f2468b9 ("dma-buf: detach fence ops on signal v3")
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
 drivers/gpu/drm/panthor/panthor_sched.c | 69 ++++++++++++++++++-------
 1 file changed, 50 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index bd703a2904a1..31589add86f5 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -835,8 +835,15 @@ struct panthor_job {
 	 */
 	struct list_head node;
 
-	/** @done_fence: Fence signaled when the job is finished or cancelled. */
-	struct dma_fence *done_fence;
+	/**
+	 * @done_fence: Fence signaled when the job is finished or cancelled.
+	 *
+	 * This is a uintptr_t because we use the lower bit to encode whether
+	 * the fence has been initialized or not, and we don't want code to dereference
+	 * this field directly (job_done_fence()/job_done_fence_initialized() should be used
+	 * instead).
+	 */
+	uintptr_t done_fence;
 
 	/** @profiling: Job profiling information. */
 	struct {
@@ -1518,6 +1525,18 @@ cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
 		 info);
 }
 
+#define DONE_FENCE_INITIALIZED BIT(0)
+
+static struct dma_fence *job_done_fence(struct panthor_job *job)
+{
+	return (void *)(job->done_fence & ~(uintptr_t)DONE_FENCE_INITIALIZED);
+}
+
+static bool job_done_fence_initialized(struct panthor_job *job)
+{
+	return job->done_fence & DONE_FENCE_INITIALIZED;
+}
+
 static void
 cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
 				   u32 csg_id, u32 cs_id)
@@ -1549,7 +1568,7 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
 			if (cs_extract < job->ringbuf.start)
 				break;
 
-			dma_fence_set_error(job->done_fence, -EINVAL);
+			dma_fence_set_error(job_done_fence(job), -EINVAL);
 		}
 		spin_unlock(&queue->fence_ctx.lock);
 	}
@@ -2182,9 +2201,11 @@ group_term_post_processing(struct panthor_group *group)
 
 		spin_lock(&queue->fence_ctx.lock);
 		list_for_each_entry_safe(job, tmp, &queue->fence_ctx.in_flight_jobs, node) {
+			struct dma_fence *done_fence = job_done_fence(job);
+
 			list_move_tail(&job->node, &faulty_jobs);
-			dma_fence_set_error(job->done_fence, err);
-			dma_fence_signal_locked(job->done_fence);
+			dma_fence_set_error(done_fence, err);
+			dma_fence_signal_locked(done_fence);
 		}
 		spin_unlock(&queue->fence_ctx.lock);
 
@@ -2734,7 +2755,7 @@ static void queue_start(struct panthor_queue *queue)
 
 	/* Re-assign the parent fences. */
 	list_for_each_entry(job, &queue->scheduler.pending_list, base.list)
-		job->base.s_fence->parent = dma_fence_get(job->done_fence);
+		job->base.s_fence->parent = dma_fence_get(job_done_fence(job));
 
 	enable_delayed_work(&queue->timeout.work);
 	drm_sched_start(&queue->scheduler, 0);
@@ -3047,6 +3068,8 @@ static bool queue_check_job_completion(struct panthor_queue *queue)
 	cookie = dma_fence_begin_signalling();
 	spin_lock(&queue->fence_ctx.lock);
 	list_for_each_entry_safe(job, job_tmp, &queue->fence_ctx.in_flight_jobs, node) {
+		struct dma_fence *done_fence = job_done_fence(job);
+
 		if (!syncobj) {
 			struct panthor_group *group = job->group;
 
@@ -3054,11 +3077,11 @@ static bool queue_check_job_completion(struct panthor_queue *queue)
 				  (job->queue_idx * sizeof(*syncobj));
 		}
 
-		if (syncobj->seqno < job->done_fence->seqno)
+		if (syncobj->seqno < done_fence->seqno)
 			break;
 
 		list_move_tail(&job->node, &done_jobs);
-		dma_fence_signal_locked(job->done_fence);
+		dma_fence_signal_locked(done_fence);
 	}
 
 	if (list_empty(&queue->fence_ctx.in_flight_jobs)) {
@@ -3309,8 +3332,10 @@ queue_run_job(struct drm_sched_job *sched_job)
 	 * drm_sched_job::s_fence::finished fence.
 	 */
 	if (!job->call_info.size) {
-		job->done_fence = dma_fence_get(queue->fence_ctx.last_fence);
-		return dma_fence_get(job->done_fence);
+		done_fence = dma_fence_get(queue->fence_ctx.last_fence);
+
+		job->done_fence = (uintptr_t)done_fence | DONE_FENCE_INITIALIZED;
+		return dma_fence_get(done_fence);
 	}
 
 	ret = panthor_device_resume_and_get(ptdev);
@@ -3323,11 +3348,13 @@ queue_run_job(struct drm_sched_job *sched_job)
 		goto out_unlock;
 	}
 
-	dma_fence_init(job->done_fence,
+	done_fence = job_done_fence(job);
+	dma_fence_init(done_fence,
 		       &panthor_queue_fence_ops,
 		       &queue->fence_ctx.lock,
 		       queue->fence_ctx.id,
 		       atomic64_inc_return(&queue->fence_ctx.seqno));
+	job->done_fence |= DONE_FENCE_INITIALIZED;
 
 	job->profiling.slot = queue->profiling.seqno++;
 	if (queue->profiling.seqno == queue->profiling.slot_count)
@@ -3381,9 +3408,9 @@ queue_run_job(struct drm_sched_job *sched_job)
 
 	/* Update the last fence. */
 	dma_fence_put(queue->fence_ctx.last_fence);
-	queue->fence_ctx.last_fence = dma_fence_get(job->done_fence);
+	queue->fence_ctx.last_fence = dma_fence_get(done_fence);
 
-	done_fence = dma_fence_get(job->done_fence);
+	done_fence = dma_fence_get(done_fence);
 
 out_unlock:
 	mutex_unlock(&sched->lock);
@@ -3403,7 +3430,7 @@ queue_timedout_job(struct drm_sched_job *sched_job)
 	struct panthor_queue *queue = group->queues[job->queue_idx];
 
 	drm_warn(&ptdev->base, "job timeout: pid=%d, comm=%s, seqno=%llu\n",
-		 group->task_info.pid, group->task_info.comm, job->done_fence->seqno);
+		 group->task_info.pid, group->task_info.comm, job_done_fence(job)->seqno);
 
 	drm_WARN_ON(&ptdev->base, atomic_read(&sched->reset.in_progress));
 
@@ -3915,10 +3942,10 @@ static void job_release(struct kref *ref)
 	if (job->base.s_fence)
 		drm_sched_job_cleanup(&job->base);
 
-	if (job->done_fence && job->done_fence->ops)
-		dma_fence_put(job->done_fence);
+	if (job_done_fence_initialized(job))
+		dma_fence_put(job_done_fence(job));
 	else
-		dma_fence_free(job->done_fence);
+		dma_fence_free(job_done_fence(job));
 
 	group_put(job->group);
 
@@ -4011,11 +4038,15 @@ panthor_job_create(struct panthor_file *pfile,
 	 * the previously submitted job.
 	 */
 	if (job->call_info.size) {
-		job->done_fence = kzalloc_obj(*job->done_fence);
-		if (!job->done_fence) {
+		struct dma_fence *done_fence;
+
+		done_fence = kzalloc_obj(*done_fence);
+		if (!done_fence) {
 			ret = -ENOMEM;
 			goto err_put_job;
 		}
+
+		job->done_fence = (uintptr_t)done_fence;
 	}
 
 	job->profiling.mask = pfile->ptdev->profile_mask;
-- 
2.53.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH] drm/panthor: Fix the "done_fence is initialized" detection logic
  2026-03-09 10:30 [PATCH] drm/panthor: Fix the "done_fence is initialized" detection logic Boris Brezillon
@ 2026-03-09 10:50 ` Christian König
  2026-03-09 11:06   ` Boris Brezillon
  2026-03-09 11:05 ` Liviu Dudau
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 12+ messages in thread
From: Christian König @ 2026-03-09 10:50 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Adrián Larumbe
  Cc: dri-devel, kernel, Nicolas Frattaroli, Tvrtko Ursulin,
	Philipp Stanner

On 3/9/26 11:30, Boris Brezillon wrote:
> After commit 541c8f2468b9 ("dma-buf: detach fence ops on signal v3"),
> dma_fence::ops == NULL can't be used to check if the fence is initialized
> or not. We could turn this into an "is_signaled() || ops == NULL" test,
> but that's fragile, since it's still subject to dma_fence internal
> changes. So let's have the "is_initialized" state encoded directly in
> the pointer through the lowest bit which is guaranteed to be unused
> because of the dma_fence alignment constraint.

We added the dma_fence_was_initialized() function exactly for this use case and fixed all callers.

But it looks like that occasion here somehow fell through the cracks; no idea why my script for NULL checks of dma_fence->ops didn't trigger here.

Regards,
Christian.

> 
> Cc: Nicolas Frattaroli <nicolas.frattaroli@collabora.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
> Cc: Philipp Stanner <phasta@kernel.org>
> Cc: Christian König <christian.koenig@amd.com>
> Reported-by: Steven Price <steven.price@arm.com>
> Reported-by: Nicolas Frattaroli <nicolas.frattaroli@collabora.com>
> Fixes: 541c8f2468b9 ("dma-buf: detach fence ops on signal v3")
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> ---
>  drivers/gpu/drm/panthor/panthor_sched.c | 69 ++++++++++++++++++-------
>  1 file changed, 50 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index bd703a2904a1..31589add86f5 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -835,8 +835,15 @@ struct panthor_job {
>  	 */
>  	struct list_head node;
>  
> -	/** @done_fence: Fence signaled when the job is finished or cancelled. */
> -	struct dma_fence *done_fence;
> +	/**
> +	 * @done_fence: Fence signaled when the job is finished or cancelled.
> +	 *
> +	 * This is a uintptr_t because we use the lower bit to encode whether
> +	 * the fence has been initialized or not, and we don't want code to dereference
> +	 * this field directly (job_done_fence()/job_done_fence_initialized() should be used
> +	 * instead).
> +	 */
> +	uintptr_t done_fence;
>  
>  	/** @profiling: Job profiling information. */
>  	struct {
> @@ -1518,6 +1525,18 @@ cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
>  		 info);
>  }
>  
> +#define DONE_FENCE_INITIALIZED BIT(0)
> +
> +static struct dma_fence *job_done_fence(struct panthor_job *job)
> +{
> +	return (void *)(job->done_fence & ~(uintptr_t)DONE_FENCE_INITIALIZED);
> +}
> +
> +static bool job_done_fence_initialized(struct panthor_job *job)
> +{
> +	return job->done_fence & DONE_FENCE_INITIALIZED;
> +}
> +
>  static void
>  cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
>  				   u32 csg_id, u32 cs_id)
> @@ -1549,7 +1568,7 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
>  			if (cs_extract < job->ringbuf.start)
>  				break;
>  
> -			dma_fence_set_error(job->done_fence, -EINVAL);
> +			dma_fence_set_error(job_done_fence(job), -EINVAL);
>  		}
>  		spin_unlock(&queue->fence_ctx.lock);
>  	}
> @@ -2182,9 +2201,11 @@ group_term_post_processing(struct panthor_group *group)
>  
>  		spin_lock(&queue->fence_ctx.lock);
>  		list_for_each_entry_safe(job, tmp, &queue->fence_ctx.in_flight_jobs, node) {
> +			struct dma_fence *done_fence = job_done_fence(job);
> +
>  			list_move_tail(&job->node, &faulty_jobs);
> -			dma_fence_set_error(job->done_fence, err);
> -			dma_fence_signal_locked(job->done_fence);
> +			dma_fence_set_error(done_fence, err);
> +			dma_fence_signal_locked(done_fence);
>  		}
>  		spin_unlock(&queue->fence_ctx.lock);
>  
> @@ -2734,7 +2755,7 @@ static void queue_start(struct panthor_queue *queue)
>  
>  	/* Re-assign the parent fences. */
>  	list_for_each_entry(job, &queue->scheduler.pending_list, base.list)
> -		job->base.s_fence->parent = dma_fence_get(job->done_fence);
> +		job->base.s_fence->parent = dma_fence_get(job_done_fence(job));
>  
>  	enable_delayed_work(&queue->timeout.work);
>  	drm_sched_start(&queue->scheduler, 0);
> @@ -3047,6 +3068,8 @@ static bool queue_check_job_completion(struct panthor_queue *queue)
>  	cookie = dma_fence_begin_signalling();
>  	spin_lock(&queue->fence_ctx.lock);
>  	list_for_each_entry_safe(job, job_tmp, &queue->fence_ctx.in_flight_jobs, node) {
> +		struct dma_fence *done_fence = job_done_fence(job);
> +
>  		if (!syncobj) {
>  			struct panthor_group *group = job->group;
>  
> @@ -3054,11 +3077,11 @@ static bool queue_check_job_completion(struct panthor_queue *queue)
>  				  (job->queue_idx * sizeof(*syncobj));
>  		}
>  
> -		if (syncobj->seqno < job->done_fence->seqno)
> +		if (syncobj->seqno < done_fence->seqno)
>  			break;
>  
>  		list_move_tail(&job->node, &done_jobs);
> -		dma_fence_signal_locked(job->done_fence);
> +		dma_fence_signal_locked(done_fence);
>  	}
>  
>  	if (list_empty(&queue->fence_ctx.in_flight_jobs)) {
> @@ -3309,8 +3332,10 @@ queue_run_job(struct drm_sched_job *sched_job)
>  	 * drm_sched_job::s_fence::finished fence.
>  	 */
>  	if (!job->call_info.size) {
> -		job->done_fence = dma_fence_get(queue->fence_ctx.last_fence);
> -		return dma_fence_get(job->done_fence);
> +		done_fence = dma_fence_get(queue->fence_ctx.last_fence);
> +
> +		job->done_fence = (uintptr_t)done_fence | DONE_FENCE_INITIALIZED;
> +		return dma_fence_get(done_fence);
>  	}
>  
>  	ret = panthor_device_resume_and_get(ptdev);
> @@ -3323,11 +3348,13 @@ queue_run_job(struct drm_sched_job *sched_job)
>  		goto out_unlock;
>  	}
>  
> -	dma_fence_init(job->done_fence,
> +	done_fence = job_done_fence(job);
> +	dma_fence_init(done_fence,
>  		       &panthor_queue_fence_ops,
>  		       &queue->fence_ctx.lock,
>  		       queue->fence_ctx.id,
>  		       atomic64_inc_return(&queue->fence_ctx.seqno));
> +	job->done_fence |= DONE_FENCE_INITIALIZED;
>  
>  	job->profiling.slot = queue->profiling.seqno++;
>  	if (queue->profiling.seqno == queue->profiling.slot_count)
> @@ -3381,9 +3408,9 @@ queue_run_job(struct drm_sched_job *sched_job)
>  
>  	/* Update the last fence. */
>  	dma_fence_put(queue->fence_ctx.last_fence);
> -	queue->fence_ctx.last_fence = dma_fence_get(job->done_fence);
> +	queue->fence_ctx.last_fence = dma_fence_get(done_fence);
>  
> -	done_fence = dma_fence_get(job->done_fence);
> +	done_fence = dma_fence_get(done_fence);
>  
>  out_unlock:
>  	mutex_unlock(&sched->lock);
> @@ -3403,7 +3430,7 @@ queue_timedout_job(struct drm_sched_job *sched_job)
>  	struct panthor_queue *queue = group->queues[job->queue_idx];
>  
>  	drm_warn(&ptdev->base, "job timeout: pid=%d, comm=%s, seqno=%llu\n",
> -		 group->task_info.pid, group->task_info.comm, job->done_fence->seqno);
> +		 group->task_info.pid, group->task_info.comm, job_done_fence(job)->seqno);
>  
>  	drm_WARN_ON(&ptdev->base, atomic_read(&sched->reset.in_progress));
>  
> @@ -3915,10 +3942,10 @@ static void job_release(struct kref *ref)
>  	if (job->base.s_fence)
>  		drm_sched_job_cleanup(&job->base);
>  
> -	if (job->done_fence && job->done_fence->ops)
> -		dma_fence_put(job->done_fence);
> +	if (job_done_fence_initialized(job))
> +		dma_fence_put(job_done_fence(job));
>  	else
> -		dma_fence_free(job->done_fence);
> +		dma_fence_free(job_done_fence(job));
>  
>  	group_put(job->group);
>  
> @@ -4011,11 +4038,15 @@ panthor_job_create(struct panthor_file *pfile,
>  	 * the previously submitted job.
>  	 */
>  	if (job->call_info.size) {
> -		job->done_fence = kzalloc_obj(*job->done_fence);
> -		if (!job->done_fence) {
> +		struct dma_fence *done_fence;
> +
> +		done_fence = kzalloc_obj(*done_fence);
> +		if (!done_fence) {
>  			ret = -ENOMEM;
>  			goto err_put_job;
>  		}
> +
> +		job->done_fence = (uintptr_t)done_fence;
>  	}
>  
>  	job->profiling.mask = pfile->ptdev->profile_mask;


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] drm/panthor: Fix the "done_fence is initialized" detection logic
  2026-03-09 10:30 [PATCH] drm/panthor: Fix the "done_fence is initialized" detection logic Boris Brezillon
  2026-03-09 10:50 ` Christian König
@ 2026-03-09 11:05 ` Liviu Dudau
  2026-03-09 13:15   ` Boris Brezillon
  2026-03-09 11:06 ` Nicolas Frattaroli
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 12+ messages in thread
From: Liviu Dudau @ 2026-03-09 11:05 UTC (permalink / raw)
  To: Boris Brezillon
  Cc: Steven Price, Adrián Larumbe, dri-devel, kernel,
	Nicolas Frattaroli, Tvrtko Ursulin, Philipp Stanner,
	Christian König

On Mon, Mar 09, 2026 at 11:30:53AM +0100, Boris Brezillon wrote:
> After commit 541c8f2468b9 ("dma-buf: detach fence ops on signal v3"),
> dma_fence::ops == NULL can't be used to check if the fence is initialized
> or not. We could turn this into an "is_signaled() || ops == NULL" test,
> but that's fragile, since it's still subject to dma_fence internal
> changes. So let's have the "is_initialized" state encoded directly in
> the pointer through the lowest bit which is guaranteed to be unused
> because of the dma_fence alignment constraint.

I'm confused! There is only one place where we end up being interested if the
fence has been initialized or not, and that is in job_release(). I don't
see why checking for "ops != NULL" before calling dma_fence_put() should not
be enough, or even better, why don't we call dma_fence_put() regardless,
as the core code should take care of an uninitialized dma_fence AFAICT.

Best regards,
Liviu

> 
> Cc: Nicolas Frattaroli <nicolas.frattaroli@collabora.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
> Cc: Philipp Stanner <phasta@kernel.org>
> Cc: Christian König <christian.koenig@amd.com>
> Reported-by: Steven Price <steven.price@arm.com>
> Reported-by: Nicolas Frattaroli <nicolas.frattaroli@collabora.com>
> Fixes: 541c8f2468b9 ("dma-buf: detach fence ops on signal v3")
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> ---
>  drivers/gpu/drm/panthor/panthor_sched.c | 69 ++++++++++++++++++-------
>  1 file changed, 50 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index bd703a2904a1..31589add86f5 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -835,8 +835,15 @@ struct panthor_job {
>  	 */
>  	struct list_head node;
>  
> -	/** @done_fence: Fence signaled when the job is finished or cancelled. */
> -	struct dma_fence *done_fence;
> +	/**
> +	 * @done_fence: Fence signaled when the job is finished or cancelled.
> +	 *
> +	 * This is a uintptr_t because we use the lower bit to encode whether
> +	 * the fence has been initialized or not, and we don't want code to dereference
> +	 * this field directly (job_done_fence()/job_done_fence_initialized() should be used
> +	 * instead).
> +	 */
> +	uintptr_t done_fence;
>  
>  	/** @profiling: Job profiling information. */
>  	struct {
> @@ -1518,6 +1525,18 @@ cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
>  		 info);
>  }
>  
> +#define DONE_FENCE_INITIALIZED BIT(0)
> +
> +static struct dma_fence *job_done_fence(struct panthor_job *job)
> +{
> +	return (void *)(job->done_fence & ~(uintptr_t)DONE_FENCE_INITIALIZED);
> +}
> +
> +static bool job_done_fence_initialized(struct panthor_job *job)
> +{
> +	return job->done_fence & DONE_FENCE_INITIALIZED;
> +}
> +
>  static void
>  cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
>  				   u32 csg_id, u32 cs_id)
> @@ -1549,7 +1568,7 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
>  			if (cs_extract < job->ringbuf.start)
>  				break;
>  
> -			dma_fence_set_error(job->done_fence, -EINVAL);
> +			dma_fence_set_error(job_done_fence(job), -EINVAL);
>  		}
>  		spin_unlock(&queue->fence_ctx.lock);
>  	}
> @@ -2182,9 +2201,11 @@ group_term_post_processing(struct panthor_group *group)
>  
>  		spin_lock(&queue->fence_ctx.lock);
>  		list_for_each_entry_safe(job, tmp, &queue->fence_ctx.in_flight_jobs, node) {
> +			struct dma_fence *done_fence = job_done_fence(job);
> +
>  			list_move_tail(&job->node, &faulty_jobs);
> -			dma_fence_set_error(job->done_fence, err);
> -			dma_fence_signal_locked(job->done_fence);
> +			dma_fence_set_error(done_fence, err);
> +			dma_fence_signal_locked(done_fence);
>  		}
>  		spin_unlock(&queue->fence_ctx.lock);
>  
> @@ -2734,7 +2755,7 @@ static void queue_start(struct panthor_queue *queue)
>  
>  	/* Re-assign the parent fences. */
>  	list_for_each_entry(job, &queue->scheduler.pending_list, base.list)
> -		job->base.s_fence->parent = dma_fence_get(job->done_fence);
> +		job->base.s_fence->parent = dma_fence_get(job_done_fence(job));
>  
>  	enable_delayed_work(&queue->timeout.work);
>  	drm_sched_start(&queue->scheduler, 0);
> @@ -3047,6 +3068,8 @@ static bool queue_check_job_completion(struct panthor_queue *queue)
>  	cookie = dma_fence_begin_signalling();
>  	spin_lock(&queue->fence_ctx.lock);
>  	list_for_each_entry_safe(job, job_tmp, &queue->fence_ctx.in_flight_jobs, node) {
> +		struct dma_fence *done_fence = job_done_fence(job);
> +
>  		if (!syncobj) {
>  			struct panthor_group *group = job->group;
>  
> @@ -3054,11 +3077,11 @@ static bool queue_check_job_completion(struct panthor_queue *queue)
>  				  (job->queue_idx * sizeof(*syncobj));
>  		}
>  
> -		if (syncobj->seqno < job->done_fence->seqno)
> +		if (syncobj->seqno < done_fence->seqno)
>  			break;
>  
>  		list_move_tail(&job->node, &done_jobs);
> -		dma_fence_signal_locked(job->done_fence);
> +		dma_fence_signal_locked(done_fence);
>  	}
>  
>  	if (list_empty(&queue->fence_ctx.in_flight_jobs)) {
> @@ -3309,8 +3332,10 @@ queue_run_job(struct drm_sched_job *sched_job)
>  	 * drm_sched_job::s_fence::finished fence.
>  	 */
>  	if (!job->call_info.size) {
> -		job->done_fence = dma_fence_get(queue->fence_ctx.last_fence);
> -		return dma_fence_get(job->done_fence);
> +		done_fence = dma_fence_get(queue->fence_ctx.last_fence);
> +
> +		job->done_fence = (uintptr_t)done_fence | DONE_FENCE_INITIALIZED;
> +		return dma_fence_get(done_fence);
>  	}
>  
>  	ret = panthor_device_resume_and_get(ptdev);
> @@ -3323,11 +3348,13 @@ queue_run_job(struct drm_sched_job *sched_job)
>  		goto out_unlock;
>  	}
>  
> -	dma_fence_init(job->done_fence,
> +	done_fence = job_done_fence(job);
> +	dma_fence_init(done_fence,
>  		       &panthor_queue_fence_ops,
>  		       &queue->fence_ctx.lock,
>  		       queue->fence_ctx.id,
>  		       atomic64_inc_return(&queue->fence_ctx.seqno));
> +	job->done_fence |= DONE_FENCE_INITIALIZED;
>  
>  	job->profiling.slot = queue->profiling.seqno++;
>  	if (queue->profiling.seqno == queue->profiling.slot_count)
> @@ -3381,9 +3408,9 @@ queue_run_job(struct drm_sched_job *sched_job)
>  
>  	/* Update the last fence. */
>  	dma_fence_put(queue->fence_ctx.last_fence);
> -	queue->fence_ctx.last_fence = dma_fence_get(job->done_fence);
> +	queue->fence_ctx.last_fence = dma_fence_get(done_fence);
>  
> -	done_fence = dma_fence_get(job->done_fence);
> +	done_fence = dma_fence_get(done_fence);
>  
>  out_unlock:
>  	mutex_unlock(&sched->lock);
> @@ -3403,7 +3430,7 @@ queue_timedout_job(struct drm_sched_job *sched_job)
>  	struct panthor_queue *queue = group->queues[job->queue_idx];
>  
>  	drm_warn(&ptdev->base, "job timeout: pid=%d, comm=%s, seqno=%llu\n",
> -		 group->task_info.pid, group->task_info.comm, job->done_fence->seqno);
> +		 group->task_info.pid, group->task_info.comm, job_done_fence(job)->seqno);
>  
>  	drm_WARN_ON(&ptdev->base, atomic_read(&sched->reset.in_progress));
>  
> @@ -3915,10 +3942,10 @@ static void job_release(struct kref *ref)
>  	if (job->base.s_fence)
>  		drm_sched_job_cleanup(&job->base);
>  
> -	if (job->done_fence && job->done_fence->ops)
> -		dma_fence_put(job->done_fence);
> +	if (job_done_fence_initialized(job))
> +		dma_fence_put(job_done_fence(job));
>  	else
> -		dma_fence_free(job->done_fence);
> +		dma_fence_free(job_done_fence(job));
>  
>  	group_put(job->group);
>  
> @@ -4011,11 +4038,15 @@ panthor_job_create(struct panthor_file *pfile,
>  	 * the previously submitted job.
>  	 */
>  	if (job->call_info.size) {
> -		job->done_fence = kzalloc_obj(*job->done_fence);
> -		if (!job->done_fence) {
> +		struct dma_fence *done_fence;
> +
> +		done_fence = kzalloc_obj(*done_fence);
> +		if (!done_fence) {
>  			ret = -ENOMEM;
>  			goto err_put_job;
>  		}
> +
> +		job->done_fence = (uintptr_t)done_fence;
>  	}
>  
>  	job->profiling.mask = pfile->ptdev->profile_mask;
> -- 
> 2.53.0
> 

-- 
====================
| I would like to |
| fix the world,  |
| but they're not |
| giving me the   |
 \ source code!  /
  ---------------
    ¯\_(ツ)_/¯

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] drm/panthor: Fix the "done_fence is initialized" detection logic
  2026-03-09 10:30 [PATCH] drm/panthor: Fix the "done_fence is initialized" detection logic Boris Brezillon
  2026-03-09 10:50 ` Christian König
  2026-03-09 11:05 ` Liviu Dudau
@ 2026-03-09 11:06 ` Nicolas Frattaroli
  2026-03-10  2:25 ` Claude review: " Claude Code Review Bot
  2026-03-10  2:25 ` Claude Code Review Bot
  4 siblings, 0 replies; 12+ messages in thread
From: Nicolas Frattaroli @ 2026-03-09 11:06 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Adrián Larumbe,
	Boris Brezillon
  Cc: dri-devel, kernel, Tvrtko Ursulin, Philipp Stanner,
	Christian König

On Monday, 9 March 2026 11:30:53 Central European Standard Time Boris Brezillon wrote:
> After commit 541c8f2468b9 ("dma-buf: detach fence ops on signal v3"),
> dma_fence::ops == NULL can't be used to check if the fence is initialized
> or not. We could turn this into an "is_signaled() || ops == NULL" test,
> but that's fragile, since it's still subject to dma_fence internal
> changes. So let's have the "is_initialized" state encoded directly in
> the pointer through the lowest bit which is guaranteed to be unused
> because of the dma_fence alignment constraint.
> 
> Cc: Nicolas Frattaroli <nicolas.frattaroli@collabora.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
> Cc: Philipp Stanner <phasta@kernel.org>
> Cc: Christian König <christian.koenig@amd.com>
> Reported-by: Steven Price <steven.price@arm.com>
> Reported-by: Nicolas Frattaroli <nicolas.frattaroli@collabora.com>
> Fixes: 541c8f2468b9 ("dma-buf: detach fence ops on signal v3")
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> ---
>  drivers/gpu/drm/panthor/panthor_sched.c | 69 ++++++++++++++++++-------
>  1 file changed, 50 insertions(+), 19 deletions(-)

Tested-by: Nicolas Frattaroli <nicolas.frattaroli@collabora.com>

This fixes a refcount underflow for me when I do something that spams
a bunch of fatal exceptions and then terminate the userspace
application that produces them.



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] drm/panthor: Fix the "done_fence is initialized" detection logic
  2026-03-09 10:50 ` Christian König
@ 2026-03-09 11:06   ` Boris Brezillon
  0 siblings, 0 replies; 12+ messages in thread
From: Boris Brezillon @ 2026-03-09 11:06 UTC (permalink / raw)
  To: Christian König
  Cc: Steven Price, Liviu Dudau, Adrián Larumbe, dri-devel, kernel,
	Nicolas Frattaroli, Tvrtko Ursulin, Philipp Stanner

On Mon, 9 Mar 2026 11:50:16 +0100
Christian König <christian.koenig@amd.com> wrote:

> On 3/9/26 11:30, Boris Brezillon wrote:
> > After commit 541c8f2468b9 ("dma-buf: detach fence ops on signal
> > v3"), dma_fence::ops == NULL can't be used to check if the fence is
> > initialized or not. We could turn this into an "is_signaled() ||
> > ops == NULL" test, but that's fragile, since it's still subject to
> > dma_fence internal changes. So let's have the "is_initialized"
> > state encoded directly in the pointer through the lowest bit which
> > is guaranteed to be unused because of the dma_fence alignment
> > constraint.  
> 
> We added the dma_fence_was_initialized() function exactly for this
> use case and fixed all callers.

Oh, I somehow missed that helper. Will respin. Thanks!

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] drm/panthor: Fix the "done_fence is initialized" detection logic
  2026-03-09 11:05 ` Liviu Dudau
@ 2026-03-09 13:15   ` Boris Brezillon
  2026-03-09 14:54     ` Liviu Dudau
  0 siblings, 1 reply; 12+ messages in thread
From: Boris Brezillon @ 2026-03-09 13:15 UTC (permalink / raw)
  To: Liviu Dudau
  Cc: Steven Price, Adrián Larumbe, dri-devel, kernel,
	Nicolas Frattaroli, Tvrtko Ursulin, Philipp Stanner,
	Christian König

On Mon, 9 Mar 2026 11:05:06 +0000
Liviu Dudau <liviu.dudau@arm.com> wrote:

> > After commit 541c8f2468b9 ("dma-buf: detach fence ops on signal v3"),
> > dma_fence::ops == NULL can't be used to check if the fence is initialized
> > or not. We could turn this into an "is_signaled() || ops == NULL" test,
> > but that's fragile, since it's still subject to dma_fence internal
> > changes. So let's have the "is_initialized" state encoded directly in
> > the pointer through the lowest bit which is guaranteed to be unused
> > because of the dma_fence alignment constraint.  
> 
> I'm confused! There is only one place where we end up being interested if the
> fence has been initialized or not, and that is in job_release(). I don't
> see why checking for "ops != NULL" before calling dma_fence_put() should not
> be enough,

Because after 541c8f2468b9 ("dma-buf: detach fence ops on signal v3"),
dma_fence->ops is set back to NULL at signal time[1].

> or even better, why don't we call dma_fence_put() regardless,
> as the core code should take care of an uninitialized dma_fence AFAICT.

When the job is created, we pre-allocate the done_fence, but we leave it
uninitialized until ::run_job() is called. If we call
dma_fence_release() (through dma_fence_put()) on a dma_fence that was
not dma_fence_init()-ialized, we have a NULL deref on the cb_list, and
probably other issues too.

[1]https://gitlab.freedesktop.org/drm/misc/kernel/-/blob/drm-misc-next/drivers/dma-buf/dma-fence.c?ref_type=heads#L373

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] drm/panthor: Fix the "done_fence is initialized" detection logic
  2026-03-09 13:15   ` Boris Brezillon
@ 2026-03-09 14:54     ` Liviu Dudau
  2026-03-09 15:32       ` Boris Brezillon
  0 siblings, 1 reply; 12+ messages in thread
From: Liviu Dudau @ 2026-03-09 14:54 UTC (permalink / raw)
  To: Boris Brezillon
  Cc: Steven Price, Adrián Larumbe, dri-devel, kernel,
	Nicolas Frattaroli, Tvrtko Ursulin, Philipp Stanner,
	Christian König

On Mon, Mar 09, 2026 at 02:15:49PM +0100, Boris Brezillon wrote:
> On Mon, 9 Mar 2026 11:05:06 +0000
> Liviu Dudau <liviu.dudau@arm.com> wrote:
> 
> > > After commit 541c8f2468b9 ("dma-buf: detach fence ops on signal v3"),
> > > dma_fence::ops == NULL can't be used to check if the fence is initialized
> > > or not. We could turn this into an "is_signaled() || ops == NULL" test,
> > > but that's fragile, since it's still subject to dma_fence internal
> > > changes. So let's have the "is_initialized" state encoded directly in
> > > the pointer through the lowest bit which is guaranteed to be unused
> > > because of the dma_fence alignment constraint.  
> > 
> > I'm confused! There is only one place where we end up being interested if the
> > fence has been initialized or not, and that is in job_release(). I don't
> > see why checking for "ops != NULL" before calling dma_fence_put() should not
> > be enough,
> 
> Because after 541c8f2468b9 ("dma-buf: detach fence ops on signal v3"),
> dma_fence->ops is set back to NULL at signal time[1].

Yes, I gathered that. What I meant to say was that I don't understand why we need
all this infrastructure just for one check. Meanwhile Christian pointed out that
a simpler solution already exists.

> 
> > or even better, why don't we call dma_fence_put() regardless,
> > as the core code should take care of an uninitialized dma_fence AFAICT.
> 
> When the job is created, we pre-allocate the done_fence, but we leave it
> uninitialized until ::run_job() is called. If we call
> dma_fence_release() (through dma_fence_put()) on a dma_fence that was
> not dma_fence_init()-ialized, we have a NULL deref on the cb_list, and
> probably other issues too.

I don't see the benefit of not initializing the done_fence until we ::run_job()
but I might have missed something obvious. If we want to keep that, maybe we
should not be dropping the reference in job_release() but when we
signal the fence. But that would leak the memory of the uninitialized done_fence.

Best regards,
Liviu

> 
> [1]https://gitlab.freedesktop.org/drm/misc/kernel/-/blob/drm-misc-next/drivers/dma-buf/dma-fence.c?ref_type=heads#L373

-- 
====================
| I would like to |
| fix the world,  |
| but they're not |
| giving me the   |
 \ source code!  /
  ---------------
    ¯\_(ツ)_/¯

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] drm/panthor: Fix the "done_fence is initialized" detection logic
  2026-03-09 14:54     ` Liviu Dudau
@ 2026-03-09 15:32       ` Boris Brezillon
  0 siblings, 0 replies; 12+ messages in thread
From: Boris Brezillon @ 2026-03-09 15:32 UTC (permalink / raw)
  To: Liviu Dudau
  Cc: Steven Price, Adrián Larumbe, dri-devel, kernel,
	Nicolas Frattaroli, Tvrtko Ursulin, Philipp Stanner,
	Christian König

On Mon, 9 Mar 2026 14:54:21 +0000
Liviu Dudau <liviu.dudau@arm.com> wrote:

> On Mon, Mar 09, 2026 at 02:15:49PM +0100, Boris Brezillon wrote:
> > On Mon, 9 Mar 2026 11:05:06 +0000
> > Liviu Dudau <liviu.dudau@arm.com> wrote:
> >   
> > > > After commit 541c8f2468b9 ("dma-buf: detach fence ops on signal v3"),
> > > > dma_fence::ops == NULL can't be used to check if the fence is initialized
> > > > or not. We could turn this into an "is_signaled() || ops == NULL" test,
> > > > but that's fragile, since it's still subject to dma_fence internal
> > > > changes. So let's have the "is_initialized" state encoded directly in
> > > > the pointer through the lowest bit which is guaranteed to be unused
> > > > because of the dma_fence alignment constraint.    
> > > 
> > > I'm confused! There is only one place where we end up being interested if the
> > > fence has been initialized or not, and that is in job_release(). I don't
> > > see why checking for "ops != NULL" before calling dma_fence_put() should not
> > > be enough,  
> > 
> > Because after 541c8f2468b9 ("dma-buf: detach fence ops on signal v3"),
> > dma_fence->ops is set back to NULL at signal time[1].  
> 
> Yes, I gathered that. What I meant to say was that I don't understand why we need
> all this infrastructure just for one check. Meanwhile Christian pointed out that
> a simpler solution already exists.
> 
> >   
> > > or even better, why don't we call dma_fence_put() regardless,
> > > as the core code should take care of an uninitialized dma_fence AFAICT.  
> > 
> > When the job is created, we pre-allocate the done_fence, but we leave it
> > uninitialized until ::run_job() is called. If we call
> > dma_fence_release() (through dma_fence_put()) on a dma_fence that was
> > not dma_fence_init()-ialized, we have a NULL deref on the cb_list, and
> > probably other issues too.  
> 
> I don't see the benefit of not initializing the done_fence until we ::run_job()
> but I might have missed something obvious.

It has to do with the way we connect dma_fence::seqno to the CS_SYNC
object seqno. The submission process is a multi-step operation:

for_each_job() // can fail
	1. allocate and initialize resources (including dma_fence and
	   drm_sched_fence objects)
	2. gather deps

for_each_job() // can't fail
	3. arm drm_sched fences
	4. queue jobs
	5. update syncobjs with the drm_sched fences

If anything fails before step3, we rollback all we've done. Now, if we
were initializing the job::dma_fence when we allocate it, we would
consume a seqno on the panthor_queue, and because the execute-job
sequence assumes the seqno increases monotonically (SYNC_ADD(+1)), we
can't leave holes behind, which would happen if we were initializing at
alloc time and something fails half way through the submission process.
There are ways around it, like using SYNC_SET(seqno) instead of
SYNC_ADD(+1), but those changes are more invasive than delaying the
initialization of the ::done_fence object.

> If we want to keep that, maybe we
> should not be dropping the reference in job_release() but when we
> signal the fence.

If we assume that several paths call dma_fence_signal[_locked](), that'd
mean more code and more chances to forget the
dma_fence_put()+done_fence=NULL in case new paths are added. That, or we
need a panthor_job_signal_done_fence() wrapper.

> But that would leak the memory of the uninitialized done_fence.

Yes, the problem with uninitialized fences remains: as soon as we have
this two-step model where allocation and initialization is split, we
need to deal with both cases in the cleanup path.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Claude review: drm/panthor: Fix the "done_fence is initialized" detection logic
  2026-03-09 12:43 [PATCH v2] " Boris Brezillon
  2026-03-10  2:23 ` Claude review: " Claude Code Review Bot
@ 2026-03-10  2:23 ` Claude Code Review Bot
  1 sibling, 0 replies; 12+ messages in thread
From: Claude Code Review Bot @ 2026-03-10  2:23 UTC (permalink / raw)
  To: dri-devel-reviews

Overall Series Review

Subject: drm/panthor: Fix the "done_fence is initialized" detection logic
Author: Boris Brezillon <boris.brezillon@collabora.com>
Patches: 5
Reviewed: 2026-03-10T12:23:45.443064

---

This is a single-patch fix for the panthor GPU scheduler, addressing a regression introduced by commit `541c8f2468b9 ("dma-buf: detach fence ops on signal v3")`. That commit made `dma_fence::ops` get set to `NULL` upon fence signaling, which broke panthor's use of `->ops` as a proxy for "was this fence initialized." The fix correctly switches to the dedicated `dma_fence_was_initialized()` helper.

The patch is clean, correct, and well-motivated. **Looks good.**

---
Generated by Claude Code Patch Reviewer

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Claude review: drm/panthor: Fix the "done_fence is initialized" detection logic
  2026-03-09 12:43 [PATCH v2] " Boris Brezillon
@ 2026-03-10  2:23 ` Claude Code Review Bot
  2026-03-10  2:23 ` Claude Code Review Bot
  1 sibling, 0 replies; 12+ messages in thread
From: Claude Code Review Bot @ 2026-03-10  2:23 UTC (permalink / raw)
  To: dri-devel-reviews

Patch Review

**Correctness**: The fix is correct. The old code:

```c
if (job->done_fence && job->done_fence->ops)
```

relied on `->ops` being non-NULL to distinguish an initialized (refcounted) fence from a merely allocated (kzalloc'd) one. After the upstream dma-buf change that clears `->ops` on signal, a signaled-but-initialized fence would incorrectly fall through to `dma_fence_free()` instead of `dma_fence_put()`, potentially causing a use-after-free or double-free.

The replacement:

```c
if (dma_fence_was_initialized(job->done_fence))
```

uses `dma_fence_was_initialized()` which checks `DMA_FENCE_FLAG_INITIALIZED_BIT` in `fence->flags` — a stable flag that survives signaling. It also handles the `fence == NULL` case internally (line 295 of `dma-fence.h`: `return fence && test_bit(...)`), so the NULL check that was previously part of the condition is preserved.

**Commit message**: Clear and well-structured. The `Fixes:` tag, `Reported-by:` tags, and v2 changelog are all present and correct.

**No issues found.** This is a straightforward, correct bug fix.

**Reviewed-by worthy**: Yes. The patch is minimal, well-targeted, and correctly uses the API designed for exactly this purpose.

---
Generated by Claude Code Patch Reviewer

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Claude review: drm/panthor: Fix the "done_fence is initialized" detection logic
  2026-03-09 10:30 [PATCH] drm/panthor: Fix the "done_fence is initialized" detection logic Boris Brezillon
                   ` (2 preceding siblings ...)
  2026-03-09 11:06 ` Nicolas Frattaroli
@ 2026-03-10  2:25 ` Claude Code Review Bot
  2026-03-10  2:25 ` Claude Code Review Bot
  4 siblings, 0 replies; 12+ messages in thread
From: Claude Code Review Bot @ 2026-03-10  2:25 UTC (permalink / raw)
  To: dri-devel-reviews

Overall Series Review

Subject: drm/panthor: Fix the "done_fence is initialized" detection logic
Author: Boris Brezillon <boris.brezillon@collabora.com>
Patches: 8
Reviewed: 2026-03-10T12:25:51.999443

---

This is a single-patch fix for the panthor GPU scheduler addressing a regression introduced by commit 541c8f2468b9 ("dma-buf: detach fence ops on signal v3"). That dma-buf change nullifies `dma_fence::ops` on signal, which broke panthor's `job_release()` logic that relied on `ops != NULL` to determine whether a fence was initialized (via `dma_fence_init()`) vs merely allocated (via `kzalloc`).

The approach — encoding an "initialized" flag in the low bit of the pointer — is a well-known kernel pattern and is sound given `dma_fence` alignment guarantees. The patch is generally well-structured, but I have concerns about correctness in certain paths.

**Verdict:** The fix addresses a real bug and the approach is reasonable, but there are issues that should be addressed before merging.

---
Generated by Claude Code Patch Reviewer

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Claude review: drm/panthor: Fix the "done_fence is initialized" detection logic
  2026-03-09 10:30 [PATCH] drm/panthor: Fix the "done_fence is initialized" detection logic Boris Brezillon
                   ` (3 preceding siblings ...)
  2026-03-10  2:25 ` Claude review: " Claude Code Review Bot
@ 2026-03-10  2:25 ` Claude Code Review Bot
  4 siblings, 0 replies; 12+ messages in thread
From: Claude Code Review Bot @ 2026-03-10  2:25 UTC (permalink / raw)
  To: dri-devel-reviews

Patch Review

**Positive aspects:**
- Clean encapsulation via `job_done_fence()` and `job_done_fence_initialized()` helpers
- Good use of `uintptr_t` to prevent accidental direct dereference
- The comment on the struct field explaining the encoding is helpful

**Issue 1 (potential bug): `job_release` calls `dma_fence_free()` on uninitialized NULL pointer**

In the patched `job_release()`:
```c
if (job_done_fence_initialized(job))
    dma_fence_put(job_done_fence(job));
else
    dma_fence_free(job_done_fence(job));
```

When `job->call_info.size == 0`, no `dma_fence` is allocated in `panthor_job_create()`, so `job->done_fence` is 0. If the job never reaches `queue_run_job()` (e.g., it's cleaned up early), then `job_done_fence_initialized()` is false, and `dma_fence_free(NULL)` is called. Looking at `dma_fence_free()`, it calls `kfree(rcu_to_ptr(fence))` — calling this with NULL should be safe since `kfree(NULL)` is a no-op, but only if `dma_fence_free` handles NULL gracefully. The original code had the same pattern (`dma_fence_free(NULL)`), so this isn't a regression, but it's worth noting.

**Issue 2 (correctness concern): Missing initialized bit in the `!job->call_info.size` early-return path**

In `queue_run_job()`, the `!job->call_info.size` path:
```c
if (!job->call_info.size) {
    done_fence = dma_fence_get(queue->fence_ctx.last_fence);

    job->done_fence = (uintptr_t)done_fence | DONE_FENCE_INITIALIZED;
    return dma_fence_get(done_fence);
}
```

This correctly sets `DONE_FENCE_INITIALIZED`. However, `queue->fence_ctx.last_fence` could theoretically be NULL on the very first submission to a queue. If `last_fence` is NULL, then `dma_fence_get(NULL)` returns NULL, and `job->done_fence` becomes `0 | 1 = 1`. Then `job_done_fence(job)` returns `(void *)(1 & ~1) = NULL`, but `job_done_fence_initialized()` returns true, so `job_release` would call `dma_fence_put(NULL)` — a harmless no-op since `dma_fence_put()` checks for NULL before `kref_put()`, but the inconsistent "initialized bit set with a NULL pointer" state is still worth noting. The original code had the same latent issue, so this is not a regression from this patch.

**Issue 3 (style): The `done_fence` variable initialization in `queue_run_job()`**

In the main path of `queue_run_job()`:
```c
done_fence = job_done_fence(job);
dma_fence_init(done_fence,
               &panthor_queue_fence_ops,
               &queue->fence_ctx.lock,
               queue->fence_ctx.id,
               atomic64_inc_return(&queue->fence_ctx.seqno));
job->done_fence |= DONE_FENCE_INITIALIZED;
```

The `done_fence` local variable was declared at the top of the function but the original code didn't assign it until later. In the patched version it's now assigned via `job_done_fence(job)` (extracting the raw pointer from `kzalloc` allocation in `panthor_job_create`). This is correct since the allocation path in `panthor_job_create` stores the pointer without the initialized bit:
```c
job->done_fence = (uintptr_t)done_fence;
```

Then after `dma_fence_init`, the bit is ORed in. The sequencing is correct.

**Issue 4 (minor): `dma_fence_free` in the else branch may be incorrect for the `!call_info.size` case**

After the patch, `job_release` does:
```c
if (job_done_fence_initialized(job))
    dma_fence_put(job_done_fence(job));
else
    dma_fence_free(job_done_fence(job));
```

The `else` branch covers the case where `done_fence` was `kzalloc`'d but never `dma_fence_init`'d (i.e., `queue_run_job` failed before calling `dma_fence_init`). In this case `dma_fence_free` is appropriate since it just does `kfree`. This is correct.

**Issue 5 (robustness): No compile-time assertion on dma_fence alignment**

The commit message states the lowest bit is "guaranteed to be unused because of the dma_fence alignment constraint," but there's no `static_assert` or `BUILD_BUG_ON` to verify this. Adding something like:

```c
BUILD_BUG_ON(__alignof__(struct dma_fence) < 2);
```

near the `DONE_FENCE_INITIALIZED` definition would make this assumption explicit and catch any future changes.

**Issue 6 (double dma_fence_get in out_unlock path):**

In `queue_run_job()`:
```c
queue->fence_ctx.last_fence = dma_fence_get(done_fence);

done_fence = dma_fence_get(done_fence);
```

This `done_fence = dma_fence_get(done_fence)` line is peculiar — it's getting a second reference on the already-local `done_fence` to return to the caller. This existed in the original code too (`done_fence = dma_fence_get(job->done_fence)`), so it's not a regression, but the rewrite makes the self-assignment more visually confusing. A comment would help clarify this is intentional (one ref for `last_fence`, one ref for the return value).

**Overall:** The fix is correct for the stated problem. I'd recommend adding a `BUILD_BUG_ON` for the alignment assumption. The patch is otherwise clean and well-contained.

---
Generated by Claude Code Patch Reviewer

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2026-03-10  2:25 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-03-09 10:30 [PATCH] drm/panthor: Fix the "done_fence is initialized" detection logic Boris Brezillon
2026-03-09 10:50 ` Christian König
2026-03-09 11:06   ` Boris Brezillon
2026-03-09 11:05 ` Liviu Dudau
2026-03-09 13:15   ` Boris Brezillon
2026-03-09 14:54     ` Liviu Dudau
2026-03-09 15:32       ` Boris Brezillon
2026-03-09 11:06 ` Nicolas Frattaroli
2026-03-10  2:25 ` Claude review: " Claude Code Review Bot
2026-03-10  2:25 ` Claude Code Review Bot
  -- strict thread matches above, loose matches on Subject: below --
2026-03-09 12:43 [PATCH v2] " Boris Brezillon
2026-03-10  2:23 ` Claude review: " Claude Code Review Bot
2026-03-10  2:23 ` Claude Code Review Bot

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox