public inbox for drm-ai-reviews@public-inbox.freedesktop.org
 help / color / mirror / Atom feed
From: Mario Limonciello <superm1@kernel.org>
To: Lizhi Hou <lizhi.hou@amd.com>,
	ogabbay@kernel.org, quic_jhugo@quicinc.com,
	dri-devel@lists.freedesktop.org,
	maciej.falkowski@linux.intel.com
Cc: Max Zhen <max.zhen@amd.com>,
	linux-kernel@vger.kernel.org, sonal.santan@amd.com
Subject: Re: [PATCH V1] accel/amdxdna: Improve tracing for job lifecycle and mailbox RX worker
Date: Tue, 21 Apr 2026 14:45:52 -0500	[thread overview]
Message-ID: <f3d1b859-e5f2-468c-b957-fc3686457213@kernel.org> (raw)
In-Reply-To: <9bca1ecd-ce16-2703-3a0d-6db208c83b06@amd.com>



On 4/21/26 14:39, Lizhi Hou wrote:
> 
> On 4/21/26 12:18, Mario Limonciello wrote:
>>
>>
>> On 4/21/26 13:15, Lizhi Hou wrote:
>>> From: Max Zhen <max.zhen@amd.com>
>>>
>>> Add more trace coverage to amdxdna job handling and mailbox receive
>>> processing to make driver execution easier to debug.
>>>
>>> Extend the xdna_job trace event to record the command opcode in
>>> addition to the job sequence number. Use the enhanced tracepoint in
>>> the job run, sent-to-device, signaled-fence, and job-free paths so
>>> that trace output can be correlated with the command being executed.
>>>
>>> Also add debug-point tracing when a command is received through the
>>> submit ioctl path, and add a trace event when the mailbox RX worker
>>> runs.
>>>
>>> These changes improve visibility into job lifetime transitions and
>>> mailbox activity, which helps debug command flow and scheduler issues.
>>>
>>> Signed-off-by: Max Zhen <max.zhen@amd.com>
>>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
Reviewed-by: Mario Limonciello (AMD) <superm1@kernel.org>
>>> ---
>>>   drivers/accel/amdxdna/aie2_ctx.c        | 14 ++++++---
>>>   drivers/accel/amdxdna/amdxdna_ctx.c     |  3 +-
>>>   drivers/accel/amdxdna/amdxdna_ctx.h     |  1 +
>>>   drivers/accel/amdxdna/amdxdna_mailbox.c |  1 +
>>>   include/trace/events/amdxdna.h          | 42 ++++++++++++++++---------
>>>   5 files changed, 42 insertions(+), 19 deletions(-)
>>>
>>> diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/ 
>>> amdxdna/aie2_ctx.c
>>> index d37123d925b6..3b0feba448c4 100644
>>> --- a/drivers/accel/amdxdna/aie2_ctx.c
>>> +++ b/drivers/accel/amdxdna/aie2_ctx.c
>>> @@ -64,6 +64,7 @@ static void aie2_job_release(struct kref *ref)
>>>       struct amdxdna_sched_job *job;
>>>         job = container_of(ref, struct amdxdna_sched_job, refcnt);
>>> +
>>>       amdxdna_sched_job_cleanup(job);
>>>       atomic64_inc(&job->hwctx->job_free_cnt);
>>>       wake_up(&job->hwctx->priv->job_free_wq);
>>> @@ -195,7 +196,8 @@ aie2_sched_notify(struct amdxdna_sched_job *job)
>>>   {
>>>       struct dma_fence *fence = job->fence;
>>>   -    trace_xdna_job(&job->base, job->hwctx->name, "signaled fence", 
>>> job->seq);
>>> +    trace_xdna_job(&job->base, job->hwctx->name, "signaling fence",
>>> +               job->seq, job->drv_cmd ? job->drv_cmd->opcode : 
>>> DEFAULT_IO);
>>>         aie2_tdr_signal(job->hwctx->client->xdna);
>>>       job->hwctx->priv->completed++;
>>> @@ -366,6 +368,9 @@ aie2_sched_job_run(struct drm_sched_job *sched_job)
>>>       struct dma_fence *fence;
>>>       int ret;
>>>   +    trace_xdna_job(sched_job, hwctx->name, "job run",
>>> +               job->seq, job->drv_cmd ? job->drv_cmd->opcode : 
>>> DEFAULT_IO);
>>> +
>>>       if (!hwctx->priv->mbox_chann)
>>>           return NULL;
>>>   @@ -409,7 +414,8 @@ aie2_sched_job_run(struct drm_sched_job 
>>> *sched_job)
>>>       } else {
>>>           aie2_tdr_signal(hwctx->client->xdna);
>>>       }
>>> -    trace_xdna_job(sched_job, hwctx->name, "sent to device", job->seq);
>>> +    trace_xdna_job(sched_job, hwctx->name, "sent to device",
>>> +               job->seq, job->drv_cmd ? job->drv_cmd->opcode : 
>>> DEFAULT_IO);
>>>         return fence;
>>>   }
>>> @@ -419,7 +425,8 @@ static void aie2_sched_job_free(struct 
>>> drm_sched_job *sched_job)
>>>       struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
>>>       struct amdxdna_hwctx *hwctx = job->hwctx;
>>>   -    trace_xdna_job(sched_job, hwctx->name, "job free", job->seq);
>>> +    trace_xdna_job(sched_job, hwctx->name, "job free",
>>> +               job->seq, job->drv_cmd ? job->drv_cmd->opcode : 
>>> DEFAULT_IO);
>>>       if (!job->job_done)
>>>           up(&hwctx->priv->job_sem);
>>>   @@ -437,7 +444,6 @@ aie2_sched_job_timedout(struct drm_sched_job 
>>> *sched_job)
>>>       int ret;
>>>         xdna = hwctx->client->xdna;
>>> -    trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
>>>         guard(mutex)(&xdna->dev_lock);
>>>   diff --git a/drivers/accel/amdxdna/amdxdna_ctx.c b/drivers/accel/ 
>>> amdxdna/amdxdna_ctx.c
>>> index ff6c3e8e5a15..2c2c21992c87 100644
>>> --- a/drivers/accel/amdxdna/amdxdna_ctx.c
>>> +++ b/drivers/accel/amdxdna/amdxdna_ctx.c
>>> @@ -514,7 +514,6 @@ int amdxdna_cmd_submit(struct amdxdna_client 
>>> *client,
>>>           goto unlock_srcu;
>>>       }
>>>   -
>>>       job->hwctx = hwctx;
>>>       job->mm = current->mm;
>>>   @@ -612,6 +611,8 @@ int amdxdna_drm_submit_cmd_ioctl(struct 
>>> drm_device *dev, void *data, struct drm_
>>>       if (args->ext || args->ext_flags)
>>>           return -EINVAL;
>>>   +    trace_amdxdna_debug_point(current->comm, args->type, "job 
>>> received");
>>> +
>>>       switch (args->type) {
>>>       case AMDXDNA_CMD_SUBMIT_EXEC_BUF:
>>>           return amdxdna_drm_submit_execbuf(client, args);
>>> diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h b/drivers/accel/ 
>>> amdxdna/amdxdna_ctx.h
>>> index a8557d7e8923..355798687376 100644
>>> --- a/drivers/accel/amdxdna/amdxdna_ctx.h
>>> +++ b/drivers/accel/amdxdna/amdxdna_ctx.h
>>> @@ -119,6 +119,7 @@ struct amdxdna_hwctx {
>>>       container_of(j, struct amdxdna_sched_job, base)
>>>     enum amdxdna_job_opcode {
>>> +    DEFAULT_IO,
>>
>> Do you really want this at the beginning of the list?  Doesn't that 
>> break uses of amdxdna_drv_cmd that has the previous indexing?
> 
> *_DEBUG_BO is driver internal use only. Using 0 here to align with our 
> current trace scripts.
> 
> Lizhi
> 
>>
>>>       SYNC_DEBUG_BO,
>>>       ATTACH_DEBUG_BO,
>>>       DETACH_DEBUG_BO,
>>> diff --git a/drivers/accel/amdxdna/amdxdna_mailbox.c b/drivers/accel/ 
>>> amdxdna/amdxdna_mailbox.c
>>> index 37771bdb24a1..cc8865f4e79c 100644
>>> --- a/drivers/accel/amdxdna/amdxdna_mailbox.c
>>> +++ b/drivers/accel/amdxdna/amdxdna_mailbox.c
>>> @@ -361,6 +361,7 @@ static void mailbox_rx_worker(struct work_struct 
>>> *rx_work)
>>>       int ret;
>>>         mb_chann = container_of(rx_work, struct mailbox_channel, 
>>> rx_work);
>>> +    trace_mbox_rx_worker(MAILBOX_NAME, mb_chann->msix_irq);
>>>         if (READ_ONCE(mb_chann->bad_state)) {
>>>           MB_ERR(mb_chann, "Channel in bad state, work aborted");
>>> diff --git a/include/trace/events/amdxdna.h b/include/trace/events/ 
>>> amdxdna.h
>>> index c6cb2da7b706..71da24267e52 100644
>>> --- a/include/trace/events/amdxdna.h
>>> +++ b/include/trace/events/amdxdna.h
>>> @@ -30,26 +30,30 @@ TRACE_EVENT(amdxdna_debug_point,
>>>   );
>>>     TRACE_EVENT(xdna_job,
>>> -        TP_PROTO(struct drm_sched_job *sched_job, const char *name, 
>>> const char *str, u64 seq),
>>> +        TP_PROTO(struct drm_sched_job *sched_job, const char *name,
>>> +             const char *str, u64 seq, u32 op),
>>>   -        TP_ARGS(sched_job, name, str, seq),
>>> +        TP_ARGS(sched_job, name, str, seq, op),
>>>             TP_STRUCT__entry(__string(name, name)
>>>                    __string(str, str)
>>>                    __field(u64, fence_context)
>>>                    __field(u64, fence_seqno)
>>> -                 __field(u64, seq)),
>>> +                 __field(u64, seq)
>>> +                 __field(u32, op)),
>>>             TP_fast_assign(__assign_str(name);
>>>                  __assign_str(str);
>>>                  __entry->fence_context = sched_job->s_fence- 
>>> >finished.context;
>>>                  __entry->fence_seqno = sched_job->s_fence- 
>>> >finished.seqno;
>>> -               __entry->seq = seq;),
>>> +               __entry->seq = seq;
>>> +               __entry->op = op;),
>>>   -        TP_printk("fence=(context:%llu, seqno:%lld), %s seq#:%lld 
>>> %s",
>>> +        TP_printk("fence=(context:%llu, seqno:%llu), %s seq#:%llu 
>>> %s, op=%u",
>>>                 __entry->fence_context, __entry->fence_seqno,
>>>                 __get_str(name), __entry->seq,
>>> -              __get_str(str))
>>> +              __get_str(str),
>>> +              __entry->op)
>>>   );
>>>     DECLARE_EVENT_CLASS(xdna_mbox_msg,
>>> @@ -81,18 +85,28 @@ DEFINE_EVENT(xdna_mbox_msg, mbox_set_head,
>>>            TP_ARGS(name, chann_id, opcode, id)
>>>   );
>>>   -TRACE_EVENT(mbox_irq_handle,
>>> -        TP_PROTO(char *name, int irq),
>>> +DECLARE_EVENT_CLASS(xdna_mbox_name_id,
>>> +            TP_PROTO(char *name, int irq),
>>>   -        TP_ARGS(name, irq),
>>> +            TP_ARGS(name, irq),
>>>   -        TP_STRUCT__entry(__string(name, name)
>>> -                 __field(int, irq)),
>>> +            TP_STRUCT__entry(__string(name, name)
>>> +                     __field(int, irq)),
>>>   -        TP_fast_assign(__assign_str(name);
>>> -               __entry->irq = irq;),
>>> +            TP_fast_assign(__assign_str(name);
>>> +                   __entry->irq = irq;),
>>> +
>>> +            TP_printk("%s.%d", __get_str(name), __entry->irq)
>>> +);
>>> +
>>> +DEFINE_EVENT(xdna_mbox_name_id, mbox_irq_handle,
>>> +         TP_PROTO(char *name, int irq),
>>> +         TP_ARGS(name, irq)
>>> +);
>>>   -        TP_printk("%s.%d", __get_str(name), __entry->irq)
>>> +DEFINE_EVENT(xdna_mbox_name_id, mbox_rx_worker,
>>> +         TP_PROTO(char *name, int irq),
>>> +         TP_ARGS(name, irq)
>>>   );
>>>     #endif /* !defined(_TRACE_AMDXDNA_H) || 
>>> defined(TRACE_HEADER_MULTI_READ) */
>>


  reply	other threads:[~2026-04-21 19:45 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-21 18:15 [PATCH V1] accel/amdxdna: Improve tracing for job lifecycle and mailbox RX worker Lizhi Hou
2026-04-21 19:18 ` Mario Limonciello
2026-04-21 19:39   ` Lizhi Hou
2026-04-21 19:45     ` Mario Limonciello [this message]
2026-04-22 15:38       ` Lizhi Hou
2026-04-22 22:11 ` Claude review: " Claude Code Review Bot
2026-04-22 22:11 ` Claude Code Review Bot

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=f3d1b859-e5f2-468c-b957-fc3686457213@kernel.org \
    --to=superm1@kernel.org \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=lizhi.hou@amd.com \
    --cc=maciej.falkowski@linux.intel.com \
    --cc=max.zhen@amd.com \
    --cc=ogabbay@kernel.org \
    --cc=quic_jhugo@quicinc.com \
    --cc=sonal.santan@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox