* [PATCH V1] accel/amdxdna: add AIE coredump support
@ 2026-05-26 17:29 Lizhi Hou
2026-05-27 4:28 ` Claude review: " Claude Code Review Bot
2026-05-27 4:28 ` Claude Code Review Bot
0 siblings, 2 replies; 3+ messages in thread
From: Lizhi Hou @ 2026-05-26 17:29 UTC (permalink / raw)
To: ogabbay, quic_jhugo, dri-devel, mario.limonciello,
karol.wachowski
Cc: Nishad Saraf, linux-kernel, max.zhen, sonal.santan, Lizhi Hou
From: Nishad Saraf <nishads@amd.com>
Implement AIE coredump to capture per-context tile memory snapshots
for post-mortem hardware debugging. The new DRM_AMDXDNA_AIE_COREDUMP
parameter in the GET_ARRAY ioctl returns coredump data for a given
hardware context. Access is restricted so that users may only coredump
their own contexts; admin users may coredump any context.
Signed-off-by: Nishad Saraf <nishads@amd.com>
Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
---
drivers/accel/amdxdna/aie.c | 214 +++++++++++++++++++++---
drivers/accel/amdxdna/aie.h | 31 +++-
drivers/accel/amdxdna/aie2_error.c | 22 +--
drivers/accel/amdxdna/aie2_message.c | 104 +++++++-----
drivers/accel/amdxdna/aie2_msg_priv.h | 15 ++
drivers/accel/amdxdna/aie2_pci.c | 5 +
drivers/accel/amdxdna/aie2_pci.h | 4 +
drivers/accel/amdxdna/aie4_message.c | 33 +++-
drivers/accel/amdxdna/aie4_msg_priv.h | 13 ++
drivers/accel/amdxdna/aie4_pci.c | 57 +++++--
drivers/accel/amdxdna/aie4_pci.h | 13 +-
drivers/accel/amdxdna/amdxdna_ctx.h | 1 +
drivers/accel/amdxdna/amdxdna_pci_drv.c | 3 +-
drivers/accel/amdxdna/amdxdna_pci_drv.h | 4 +
drivers/accel/amdxdna/npu3_regs.c | 1 +
include/uapi/drm/amdxdna_accel.h | 26 +++
16 files changed, 451 insertions(+), 95 deletions(-)
diff --git a/drivers/accel/amdxdna/aie.c b/drivers/accel/amdxdna/aie.c
index dd6f36f222c7..bf1243285920 100644
--- a/drivers/accel/amdxdna/aie.c
+++ b/drivers/accel/amdxdna/aie.c
@@ -3,9 +3,12 @@
* Copyright (C) 2026, Advanced Micro Devices, Inc.
*/
+#include <drm/drm_cache.h>
+#include <linux/cred.h>
#include <linux/errno.h>
#include "aie.h"
+#include "amdxdna_ctx.h"
#include "amdxdna_mailbox_helper.h"
#include "amdxdna_mailbox.h"
#include "amdxdna_pci_drv.h"
@@ -132,36 +135,209 @@ int amdxdna_get_metadata(struct aie_device *aie,
return ret;
}
-void *amdxdna_alloc_msg_buffer(struct amdxdna_dev *xdna, u32 *size,
- dma_addr_t *dma_addr)
+struct amdxdna_msg_buf_hdl *amdxdna_alloc_msg_buffer(struct amdxdna_dev *xdna, u32 size)
{
- void *vaddr;
+ struct amdxdna_msg_buf_hdl *hdl;
int order;
- *size = max_t(u32, *size, SZ_8K);
- order = get_order(*size);
+ hdl = kzalloc_obj(*hdl);
+ if (!hdl)
+ return ERR_PTR(-ENOMEM);
+
+ hdl->xdna = xdna;
+ hdl->size = max_t(u32, size, SZ_8K);
+ order = get_order(hdl->size);
if (order > MAX_PAGE_ORDER)
- return ERR_PTR(-EINVAL);
- *size = PAGE_SIZE << order;
+ goto free_hdl;
+ hdl->size = PAGE_SIZE << order;
- if (amdxdna_iova_on(xdna))
- return amdxdna_iommu_alloc(xdna, *size, dma_addr);
+ if (amdxdna_iova_on(xdna)) {
+ hdl->vaddr = amdxdna_iommu_alloc(xdna, hdl->size, &hdl->dma_addr);
+ if (IS_ERR(hdl->vaddr))
+ goto free_hdl;
+ } else {
+ hdl->vaddr = dma_alloc_noncoherent(xdna->ddev.dev, hdl->size,
+ &hdl->dma_addr,
+ DMA_FROM_DEVICE, GFP_KERNEL);
+ if (!hdl->vaddr)
+ goto free_hdl;
+ }
- vaddr = dma_alloc_noncoherent(xdna->ddev.dev, *size, dma_addr,
- DMA_FROM_DEVICE, GFP_KERNEL);
- if (!vaddr)
- return ERR_PTR(-ENOMEM);
+ return hdl;
- return vaddr;
+free_hdl:
+ kfree(hdl);
+ return ERR_PTR(-ENOMEM);
}
-void amdxdna_free_msg_buffer(struct amdxdna_dev *xdna, size_t size,
- void *cpu_addr, dma_addr_t dma_addr)
+void amdxdna_free_msg_buffer(struct amdxdna_msg_buf_hdl *hdl)
{
- if (amdxdna_iova_on(xdna)) {
- amdxdna_iommu_free(xdna, size, cpu_addr, dma_addr);
+ if (!hdl)
return;
+
+ if (amdxdna_iova_on(hdl->xdna)) {
+ amdxdna_iommu_free(hdl->xdna, hdl->size, hdl->vaddr,
+ hdl->dma_addr);
+ } else {
+ dma_free_noncoherent(hdl->xdna->ddev.dev, hdl->size,
+ hdl->vaddr, hdl->dma_addr,
+ DMA_FROM_DEVICE);
+ }
+
+ kfree(hdl);
+}
+
+struct amdxdna_coredump_walk_arg {
+ u64 pid;
+ u32 ctx_id;
+
+ struct aie_device *aie;
+ struct amdxdna_drm_get_array *array_args;
+};
+
+static int amdxdna_get_coredump_cb(struct amdxdna_hwctx *hwctx, void *arg)
+{
+ struct amdxdna_dev *xdna = hwctx->client->xdna;
+ struct amdxdna_coredump_buf_entry *buf_list;
+ struct amdxdna_coredump_walk_arg *wa = arg;
+ struct amdxdna_msg_buf_hdl **data_hdls;
+ struct amdxdna_msg_buf_hdl *list_hdl;
+ struct aie_device *aie = wa->aie;
+ size_t data_buf_size = SZ_1M;
+ size_t total_size;
+ u8 __user *buf;
+ u32 num_bufs;
+ int ret, i;
+
+ if (hwctx->client->pid != wa->pid || hwctx->id != wa->ctx_id)
+ return 0;
+
+ if (!capable(CAP_SYS_ADMIN) &&
+ !uid_eq(current_euid(), hwctx->client->filp->filp->f_cred->euid)) {
+ XDNA_ERR(xdna, "Permission denied for context %u", wa->ctx_id);
+ return -EPERM;
+ }
+
+ num_bufs = (hwctx->num_col - hwctx->num_unused_col) * aie->metadata.rows;
+ total_size = data_buf_size * num_bufs;
+
+ if (wa->array_args->element_size < total_size) {
+ XDNA_DBG(xdna, "Insufficient buffer size %u, need %zu",
+ wa->array_args->element_size, total_size);
+ wa->array_args->element_size = total_size;
+ return -ENOSPC;
+ }
+
+ list_hdl = amdxdna_alloc_msg_buffer(xdna, num_bufs * sizeof(*buf_list));
+ if (IS_ERR(list_hdl)) {
+ XDNA_ERR(xdna, "Failed to allocate buffer list");
+ return PTR_ERR(list_hdl);
+ }
+
+ buf_list = to_cpu_addr(list_hdl, 0);
+ memset(buf_list, 0, to_buf_size(list_hdl));
+
+ data_hdls = kzalloc_objs(*data_hdls, num_bufs);
+ if (!data_hdls) {
+ ret = -ENOMEM;
+ goto free_list_hdl;
+ }
+
+ for (i = 0; i < num_bufs; i++) {
+ data_hdls[i] = amdxdna_alloc_msg_buffer(xdna, data_buf_size);
+ if (IS_ERR(data_hdls[i])) {
+ XDNA_ERR(xdna, "Failed to allocate data buffer %d", i);
+ ret = PTR_ERR(data_hdls[i]);
+ data_hdls[i] = NULL;
+ goto free_data_hdls;
+ }
+
+ memset(to_cpu_addr(data_hdls[i], 0), 0, data_buf_size);
+ drm_clflush_virt_range(to_cpu_addr(data_hdls[i], 0), data_buf_size);
+
+ buf_list[i].buf_addr = to_dma_addr(data_hdls[i], 0);
+ buf_list[i].buf_size = data_buf_size;
+ }
+
+ drm_clflush_virt_range(buf_list, to_buf_size(list_hdl));
+
+ ret = aie->msg_ops.get_coredump(hwctx, list_hdl, num_bufs);
+ if (ret) {
+ XDNA_ERR(xdna, "Failed to get coredump from firmware, ret=%d",
+ ret);
+ goto free_data_hdls;
+ }
+
+ buf = u64_to_user_ptr(wa->array_args->buffer);
+ for (i = 0; i < num_bufs; i++) {
+ if (copy_to_user(buf, to_cpu_addr(data_hdls[i], 0),
+ data_buf_size)) {
+ ret = -EFAULT;
+ goto free_data_hdls;
+ }
+ buf += data_buf_size;
+ }
+
+free_data_hdls:
+ for (i = 0; i < num_bufs; i++)
+ amdxdna_free_msg_buffer(data_hdls[i]);
+ kfree(data_hdls);
+free_list_hdl:
+ amdxdna_free_msg_buffer(list_hdl);
+ return ret ? ret : 1;
+}
+
+int amdxdna_get_coredump(struct aie_device *aie,
+ struct amdxdna_drm_get_array *args)
+{
+ struct amdxdna_client *ctx_client = NULL;
+ struct amdxdna_drm_aie_coredump config;
+ struct amdxdna_dev *xdna = aie->xdna;
+ struct amdxdna_coredump_walk_arg wa;
+ int ret;
+
+ drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
+
+ if (!aie->msg_ops.get_coredump) {
+ XDNA_DBG(xdna, "Get coredump unsupported");
+ return -EOPNOTSUPP;
+ }
+ if (args->num_element != 1) {
+ XDNA_ERR(xdna, "Invalid num_element %u, expected 1",
+ args->num_element);
+ return -EINVAL;
+ }
+
+ if (args->element_size < sizeof(config)) {
+ XDNA_ERR(xdna, "Invalid buffer size: %d", args->element_size);
+ return -EINVAL;
+ }
+
+ if (copy_from_user(&config, u64_to_user_ptr(args->buffer), sizeof(config))) {
+ XDNA_ERR(xdna, "Failed to copy coredump config from user");
+ return -EFAULT;
+ }
+
+ if (XDNA_MBZ_DBG(xdna, &config.pad, sizeof(config.pad)))
+ return -EINVAL;
+
+ XDNA_DBG(xdna, "AIE Coredump request for context_id=%u pid=%llu",
+ config.context_id, config.pid);
+
+ wa.aie = aie;
+ wa.pid = config.pid;
+ wa.ctx_id = config.context_id;
+ wa.array_args = args;
+ amdxdna_for_each_client(xdna, ctx_client) {
+ ret = amdxdna_hwctx_walk(ctx_client, &wa, amdxdna_get_coredump_cb);
+ if (ret)
+ break;
+ }
+
+ if (!ret) {
+ XDNA_DBG(xdna, "Context not found");
+ return -EINVAL;
}
- dma_free_noncoherent(xdna->ddev.dev, size, cpu_addr, dma_addr, DMA_FROM_DEVICE);
+ return (ret > 0) ? 0 : ret;
}
diff --git a/drivers/accel/amdxdna/aie.h b/drivers/accel/amdxdna/aie.h
index 0483d582b7f8..d088030a2f31 100644
--- a/drivers/accel/amdxdna/aie.h
+++ b/drivers/accel/amdxdna/aie.h
@@ -15,6 +15,12 @@
struct psp_device;
struct smu_device;
+struct aie_msg_ops {
+ int (*get_coredump)(struct amdxdna_hwctx *hwctx,
+ struct amdxdna_msg_buf_hdl *list_hdl,
+ u32 num_bufs);
+};
+
struct aie_device {
struct amdxdna_dev *xdna;
struct mailbox_channel *mgmt_chann;
@@ -29,6 +35,7 @@ struct aie_device {
struct smu_device *smu_hdl;
struct amdxdna_drm_query_aie_metadata metadata;
+ struct aie_msg_ops msg_ops;
};
#define DECLARE_AIE_MSG(name, op) \
@@ -91,6 +98,23 @@ struct amdxdna_rev_vbnv {
const char *vbnv;
};
+struct amdxdna_coredump_buf_entry {
+ __u64 buf_addr;
+ __u32 buf_size;
+ __u32 reserved;
+} __packed;
+
+struct amdxdna_msg_buf_hdl {
+ struct amdxdna_dev *xdna;
+ void *vaddr;
+ dma_addr_t dma_addr;
+ u32 size;
+};
+
+#define to_cpu_addr(hdl, offset) ((hdl)->vaddr + (offset))
+#define to_dma_addr(hdl, offset) ((hdl)->dma_addr + (offset))
+#define to_buf_size(hdl) ((hdl)->size)
+
/* aie.c */
void aie_dump_mgmt_chann_debug(struct aie_device *aie);
void aie_destroy_chann(struct aie_device *aie, struct mailbox_channel **chann);
@@ -99,10 +123,9 @@ int aie_check_protocol(struct aie_device *aie, u32 fw_major, u32 fw_minor);
void amdxdna_vbnv_init(struct amdxdna_dev *xdna);
int amdxdna_get_metadata(struct aie_device *aie, struct amdxdna_client *client,
struct amdxdna_drm_get_info *args);
-void *amdxdna_alloc_msg_buffer(struct amdxdna_dev *xdna, u32 *size,
- dma_addr_t *dma_addr);
-void amdxdna_free_msg_buffer(struct amdxdna_dev *xdna, size_t size,
- void *cpu_addr, dma_addr_t dma_addr);
+struct amdxdna_msg_buf_hdl *amdxdna_alloc_msg_buffer(struct amdxdna_dev *xdna, u32 size);
+void amdxdna_free_msg_buffer(struct amdxdna_msg_buf_hdl *hdl);
+int amdxdna_get_coredump(struct aie_device *aie, struct amdxdna_drm_get_array *args);
/* aie_psp.c */
struct psp_device *aiem_psp_create(struct drm_device *ddev, struct psp_config *conf);
diff --git a/drivers/accel/amdxdna/aie2_error.c b/drivers/accel/amdxdna/aie2_error.c
index babdac0157ab..8030a3293e5d 100644
--- a/drivers/accel/amdxdna/aie2_error.c
+++ b/drivers/accel/amdxdna/aie2_error.c
@@ -30,9 +30,7 @@ struct async_event {
struct async_events {
struct workqueue_struct *wq;
- u8 *buf;
- dma_addr_t addr;
- u32 size;
+ struct amdxdna_msg_buf_hdl *hdl;
u32 event_cnt;
struct async_event event[] __counted_by(event_cnt);
};
@@ -339,7 +337,7 @@ void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev)
destroy_workqueue(events->wq);
mutex_lock(&xdna->dev_lock);
- amdxdna_free_msg_buffer(xdna, events->size, events->buf, events->addr);
+ amdxdna_free_msg_buffer(events->hdl);
kfree(events);
}
@@ -347,7 +345,6 @@ int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
{
struct amdxdna_dev *xdna = ndev->aie.xdna;
u32 total_col = ndev->total_col;
- u32 total_size = ASYNC_BUF_SIZE * total_col;
struct async_events *events;
int i, ret;
@@ -355,12 +352,11 @@ int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
if (!events)
return -ENOMEM;
- events->buf = amdxdna_alloc_msg_buffer(xdna, &total_size, &events->addr);
- if (IS_ERR(events->buf)) {
- ret = PTR_ERR(events->buf);
+ events->hdl = amdxdna_alloc_msg_buffer(xdna, ASYNC_BUF_SIZE * total_col);
+ if (IS_ERR(events->hdl)) {
+ ret = PTR_ERR(events->hdl);
goto free_events;
}
- events->size = total_size;
events->event_cnt = total_col;
events->wq = alloc_ordered_workqueue("async_wq", 0);
@@ -375,8 +371,8 @@ int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
e->ndev = ndev;
e->wq = events->wq;
- e->buf = &events->buf[offset];
- e->addr = events->addr + offset;
+ e->buf = to_cpu_addr(events->hdl, offset);
+ e->addr = to_dma_addr(events->hdl, offset);
e->size = ASYNC_BUF_SIZE;
e->resp.status = MAX_AIE2_STATUS_CODE;
INIT_WORK(&e->work, aie2_error_worker);
@@ -389,13 +385,13 @@ int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
ndev->async_events = events;
XDNA_DBG(xdna, "Async event count %d, buf total size 0x%x",
- events->event_cnt, events->size);
+ events->event_cnt, to_buf_size(events->hdl));
return 0;
free_wq:
destroy_workqueue(events->wq);
free_buf:
- amdxdna_free_msg_buffer(xdna, events->size, events->buf, events->addr);
+ amdxdna_free_msg_buffer(events->hdl);
free_events:
kfree(events);
return ret;
diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/amdxdna/aie2_message.c
index c4b364801cc0..6d9f00742c22 100644
--- a/drivers/accel/amdxdna/aie2_message.c
+++ b/drivers/accel/amdxdna/aie2_message.c
@@ -368,28 +368,27 @@ int aie2_query_status(struct amdxdna_dev_hdl *ndev, char __user *buf,
{
DECLARE_AIE_MSG(aie_column_info, MSG_OP_QUERY_COL_STATUS);
struct amdxdna_dev *xdna = ndev->aie.xdna;
- u32 buf_sz, aie_bitmap = 0;
+ struct amdxdna_msg_buf_hdl *buf_hdl;
struct amdxdna_client *client;
- dma_addr_t dma_addr;
- u8 *buff_addr;
+ u32 aie_bitmap = 0;
int ret;
- buf_sz = ndev->aie.metadata.cols * ndev->aie.metadata.col_size;
- buff_addr = amdxdna_alloc_msg_buffer(xdna, &buf_sz, &dma_addr);
- if (IS_ERR(buff_addr))
- return PTR_ERR(buff_addr);
+ buf_hdl = amdxdna_alloc_msg_buffer(xdna, ndev->aie.metadata.cols *
+ ndev->aie.metadata.col_size);
+ if (IS_ERR(buf_hdl))
+ return PTR_ERR(buf_hdl);
/* Go through each hardware context and mark the AIE columns that are active */
list_for_each_entry(client, &xdna->client_list, node)
amdxdna_hwctx_walk(client, &aie_bitmap, amdxdna_hwctx_col_map);
*cols_filled = 0;
- req.dump_buff_addr = dma_addr;
- req.dump_buff_size = buf_sz;
+ req.dump_buff_addr = to_dma_addr(buf_hdl, 0);
+ req.dump_buff_size = to_buf_size(buf_hdl);
req.num_cols = hweight32(aie_bitmap);
req.aie_bitmap = aie_bitmap;
- drm_clflush_virt_range(buff_addr, req.dump_buff_size); /* device can access */
+ drm_clflush_virt_range(to_cpu_addr(buf_hdl, 0), req.dump_buff_size);
ret = aie_send_mgmt_msg_wait(&ndev->aie, &msg);
if (ret) {
XDNA_ERR(xdna, "Error during NPU query, status %d", ret);
@@ -398,14 +397,15 @@ int aie2_query_status(struct amdxdna_dev_hdl *ndev, char __user *buf,
XDNA_DBG(xdna, "Query NPU status completed");
- if (buf_sz < resp.size) {
+ if (to_buf_size(buf_hdl) < resp.size) {
ret = -EINVAL;
- XDNA_ERR(xdna, "Bad buffer size. Available: %u. Needs: %u", buf_sz, resp.size);
+ XDNA_ERR(xdna, "Bad buffer size. Available: %u. Needs: %u",
+ to_buf_size(buf_hdl), resp.size);
goto fail;
}
size = min(size, resp.size);
- if (copy_to_user(buf, buff_addr, size)) {
+ if (copy_to_user(buf, to_cpu_addr(buf_hdl, 0), size)) {
ret = -EFAULT;
XDNA_ERR(xdna, "Failed to copy NPU status to user space");
goto fail;
@@ -414,7 +414,7 @@ int aie2_query_status(struct amdxdna_dev_hdl *ndev, char __user *buf,
*cols_filled = aie_bitmap;
fail:
- amdxdna_free_msg_buffer(xdna, buf_sz, buff_addr, dma_addr);
+ amdxdna_free_msg_buffer(buf_hdl);
return ret;
}
@@ -424,38 +424,36 @@ int aie2_query_telemetry(struct amdxdna_dev_hdl *ndev,
{
DECLARE_AIE_MSG(get_telemetry, MSG_OP_GET_TELEMETRY);
struct amdxdna_dev *xdna = ndev->aie.xdna;
- dma_addr_t dma_addr;
- u32 buf_sz;
- u8 *addr;
+ struct amdxdna_msg_buf_hdl *buf_hdl;
int ret;
if (header->type >= MAX_TELEMETRY_TYPE)
return -EINVAL;
- buf_sz = min(size, SZ_4M);
- addr = amdxdna_alloc_msg_buffer(xdna, &buf_sz, &dma_addr);
- if (IS_ERR(addr))
- return PTR_ERR(addr);
+ buf_hdl = amdxdna_alloc_msg_buffer(xdna, min(size, SZ_4M));
+ if (IS_ERR(buf_hdl))
+ return PTR_ERR(buf_hdl);
- req.buf_addr = dma_addr;
- req.buf_size = buf_sz;
+ req.buf_addr = to_dma_addr(buf_hdl, 0);
+ req.buf_size = to_buf_size(buf_hdl);
req.type = header->type;
- drm_clflush_virt_range(addr, req.buf_size); /* device can access */
+ drm_clflush_virt_range(to_cpu_addr(buf_hdl, 0), req.buf_size);
ret = aie_send_mgmt_msg_wait(&ndev->aie, &msg);
if (ret) {
XDNA_ERR(xdna, "Query telemetry failed, status %d", ret);
goto free_buf;
}
- if (buf_sz < resp.size) {
+ if (to_buf_size(buf_hdl) < resp.size) {
ret = -EINVAL;
- XDNA_ERR(xdna, "Bad buffer size. Available: %u. Needs: %u", buf_sz, resp.size);
+ XDNA_ERR(xdna, "Bad buffer size. Available: %u. Needs: %u",
+ to_buf_size(buf_hdl), resp.size);
goto free_buf;
}
size = min(size, resp.size);
- if (copy_to_user(buf, addr, size)) {
+ if (copy_to_user(buf, to_cpu_addr(buf_hdl, 0), size)) {
ret = -EFAULT;
XDNA_ERR(xdna, "Failed to copy telemetry to user space");
goto free_buf;
@@ -465,7 +463,7 @@ int aie2_query_telemetry(struct amdxdna_dev_hdl *ndev,
header->minor = resp.minor;
free_buf:
- amdxdna_free_msg_buffer(xdna, buf_sz, addr, dma_addr);
+ amdxdna_free_msg_buffer(buf_hdl);
return ret;
}
@@ -916,6 +914,9 @@ void aie2_msg_init(struct amdxdna_dev_hdl *ndev)
ndev->exec_msg_ops = &npu_exec_message_ops;
else
ndev->exec_msg_ops = &legacy_exec_message_ops;
+
+ if (AIE_FEATURE_ON(&ndev->aie, AIE2_GET_COREDUMP))
+ ndev->aie.msg_ops.get_coredump = aie2_get_aie_coredump;
}
static inline struct amdxdna_gem_obj *
@@ -1164,9 +1165,7 @@ int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
{
DECLARE_AIE_MSG(get_app_health, MSG_OP_GET_APP_HEALTH);
struct amdxdna_dev *xdna = ndev->aie.xdna;
- struct app_health_report *buf;
- dma_addr_t dma_addr;
- u32 buf_size;
+ struct amdxdna_msg_buf_hdl *buf_hdl;
int ret;
if (!AIE_FEATURE_ON(&ndev->aie, AIE2_APP_HEALTH)) {
@@ -1174,18 +1173,17 @@ int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
return -EOPNOTSUPP;
}
- buf_size = sizeof(*report);
- buf = amdxdna_alloc_msg_buffer(xdna, &buf_size, &dma_addr);
- if (IS_ERR(buf)) {
+ buf_hdl = amdxdna_alloc_msg_buffer(xdna, sizeof(*report));
+ if (IS_ERR(buf_hdl)) {
XDNA_ERR(xdna, "Failed to allocate buffer for app health");
- return PTR_ERR(buf);
+ return PTR_ERR(buf_hdl);
}
- req.buf_addr = dma_addr;
+ req.buf_addr = to_dma_addr(buf_hdl, 0);
req.context_id = context_id;
- req.buf_size = buf_size;
+ req.buf_size = to_buf_size(buf_hdl);
- drm_clflush_virt_range(buf, req.buf_size);
+ drm_clflush_virt_range(to_cpu_addr(buf_hdl, 0), req.buf_size);
ret = aie_send_mgmt_msg_wait(&ndev->aie, &msg);
if (ret) {
XDNA_ERR(xdna, "Get app health failed, ret %d status 0x%x", ret, resp.status);
@@ -1193,10 +1191,10 @@ int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
}
/* Copy the report to caller's buffer */
- memcpy(report, buf, sizeof(*report));
+ memcpy(report, to_cpu_addr(buf_hdl, 0), sizeof(*report));
free_buf:
- amdxdna_free_msg_buffer(xdna, buf_size, buf, dma_addr);
+ amdxdna_free_msg_buffer(buf_hdl);
return ret;
}
@@ -1268,3 +1266,29 @@ int aie2_get_dev_revision(struct amdxdna_dev_hdl *ndev, enum aie2_dev_revision *
XDNA_DBG(xdna, "Device revision: %d (raw fuse: 0x%x)", *rev, resp.raw_fuse_data);
return 0;
}
+
+int aie2_get_aie_coredump(struct amdxdna_hwctx *hwctx,
+ struct amdxdna_msg_buf_hdl *list_hdl,
+ u32 num_bufs)
+{
+ DECLARE_AIE_MSG(get_coredump, MSG_OP_GET_COREDUMP);
+ struct amdxdna_dev *xdna = hwctx->client->xdna;
+ struct amdxdna_dev_hdl *ndev = xdna->dev_handle;
+ int ret;
+
+ req.context_id = hwctx->fw_ctx_id;
+ req.num_bufs = num_bufs;
+ req.list_addr = to_dma_addr(list_hdl, 0);
+ req.list_size = to_buf_size(list_hdl);
+
+ ret = aie_send_mgmt_msg_wait(&ndev->aie, &msg);
+ if (ret) {
+ if (resp.status == AIE2_STATUS_MGMT_ERT_DRAM_BUFFER_SIZE_INVALID)
+ XDNA_ERR(xdna, "Invalid buffer size(required 0x%x) for get coredump",
+ resp.required_buffer_size);
+ else
+ XDNA_ERR(xdna, "Get coredump got status 0x%x", resp.status);
+ }
+
+ return ret;
+}
diff --git a/drivers/accel/amdxdna/aie2_msg_priv.h b/drivers/accel/amdxdna/aie2_msg_priv.h
index fd65a4236d49..4d8f9ebd7fa2 100644
--- a/drivers/accel/amdxdna/aie2_msg_priv.h
+++ b/drivers/accel/amdxdna/aie2_msg_priv.h
@@ -35,6 +35,7 @@ enum aie2_msg_opcode {
MSG_OP_GET_APP_HEALTH = 0x114,
MSG_OP_ADD_HOST_BUFFER = 0x115,
MSG_OP_GET_DEV_REVISION = 0x117,
+ MSG_OP_GET_COREDUMP = 0x119,
MSG_OP_MAX_DRV_OPCODE,
MSG_OP_GET_PROTOCOL_VERSION = 0x301,
MSG_OP_MAX_OPCODE
@@ -64,6 +65,7 @@ enum aie2_msg_status {
AIE2_STATUS_MGMT_ERT_ENTER_SUSPEND_FAILURE,
AIE2_STATUS_MGMT_ERT_BUSY,
AIE2_STATUS_MGMT_ERT_APPLICATION_ACTIVE,
+ AIE2_STATUS_MGMT_ERT_DRAM_BUFFER_SIZE_INVALID,
MAX_MGMT_ERT_STATUS_CODE,
/* APP ERT Error codes */
AIE2_STATUS_APP_ERT_FIRST_ERROR = 0x3000001,
@@ -543,4 +545,17 @@ struct get_dev_revision_resp {
__u32 raw_fuse_data;
} __packed;
+struct get_coredump_req {
+ __u32 context_id;
+ __u32 num_bufs;
+ __u64 list_addr;
+ __u32 list_size;
+} __packed;
+
+struct get_coredump_resp {
+ enum aie2_msg_status status;
+ __u32 required_buffer_size;
+ __u32 reserved[7];
+} __packed;
+
#endif /* _AIE2_MSG_PRIV_H_ */
diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c
index c4d345d4c76b..7dea21a9e915 100644
--- a/drivers/accel/amdxdna/aie2_pci.c
+++ b/drivers/accel/amdxdna/aie2_pci.c
@@ -20,6 +20,7 @@
#include <linux/xarray.h>
#include <asm/hypervisor.h>
+#include "aie.h"
#include "aie2_msg_priv.h"
#include "aie2_pci.h"
#include "aie2_solver.h"
@@ -1088,6 +1089,7 @@ static int aie2_query_ctx_status_array(struct amdxdna_client *client,
static int aie2_get_array(struct amdxdna_client *client,
struct amdxdna_drm_get_array *args)
{
+ struct amdxdna_dev_hdl *ndev = client->xdna->dev_handle;
struct amdxdna_dev *xdna = client->xdna;
int ret, idx;
@@ -1105,6 +1107,9 @@ static int aie2_get_array(struct amdxdna_client *client,
case DRM_AMDXDNA_HW_LAST_ASYNC_ERR:
ret = aie2_get_array_async_error(xdna->dev_handle, args);
break;
+ case DRM_AMDXDNA_AIE_COREDUMP:
+ ret = amdxdna_get_coredump(&ndev->aie, args);
+ break;
case DRM_AMDXDNA_BO_USAGE:
ret = amdxdna_drm_get_bo_usage(&xdna->ddev, args);
break;
diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h
index 77648cc548b6..c3291882538c 100644
--- a/drivers/accel/amdxdna/aie2_pci.h
+++ b/drivers/accel/amdxdna/aie2_pci.h
@@ -202,6 +202,7 @@ enum aie2_fw_feature {
AIE2_ADD_HOST_BUFFER,
AIE2_UPDATE_PROPERTY,
AIE2_GET_DEV_REVISION,
+ AIE2_GET_COREDUMP,
AIE2_FEATURE_MAX
};
@@ -270,6 +271,9 @@ int aie2_query_firmware_version(struct amdxdna_dev_hdl *ndev,
int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
struct app_health_report *report);
int aie2_get_dev_revision(struct amdxdna_dev_hdl *ndev, enum aie2_dev_revision *rev);
+int aie2_get_aie_coredump(struct amdxdna_hwctx *hwctx,
+ struct amdxdna_msg_buf_hdl *list_hdl,
+ u32 num_bufs);
int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size);
diff --git a/drivers/accel/amdxdna/aie4_message.c b/drivers/accel/amdxdna/aie4_message.c
index 88037edbb02a..795b65ee9e0e 100644
--- a/drivers/accel/amdxdna/aie4_message.c
+++ b/drivers/accel/amdxdna/aie4_message.c
@@ -5,14 +5,17 @@
#include <drm/amdxdna_accel.h>
#include <drm/drm_print.h>
+#include <linux/bitfield.h>
#include <linux/mutex.h>
#include "aie.h"
#include "aie4_msg_priv.h"
#include "aie4_pci.h"
+#include "amdxdna_ctx.h"
#include "amdxdna_mailbox.h"
#include "amdxdna_mailbox_helper.h"
#include "amdxdna_pci_drv.h"
+#include "amdxdna_pm.h"
int aie4_suspend_fw(struct amdxdna_dev_hdl *ndev)
{
@@ -70,7 +73,7 @@ int aie4_attach_work_buffer(struct amdxdna_dev_hdl *ndev)
struct amdxdna_dev *xdna = ndev->aie.xdna;
int ret;
- req.buff_addr = ndev->work_buf_addr;
+ req.buff_addr = to_dma_addr(ndev->work_buf_hdl, 0);
req.buff_size = AIE4_WORK_BUFFER_MIN_SIZE;
ret = aie_send_mgmt_msg_wait(&ndev->aie, &msg);
@@ -81,3 +84,31 @@ int aie4_attach_work_buffer(struct amdxdna_dev_hdl *ndev)
return ret;
}
+
+int aie4_get_aie_coredump(struct amdxdna_hwctx *hwctx,
+ struct amdxdna_msg_buf_hdl *list_hdl,
+ u32 num_bufs)
+{
+ DECLARE_AIE_MSG(aie4_msg_aie_coredump, AIE4_MSG_OP_AIE_COREDUMP);
+ struct amdxdna_dev *xdna = hwctx->client->xdna;
+ struct amdxdna_dev_hdl *ndev = xdna->dev_handle;
+ int ret;
+
+ req.context_id = hwctx->fw_ctx_id;
+ req.pasid = FIELD_PREP(AIE4_MSG_PASID, hwctx->client->pasid) |
+ FIELD_PREP(AIE4_MSG_PASID_VLD, 1);
+ req.num_buffers = num_bufs;
+ req.buffer_list_addr = to_dma_addr(list_hdl, 0);
+
+ ret = aie_send_mgmt_msg_wait(&ndev->aie, &msg);
+ if (ret)
+ XDNA_ERR(xdna, "Get coredump got status 0x%x", resp.status);
+
+ return ret;
+}
+
+void aie4_msg_init(struct amdxdna_dev_hdl *ndev)
+{
+ if (AIE_FEATURE_ON(&ndev->aie, AIE4_GET_COREDUMP))
+ ndev->aie.msg_ops.get_coredump = aie4_get_aie_coredump;
+}
diff --git a/drivers/accel/amdxdna/aie4_msg_priv.h b/drivers/accel/amdxdna/aie4_msg_priv.h
index af0866045b91..d01b2889af1d 100644
--- a/drivers/accel/amdxdna/aie4_msg_priv.h
+++ b/drivers/accel/amdxdna/aie4_msg_priv.h
@@ -12,6 +12,7 @@
enum aie4_msg_opcode {
AIE4_MSG_OP_SUSPEND = 0x10003,
AIE4_MSG_OP_ATTACH_WORK_BUFFER = 0x1000D,
+ AIE4_MSG_OP_AIE_COREDUMP = 0x30010,
AIE4_MSG_OP_CREATE_VFS = 0x20001,
AIE4_MSG_OP_DESTROY_VFS = 0x20002,
@@ -144,4 +145,16 @@ struct aie4_msg_attach_work_buffer_resp {
enum aie4_msg_status status;
} __packed;
+struct aie4_msg_aie_coredump_req {
+ __u32 context_id;
+ __u32 pasid;
+ __u32 num_buffers;
+ __u32 resvd;
+ __u64 buffer_list_addr;
+} __packed;
+
+struct aie4_msg_aie_coredump_resp {
+ enum aie4_msg_status status;
+} __packed;
+
#endif /* _AIE4_MSG_PRIV_H_ */
diff --git a/drivers/accel/amdxdna/aie4_pci.c b/drivers/accel/amdxdna/aie4_pci.c
index a58a83af42a4..1b5584d6e539 100644
--- a/drivers/accel/amdxdna/aie4_pci.c
+++ b/drivers/accel/amdxdna/aie4_pci.c
@@ -4,6 +4,7 @@
*/
#include <drm/amdxdna_accel.h>
+#include <drm/drm_drv.h>
#include <drm/drm_managed.h>
#include <drm/drm_print.h>
#include <linux/firmware.h>
@@ -15,6 +16,7 @@
#include "amdxdna_mailbox.h"
#include "amdxdna_mailbox_helper.h"
#include "amdxdna_pci_drv.h"
+#include "amdxdna_pm.h"
#define NO_IOHUB 0
#define PSP_NOTIFY_INTR 0xD007BE11
@@ -512,6 +514,7 @@ static int aie4m_pcidev_init(struct amdxdna_dev *xdna)
if (ret)
return ret;
+ aie4_msg_init(ndev);
amdxdna_vbnv_init(xdna);
XDNA_DBG(xdna, "init finished");
@@ -573,35 +576,60 @@ static int aie4_get_info(struct amdxdna_client *client, struct amdxdna_drm_get_i
static int aie4_alloc_work_buffer(struct amdxdna_dev_hdl *ndev)
{
struct amdxdna_dev *xdna = ndev->aie.xdna;
- u32 buf_size = AIE4_WORK_BUFFER_MIN_SIZE;
- ndev->work_buf = amdxdna_alloc_msg_buffer(xdna, &buf_size,
- &ndev->work_buf_addr);
- if (IS_ERR(ndev->work_buf)) {
- int ret = PTR_ERR(ndev->work_buf);
+ ndev->work_buf_hdl = amdxdna_alloc_msg_buffer(xdna, AIE4_WORK_BUFFER_MIN_SIZE);
+ if (IS_ERR(ndev->work_buf_hdl)) {
+ int ret = PTR_ERR(ndev->work_buf_hdl);
XDNA_ERR(xdna, "Failed to alloc work buffer, size 0x%x",
AIE4_WORK_BUFFER_MIN_SIZE);
- ndev->work_buf = NULL;
+ ndev->work_buf_hdl = NULL;
return ret;
}
- ndev->work_buf_size = buf_size;
- XDNA_DBG(xdna, "Work buffer allocated: size 0x%x", buf_size);
+ XDNA_DBG(xdna, "Work buffer allocated: size 0x%x",
+ to_buf_size(ndev->work_buf_hdl));
return 0;
}
static void aie4_free_work_buffer(struct amdxdna_dev_hdl *ndev)
{
- struct amdxdna_dev *xdna = ndev->aie.xdna;
-
- if (!ndev->work_buf)
+ if (!ndev->work_buf_hdl)
return;
- amdxdna_free_msg_buffer(xdna, ndev->work_buf_size, ndev->work_buf,
- ndev->work_buf_addr);
- ndev->work_buf = NULL;
+ amdxdna_free_msg_buffer(ndev->work_buf_hdl);
+ ndev->work_buf_hdl = NULL;
+}
+
+static int aie4_get_array(struct amdxdna_client *client,
+ struct amdxdna_drm_get_array *args)
+{
+ struct amdxdna_dev_hdl *ndev = client->xdna->dev_handle;
+ struct amdxdna_dev *xdna = client->xdna;
+ int ret, idx;
+
+ if (!drm_dev_enter(&xdna->ddev, &idx))
+ return -ENODEV;
+
+ ret = amdxdna_pm_resume_get_locked(xdna);
+ if (ret)
+ goto dev_exit;
+
+ switch (args->param) {
+ case DRM_AMDXDNA_AIE_COREDUMP:
+ ret = amdxdna_get_coredump(&ndev->aie, args);
+ break;
+ default:
+ ret = -EOPNOTSUPP;
+ break;
+ }
+
+ amdxdna_pm_suspend_put(xdna);
+
+dev_exit:
+ drm_dev_exit(idx);
+ return ret;
}
static int aie4_pf_init(struct amdxdna_dev *xdna)
@@ -664,4 +692,5 @@ const struct amdxdna_dev_ops aie4_vf_ops = {
.mmap = aie4_doorbell_mmap,
.cmd_wait = aie4_cmd_wait,
.get_aie_info = aie4_get_info,
+ .get_array = aie4_get_array,
};
diff --git a/drivers/accel/amdxdna/aie4_pci.h b/drivers/accel/amdxdna/aie4_pci.h
index 3fd5eace3ed7..f5efeaf1de38 100644
--- a/drivers/accel/amdxdna/aie4_pci.h
+++ b/drivers/accel/amdxdna/aie4_pci.h
@@ -54,16 +54,23 @@ struct amdxdna_dev_hdl {
struct xarray cert_comp_xa; /* device level indexed by msix id */
struct mutex cert_comp_lock; /* protects cert_comp operations*/
- void *work_buf;
- dma_addr_t work_buf_addr;
- u32 work_buf_size;
+ struct amdxdna_msg_buf_hdl *work_buf_hdl;
+};
+
+enum aie4_fw_feature {
+ AIE4_GET_COREDUMP,
+ AIE4_FEATURE_MAX
};
/* aie4_message.c */
+void aie4_msg_init(struct amdxdna_dev_hdl *ndev);
int aie4_query_aie_metadata(struct amdxdna_dev_hdl *ndev,
struct amdxdna_drm_query_aie_metadata *metadata);
int aie4_suspend_fw(struct amdxdna_dev_hdl *ndev);
int aie4_attach_work_buffer(struct amdxdna_dev_hdl *ndev);
+int aie4_get_aie_coredump(struct amdxdna_hwctx *hwctx,
+ struct amdxdna_msg_buf_hdl *list_hdl,
+ u32 num_bufs);
/* aie4_ctx.c */
int aie4_hwctx_init(struct amdxdna_hwctx *hwctx);
diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h b/drivers/accel/amdxdna/amdxdna_ctx.h
index aaae16430466..4b8f54a3e35f 100644
--- a/drivers/accel/amdxdna/amdxdna_ctx.h
+++ b/drivers/accel/amdxdna/amdxdna_ctx.h
@@ -6,6 +6,7 @@
#ifndef _AMDXDNA_CTX_H_
#define _AMDXDNA_CTX_H_
+#include <drm/gpu_scheduler.h>
#include <linux/bitfield.h>
#include "amdxdna_gem.h"
diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.c b/drivers/accel/amdxdna/amdxdna_pci_drv.c
index c677293c1ae7..d4e320518a37 100644
--- a/drivers/accel/amdxdna/amdxdna_pci_drv.c
+++ b/drivers/accel/amdxdna/amdxdna_pci_drv.c
@@ -41,9 +41,10 @@ MODULE_FIRMWARE("amdnpu/17f0_11/npu_7.sbin");
* 0.8: Support BO usage query
* 0.9: Add new device type AMDXDNA_DEV_TYPE_PF
* 0.10: Support AIE4 UMQ
+ * 0.11: Support AIE coredump
*/
#define AMDXDNA_DRIVER_MAJOR 0
-#define AMDXDNA_DRIVER_MINOR 10
+#define AMDXDNA_DRIVER_MINOR 11
/*
* Bind the driver base on (vendor_id, device_id) pair and later use the
diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.h b/drivers/accel/amdxdna/amdxdna_pci_drv.h
index 34271c14d359..b6a6578e8404 100644
--- a/drivers/accel/amdxdna/amdxdna_pci_drv.h
+++ b/drivers/accel/amdxdna/amdxdna_pci_drv.h
@@ -47,6 +47,7 @@ struct amdxdna_drm_set_state;
struct amdxdna_gem_obj;
struct amdxdna_hwctx;
struct amdxdna_sched_job;
+struct amdxdna_msg_buf_hdl;
/*
* struct amdxdna_dev_ops - Device hardware operation callbacks
@@ -173,6 +174,9 @@ struct amdxdna_client {
#define amdxdna_for_each_hwctx(client, hwctx_id, entry) \
xa_for_each(&(client)->hwctx_xa, hwctx_id, entry)
+#define amdxdna_for_each_client(xdna, client) \
+ list_for_each_entry(client, &(xdna)->client_list, node)
+
/* Add device info below */
extern const struct amdxdna_dev_info dev_npu1_info;
extern const struct amdxdna_dev_info dev_npu3_pf_info;
diff --git a/drivers/accel/amdxdna/npu3_regs.c b/drivers/accel/amdxdna/npu3_regs.c
index d76b2e99c308..e487960e3ccf 100644
--- a/drivers/accel/amdxdna/npu3_regs.c
+++ b/drivers/accel/amdxdna/npu3_regs.c
@@ -39,6 +39,7 @@
static const struct amdxdna_fw_feature_tbl npu3_fw_feature_table[] = {
{ .major = 5, .min_minor = 10 },
+ { .features = BIT_U64(AIE4_GET_COREDUMP), .major = 5, .min_minor = 24 },
{ 0 }
};
diff --git a/include/uapi/drm/amdxdna_accel.h b/include/uapi/drm/amdxdna_accel.h
index 51a507561df6..db583de58ebb 100644
--- a/include/uapi/drm/amdxdna_accel.h
+++ b/include/uapi/drm/amdxdna_accel.h
@@ -635,11 +635,24 @@ struct amdxdna_drm_bo_usage {
__u64 heap_usage;
};
+/**
+ * struct amdxdna_drm_aie_coredump - The data for AIE coredump
+ */
+struct amdxdna_drm_aie_coredump {
+ /** @pid: The Process ID of the process that created this context.*/
+ __u64 pid;
+ /** @context_id: Context ID. */
+ __u32 context_id;
+ /** @pad: MBZ. */
+ __u32 pad;
+};
+
/*
* Supported params in struct amdxdna_drm_get_array
*/
#define DRM_AMDXDNA_HW_CONTEXT_ALL 0
#define DRM_AMDXDNA_HW_LAST_ASYNC_ERR 2
+#define DRM_AMDXDNA_AIE_COREDUMP 5
#define DRM_AMDXDNA_BO_USAGE 6
/**
@@ -657,6 +670,19 @@ struct amdxdna_drm_get_array {
* %DRM_AMDXDNA_HW_LAST_ASYNC_ERR:
* Returns last async error.
*
+ * %DRM_AMDXDNA_AIE_COREDUMP:
+ * Returns AIE tile memory dump for a hardware context.
+ *
+ * Input: num_element must be 1. buffer points to a user buffer whose
+ * size is element_size bytes. The first sizeof(struct
+ * amdxdna_drm_aie_coredump) bytes of buffer carry the request
+ * (pid + context_id). On success the driver writes rows * cols * 1 MB
+ * of tile dump data into buffer. If the buffer is too small the
+ * driver sets element_size to the required size and returns -ENOSPC.
+ *
+ * Access: context owners may coredump their own contexts;
+ * CAP_SYS_ADMIN may coredump any context.
+ *
* %DRM_AMDXDNA_BO_USAGE:
* Returns usage of heap/internal/external BOs.
*/
--
2.34.1
^ permalink raw reply related [flat|nested] 3+ messages in thread* Claude review: accel/amdxdna: add AIE coredump support
2026-05-26 17:29 [PATCH V1] accel/amdxdna: add AIE coredump support Lizhi Hou
@ 2026-05-27 4:28 ` Claude Code Review Bot
2026-05-27 4:28 ` Claude Code Review Bot
1 sibling, 0 replies; 3+ messages in thread
From: Claude Code Review Bot @ 2026-05-27 4:28 UTC (permalink / raw)
To: dri-devel-reviews
Overall Series Review
Subject: accel/amdxdna: add AIE coredump support
Author: Lizhi Hou <lizhi.hou@amd.com>
Patches: 1
Reviewed: 2026-05-27T14:28:07.674734
---
This is a single patch adding AIE coredump support to the `accel/amdxdna` driver, allowing per-context tile memory snapshots for post-mortem hardware debugging. The patch mixes two distinct changes: (1) refactoring the `amdxdna_alloc_msg_buffer`/`amdxdna_free_msg_buffer` API to use an opaque handle (`amdxdna_msg_buf_hdl`), and (2) adding the coredump ioctl and firmware message plumbing. These should arguably be separate patches for easier review and bisection.
The overall design is reasonable: the coredump walks all clients and hardware contexts under `dev_lock` + SRCU protection, sends a firmware message, and copies results to userspace. The permission model (own-context or `CAP_SYS_ADMIN`) is appropriate.
There are several concerns, ranging from a potential bug in the zero-buffers edge case to UAPI documentation issues, detailed below.
---
---
Generated by Claude Code Patch Reviewer
^ permalink raw reply [flat|nested] 3+ messages in thread
* Claude review: accel/amdxdna: add AIE coredump support
2026-05-26 17:29 [PATCH V1] accel/amdxdna: add AIE coredump support Lizhi Hou
2026-05-27 4:28 ` Claude review: " Claude Code Review Bot
@ 2026-05-27 4:28 ` Claude Code Review Bot
1 sibling, 0 replies; 3+ messages in thread
From: Claude Code Review Bot @ 2026-05-27 4:28 UTC (permalink / raw)
To: dri-devel-reviews
Patch Review
**1. Should be split into two patches**
This patch combines a mechanical API refactor (buffer handle abstraction) with new functionality (coredump ioctl). The buffer handle refactoring touches `aie2_error.c`, `aie2_message.c`, `aie4_pci.c`, and converts every existing caller of `amdxdna_alloc_msg_buffer` / `amdxdna_free_msg_buffer`. The coredump feature adds the new ioctl parameter, message types, and the coredump logic. Splitting these would make bisection and review much easier.
**2. Zero-buffer edge case in `amdxdna_get_coredump_cb`**
In `aie.c`, the number of buffers is computed as:
```c
num_bufs = (hwctx->num_col - hwctx->num_unused_col) * aie->metadata.rows;
```
If `num_col == num_unused_col`, `num_bufs` is 0, leading to `total_size = 0`. The `element_size < total_size` check passes trivially, then we allocate a 0-element `buf_list` and 0-element `data_hdls` array, and send a firmware message with `num_bufs = 0`. This is likely not the intended behavior. A guard for `num_bufs == 0` should be added, returning an appropriate error or short-circuiting.
**3. Large kernel memory allocation without limits**
Each data buffer is 1MB:
```c
size_t data_buf_size = SZ_1M;
```
These are allocated in a loop:
```c
for (i = 0; i < num_bufs; i++) {
data_hdls[i] = amdxdna_alloc_msg_buffer(xdna, data_buf_size);
```
`num_bufs` is `(num_col - num_unused_col) * metadata.rows`. While these values are hardware-determined, there is no explicit cap on total allocation. If a context has many active columns and rows, this could result in a very large kernel memory allocation. Consider adding a sanity check or upper bound on `num_bufs` or `total_size`, or documenting why the hardware-imposed limits are sufficient.
**4. Permission check uses euid comparison, not pid**
The permission check in `amdxdna_get_coredump_cb`:
```c
if (!capable(CAP_SYS_ADMIN) &&
!uid_eq(current_euid(), hwctx->client->filp->filp->f_cred->euid)) {
XDNA_ERR(xdna, "Permission denied for context %u", wa->ctx_id);
return -EPERM;
}
```
This checks the effective UID of the caller against the UID of the DRM file that created the context. Note that the context is already filtered by `pid` and `ctx_id` earlier in the same function:
```c
if (hwctx->client->pid != wa->pid || hwctx->id != wa->ctx_id)
return 0;
```
The PID match is based on user-supplied `config.pid`, which the user can set to any value. The real access control relies on the euid comparison, which is appropriate. However, this means any process running under the same UID (not just the owning process) can coredump the context, which may be intentional but worth documenting explicitly.
**5. UAPI: buffer serves dual purpose (input and output) without clear documentation**
The `DRM_AMDXDNA_AIE_COREDUMP` documentation says:
```
Input: num_element must be 1. buffer points to a user buffer whose
size is element_size bytes. The first sizeof(struct
amdxdna_drm_aie_coredump) bytes of buffer carry the request
(pid + context_id). On success the driver writes rows * cols * 1 MB
of tile dump data into buffer.
```
So `buffer` is used as both input (first 16 bytes for the config struct) and output (overwritten with coredump data). The `element_size` field has dual meaning: on input it must be large enough for the config *and* large enough for the output; on -ENOSPC the driver writes back the required output size, which will be much larger than `sizeof(amdxdna_drm_aie_coredump)`. This pattern works but is unusual. The documentation should clarify more explicitly that the config is consumed first and the buffer is then fully overwritten.
**6. `amdxdna_coredump_buf_entry` uses UAPI types in a kernel-internal header**
```c
struct amdxdna_coredump_buf_entry {
__u64 buf_addr;
__u32 buf_size;
__u32 reserved;
} __packed;
```
This struct in `aie.h` is kernel-internal (used for firmware communication), but uses `__u64`/`__u32` (UAPI types) instead of kernel `u64`/`u32`. Minor style issue — kernel-internal structs should use the kernel types.
**7. Error semantics change in `amdxdna_alloc_msg_buffer`**
The old code returned `ERR_PTR(-EINVAL)` when `order > MAX_PAGE_ORDER`; the new code returns `ERR_PTR(-ENOMEM)`:
```c
if (order > MAX_PAGE_ORDER)
goto free_hdl;
...
free_hdl:
kfree(hdl);
return ERR_PTR(-ENOMEM);
```
ENOMEM is arguably more appropriate, but this is a behavioral change for existing callers. If any caller distinguishes EINVAL from ENOMEM, this could cause issues. In practice, all callers just check `IS_ERR()`, so this is fine but worth noting.
**8. `to_buf_size` format specifier mismatch**
In `aie2_error.c`, the log format uses `%x` with `to_buf_size()`:
```c
XDNA_DBG(xdna, "Async event count %d, buf total size 0x%x",
events->event_cnt, to_buf_size(events->hdl));
```
`to_buf_size()` returns `hdl->size` which is `u32`, so `%x` is correct. No issue here.
**9. `aie4_get_array` only handles coredump**
The new `aie4_get_array` function only handles `DRM_AMDXDNA_AIE_COREDUMP`:
```c
switch (args->param) {
case DRM_AMDXDNA_AIE_COREDUMP:
ret = amdxdna_get_coredump(&ndev->aie, args);
break;
default:
ret = -EOPNOTSUPP;
break;
}
```
This is being added to `aie4_vf_ops`. Previously VFs had no `get_array` at all (would return `-EOPNOTSUPP` from the ioctl handler). Now VFs only support coredump. This seems intentional but if other `get_array` parameters should be supported on VFs in the future, they'll need to be added here too.
**10. `amdxdna_drm_aie_coredump` struct: `pid` is `__u64` but kernel `pid_t` is 32-bit**
```c
struct amdxdna_drm_aie_coredump {
__u64 pid;
__u32 context_id;
__u32 pad;
};
```
`pid` is `__u64` in the UAPI struct, but `amdxdna_client.pid` is `pid_t` (32-bit `int`). The comparison `hwctx->client->pid != wa->pid` promotes the `pid_t` to `u64` correctly, so there's no functional bug. However, using `__u64` for a PID is unusual — `__u32` would be more conventional and match the kernel type width. Since this is a new UAPI, it should be correct from the start. Consider using `__u32` for `pid` and adding a second `__u32 pad2` or restructuring. Alternatively, the `__u64` pid could be intentional for alignment or future-proofing, but it should be documented.
**11. Minor: `#include <linux/cred.h>` may not be needed**
In `aie.c`, `#include <linux/cred.h>` is added for `current_euid()` and `uid_eq()`. These are typically available through other headers that are likely already included transitively. Not a correctness issue, but explicit includes for what you use is good practice.
**12. Feature table entry in `npu3_regs.c`**
```c
static const struct amdxdna_fw_feature_tbl npu3_fw_feature_table[] = {
{ .major = 5, .min_minor = 10 },
{ .features = BIT_U64(AIE4_GET_COREDUMP), .major = 5, .min_minor = 24 },
{ 0 }
};
```
This correctly gates the coredump feature behind firmware version 5.24+ for NPU3. The sentinel `{ 0 }` terminates the table. This looks correct.
---
**Summary**: The patch is functionally sound for the main path but should (a) be split into refactoring and feature patches, (b) handle the `num_bufs == 0` edge case, and (c) consider whether `__u64` for `pid` in the UAPI is the right choice before the ABI is frozen. The permission model and locking are correct.
---
Generated by Claude Code Patch Reviewer
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2026-05-27 4:28 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-05-26 17:29 [PATCH V1] accel/amdxdna: add AIE coredump support Lizhi Hou
2026-05-27 4:28 ` Claude review: " Claude Code Review Bot
2026-05-27 4:28 ` Claude Code Review Bot
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox