* [PATCH V2] accel/amdxdna: Add expandable device heap support
@ 2026-05-15 16:19 Lizhi Hou
2026-05-15 21:01 ` Mario Limonciello
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: Lizhi Hou @ 2026-05-15 16:19 UTC (permalink / raw)
To: ogabbay, quic_jhugo, dri-devel, mario.limonciello,
karol.wachowski
Cc: Lizhi Hou, linux-kernel, max.zhen, sonal.santan, Wendy Liang
Introduce an expandable device heap to avoid allocating a large heap
upfront. Start with a smaller initial heap and grow it on demand.
Return -EAGAIN when BO allocation fails due to insufficient heap space,
allowing userspace to trigger heap expansion via a heap BO creation
IOCTL and retry the allocation.
Manage heap chunks using an xarray. On expansion, register new chunks
with the firmware via MSG_OP_ADD_HOST_BUFFER.
Since heap shrinking is not supported by the firmware, release all heap
chunks on device close.
Co-developed-by: Wendy Liang <wendy.liang@amd.com>
Signed-off-by: Wendy Liang <wendy.liang@amd.com>
Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
---
v2:
Fix memory leak when initializing hardware context failed.
For flush BO, only flush the requested area.
drivers/accel/amdxdna/aie2_ctx.c | 45 +++-
drivers/accel/amdxdna/aie2_message.c | 52 +++-
drivers/accel/amdxdna/aie2_msg_priv.h | 1 +
drivers/accel/amdxdna/aie2_pci.c | 1 +
drivers/accel/amdxdna/aie2_pci.h | 8 +-
drivers/accel/amdxdna/amdxdna_ctx.c | 88 ++++++-
drivers/accel/amdxdna/amdxdna_ctx.h | 2 +
drivers/accel/amdxdna/amdxdna_gem.c | 319 ++++++++++++++++++------
drivers/accel/amdxdna/amdxdna_gem.h | 16 +-
drivers/accel/amdxdna/amdxdna_pci_drv.c | 12 +-
drivers/accel/amdxdna/amdxdna_pci_drv.h | 8 +-
drivers/accel/amdxdna/npu1_regs.c | 1 +
drivers/accel/amdxdna/npu4_regs.c | 2 +
drivers/accel/amdxdna/npu5_regs.c | 1 +
drivers/accel/amdxdna/npu6_regs.c | 1 +
15 files changed, 460 insertions(+), 97 deletions(-)
diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c
index 7d6094aefb6f..658a5fb1fda6 100644
--- a/drivers/accel/amdxdna/aie2_ctx.c
+++ b/drivers/accel/amdxdna/aie2_ctx.c
@@ -91,6 +91,7 @@ static void aie2_hwctx_stop(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hwct
static int aie2_hwctx_restart(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hwctx)
{
struct amdxdna_gem_obj *heap = hwctx->priv->heap;
+ unsigned long heap_id;
int ret;
ret = aie2_create_context(xdna->dev_handle, hwctx);
@@ -107,6 +108,17 @@ static int aie2_hwctx_restart(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hw
goto out;
}
+ xa_for_each_range(&hwctx->client->dev_heap_xa, heap_id, heap, 1,
+ hwctx->last_attached_heap) {
+ ret = aie2_add_host_buf(xdna->dev_handle, hwctx->fw_ctx_id,
+ amdxdna_obj_dma_addr(heap),
+ heap->mem.size);
+ if (ret) {
+ XDNA_ERR(xdna, "Add heap %ld failed ret %d", heap_id, ret);
+ goto out;
+ }
+ }
+
ret = aie2_config_cu(hwctx, NULL);
if (ret) {
XDNA_ERR(xdna, "Config cu failed, ret %d", ret);
@@ -650,7 +662,7 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
hwctx->priv = priv;
mutex_lock(&client->mm_lock);
- heap = client->dev_heap;
+ heap = xa_load(&client->dev_heap_xa, 0);
if (!heap) {
XDNA_ERR(xdna, "The client dev heap object not exist");
mutex_unlock(&client->mm_lock);
@@ -732,6 +744,12 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
goto release_resource;
}
+ ret = amdxdna_update_heap(client, hwctx);
+ if (ret) {
+ XDNA_ERR(xdna, "Update heap failed, ret %d", ret);
+ goto release_resource;
+ }
+
ret = aie2_ctx_syncobj_create(hwctx);
if (ret) {
XDNA_ERR(xdna, "Create syncobj failed, ret %d", ret);
@@ -1161,3 +1179,28 @@ void aie2_hmm_invalidate(struct amdxdna_gem_obj *abo,
else if (ret == -ERESTARTSYS)
XDNA_DBG(xdna, "Wait for bo interrupted by signal");
}
+
+int aie2_hwctx_heap_expand(struct amdxdna_hwctx *hwctx,
+ struct amdxdna_gem_obj *heap)
+{
+ struct amdxdna_client *client = hwctx->client;
+ struct amdxdna_dev *xdna = client->xdna;
+ u64 addr;
+ int ret;
+
+ ret = amdxdna_pm_resume_get_locked(xdna);
+ if (ret)
+ return ret;
+
+ addr = amdxdna_obj_dma_addr(heap);
+ ret = aie2_add_host_buf(xdna->dev_handle, hwctx->fw_ctx_id,
+ addr, heap->mem.size);
+ if (ret) {
+ XDNA_ERR(xdna, "Add heap failed hwctx %s 0x%lx ret %d",
+ hwctx->name, heap->mem.size, ret);
+ }
+
+ amdxdna_pm_suspend_put(xdna);
+
+ return ret;
+}
diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/amdxdna/aie2_message.c
index 1504295a21e4..c4b364801cc0 100644
--- a/drivers/accel/amdxdna/aie2_message.c
+++ b/drivers/accel/amdxdna/aie2_message.c
@@ -301,25 +301,59 @@ int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwc
return ret;
}
-int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size)
+static int aie2_send_host_buf_msgs(struct amdxdna_dev_hdl *ndev, u32 context_id,
+ u64 addr, u64 size, u32 initial_opcode)
{
DECLARE_AIE_MSG(map_host_buffer, MSG_OP_MAP_HOST_BUFFER);
struct amdxdna_dev *xdna = ndev->aie.xdna;
+ size_t chunk_size;
int ret;
- req.context_id = context_id;
- req.buf_addr = addr;
- req.buf_size = size;
- ret = aie_send_mgmt_msg_wait(&ndev->aie, &msg);
- if (ret)
- return ret;
+ chunk_size = xdna->dev_info->dev_mem_size;
+ if (!size || !IS_ALIGNED(size, chunk_size)) {
+ XDNA_ERR(xdna, "Invalid size 0x%llx for chunk 0x%lx",
+ size, chunk_size);
+ return -EINVAL;
+ }
- XDNA_DBG(xdna, "fw ctx %d map host buf addr 0x%llx size 0x%llx",
- context_id, addr, size);
+ msg.opcode = initial_opcode;
+ do {
+ req.context_id = context_id;
+ req.buf_addr = addr;
+ req.buf_size = chunk_size;
+ ret = aie_send_mgmt_msg_wait(&ndev->aie, &msg);
+ if (ret) {
+ XDNA_ERR(xdna, "fw ctx %d addr 0x%llx size 0x%lx",
+ context_id, addr, chunk_size);
+ return ret;
+ }
+
+ XDNA_DBG(xdna, "fw ctx %d host buf op 0x%x addr 0x%llx size 0x%lx",
+ context_id, msg.opcode, addr, chunk_size);
+
+ addr += chunk_size;
+ size -= chunk_size;
+ msg.opcode = MSG_OP_ADD_HOST_BUFFER;
+ } while (size);
return 0;
}
+int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size)
+{
+ return aie2_send_host_buf_msgs(ndev, context_id, addr, size,
+ MSG_OP_MAP_HOST_BUFFER);
+}
+
+int aie2_add_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size)
+{
+ if (!AIE_FEATURE_ON(&ndev->aie, AIE2_ADD_HOST_BUFFER))
+ return -EOPNOTSUPP;
+
+ return aie2_send_host_buf_msgs(ndev, context_id, addr, size,
+ MSG_OP_ADD_HOST_BUFFER);
+}
+
static int amdxdna_hwctx_col_map(struct amdxdna_hwctx *hwctx, void *arg)
{
u32 *bitmap = arg;
diff --git a/drivers/accel/amdxdna/aie2_msg_priv.h b/drivers/accel/amdxdna/aie2_msg_priv.h
index a41c9797e265..fd65a4236d49 100644
--- a/drivers/accel/amdxdna/aie2_msg_priv.h
+++ b/drivers/accel/amdxdna/aie2_msg_priv.h
@@ -33,6 +33,7 @@ enum aie2_msg_opcode {
MSG_OP_REGISTER_ASYNC_EVENT_MSG = 0x10C,
MSG_OP_UPDATE_PROPERTY = 0x113,
MSG_OP_GET_APP_HEALTH = 0x114,
+ MSG_OP_ADD_HOST_BUFFER = 0x115,
MSG_OP_GET_DEV_REVISION = 0x117,
MSG_OP_MAX_DRV_OPCODE,
MSG_OP_GET_PROTOCOL_VERSION = 0x301,
diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c
index 6c8a0f70b73d..c4d345d4c76b 100644
--- a/drivers/accel/amdxdna/aie2_pci.c
+++ b/drivers/accel/amdxdna/aie2_pci.c
@@ -1241,4 +1241,5 @@ const struct amdxdna_dev_ops aie2_ops = {
.hmm_invalidate = aie2_hmm_invalidate,
.get_array = aie2_get_array,
.get_dev_revision = aie2_get_dev_rev,
+ .hwctx_heap_expand = aie2_hwctx_heap_expand,
};
diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h
index 84bdb3f8b8f9..77648cc548b6 100644
--- a/drivers/accel/amdxdna/aie2_pci.h
+++ b/drivers/accel/amdxdna/aie2_pci.h
@@ -15,8 +15,9 @@
#include "amdxdna_mailbox.h"
/* Firmware determines device memory base address and size */
-#define AIE2_DEVM_BASE 0x4000000
-#define AIE2_DEVM_SIZE SZ_64M
+#define AIE2_DEVM_BASE 0x4000000
+#define AIE2_DEVM_SIZE SZ_64M
+#define AIE2_DEVM_MAX_SIZE SZ_512M
#define NDEV2PDEV(ndev) (to_pci_dev((ndev)->aie.xdna->ddev.dev))
@@ -198,6 +199,7 @@ enum aie2_fw_feature {
AIE2_PREEMPT,
AIE2_TEMPORAL_ONLY,
AIE2_APP_HEALTH,
+ AIE2_ADD_HOST_BUFFER,
AIE2_UPDATE_PROPERTY,
AIE2_GET_DEV_REVISION,
AIE2_FEATURE_MAX
@@ -271,6 +273,7 @@ int aie2_get_dev_revision(struct amdxdna_dev_hdl *ndev, enum aie2_dev_revision *
int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size);
+int aie2_add_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size);
int aie2_query_status(struct amdxdna_dev_hdl *ndev, char __user *buf, u32 size, u32 *cols_filled);
int aie2_query_telemetry(struct amdxdna_dev_hdl *ndev,
char __user *buf, u32 size,
@@ -302,5 +305,6 @@ void aie2_hwctx_suspend(struct amdxdna_client *client);
int aie2_hwctx_resume(struct amdxdna_client *client);
int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, u64 *seq);
void aie2_hmm_invalidate(struct amdxdna_gem_obj *abo, unsigned long cur_seq);
+int aie2_hwctx_heap_expand(struct amdxdna_hwctx *hwctx, struct amdxdna_gem_obj *heap);
#endif /* _AIE2_PCI_H_ */
diff --git a/drivers/accel/amdxdna/amdxdna_ctx.c b/drivers/accel/amdxdna/amdxdna_ctx.c
index b79229a63af3..34466916a6ab 100644
--- a/drivers/accel/amdxdna/amdxdna_ctx.c
+++ b/drivers/accel/amdxdna/amdxdna_ctx.c
@@ -61,16 +61,35 @@ static struct dma_fence *amdxdna_fence_create(struct amdxdna_hwctx *hwctx)
return &fence->base;
}
+static void amdxdna_hwctx_release_expanded_heap(struct amdxdna_hwctx *hwctx)
+{
+ struct amdxdna_client *client = hwctx->client;
+ struct amdxdna_gem_obj *heap;
+ unsigned long heap_id;
+
+ mutex_lock(&client->mm_lock);
+ if (hwctx->last_attached_heap) {
+ xa_for_each_range(&client->dev_heap_xa, heap_id, heap, 1,
+ hwctx->last_attached_heap) {
+ amdxdna_gem_unpin(heap);
+ drm_gem_object_put(to_gobj(heap));
+ }
+ }
+ mutex_unlock(&client->mm_lock);
+}
+
static void amdxdna_hwctx_destroy_rcu(struct amdxdna_hwctx *hwctx,
struct srcu_struct *ss)
{
- struct amdxdna_dev *xdna = hwctx->client->xdna;
+ struct amdxdna_client *client = hwctx->client;
+ struct amdxdna_dev *xdna = client->xdna;
synchronize_srcu(ss);
/* At this point, user is not able to submit new commands */
xdna->dev_info->ops->hwctx_fini(hwctx);
+ amdxdna_hwctx_release_expanded_heap(hwctx);
kfree(hwctx->name);
kfree(hwctx);
}
@@ -238,7 +257,7 @@ int amdxdna_drm_create_hwctx_ioctl(struct drm_device *dev, void *data, struct dr
ret = xdna->dev_info->ops->hwctx_init(hwctx);
if (ret) {
XDNA_ERR(xdna, "Init hwctx failed, ret %d", ret);
- goto dev_exit;
+ goto release_expanded_heap;
}
hwctx->name = kasprintf(GFP_KERNEL, "hwctx.%d.%d", client->pid, hwctx->fw_ctx_id);
@@ -269,7 +288,8 @@ int amdxdna_drm_create_hwctx_ioctl(struct drm_device *dev, void *data, struct dr
kfree(hwctx->name);
fini_hwctx:
xdna->dev_info->ops->hwctx_fini(hwctx);
-dev_exit:
+release_expanded_heap:
+ amdxdna_hwctx_release_expanded_heap(hwctx);
drm_dev_exit(idx);
free_hwctx:
kfree(hwctx);
@@ -407,6 +427,68 @@ int amdxdna_hwctx_sync_debug_bo(struct amdxdna_client *client, u32 debug_bo_hdl)
return ret;
}
+static int amdxdna_hwctx_expand_heap(struct amdxdna_hwctx *hwctx)
+{
+ struct amdxdna_client *client = hwctx->client;
+ struct amdxdna_dev *xdna = client->xdna;
+ struct amdxdna_gem_obj *heap;
+ unsigned long heap_id, nid;
+ int ret = 0;
+
+ nid = hwctx->last_attached_heap + 1;
+ if (nid == client->dev_heap_nid)
+ goto out;
+
+ if (!xdna->dev_info->ops->hwctx_heap_expand) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ xa_for_each_range(&client->dev_heap_xa, heap_id, heap,
+ nid, client->dev_heap_nid) {
+ drm_gem_object_get(to_gobj(heap));
+ ret = amdxdna_gem_pin(heap);
+ if (ret) {
+ drm_gem_object_put(to_gobj(heap));
+ break;
+ }
+
+ mutex_unlock(&client->mm_lock);
+ ret = xdna->dev_info->ops->hwctx_heap_expand(hwctx, heap);
+ mutex_lock(&client->mm_lock);
+ if (ret) {
+ amdxdna_gem_unpin(heap);
+ drm_gem_object_put(to_gobj(heap));
+ break;
+ }
+
+ hwctx->last_attached_heap = heap_id;
+ }
+
+out:
+ return ret;
+}
+
+int amdxdna_update_heap(struct amdxdna_client *client, struct amdxdna_hwctx *hwctx)
+{
+ unsigned long hwctx_id;
+ int ret;
+
+ if (hwctx) {
+ guard(mutex)(&client->mm_lock);
+ return amdxdna_hwctx_expand_heap(hwctx);
+ }
+
+ guard(mutex)(&client->mm_lock);
+ amdxdna_for_each_hwctx(client, hwctx_id, hwctx) {
+ ret = amdxdna_hwctx_expand_heap(hwctx);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
static void
amdxdna_arg_bos_put(struct amdxdna_sched_job *job)
{
diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h b/drivers/accel/amdxdna/amdxdna_ctx.h
index 6e3c6371a088..aaae16430466 100644
--- a/drivers/accel/amdxdna/amdxdna_ctx.h
+++ b/drivers/accel/amdxdna/amdxdna_ctx.h
@@ -109,6 +109,7 @@ struct amdxdna_hwctx {
u32 umq_bo_hdl;
u32 doorbell_offset;
u32 num_unused_col;
+ u32 last_attached_heap;
struct amdxdna_qos_info qos;
struct amdxdna_hwctx_param_config_cu *cus;
@@ -205,6 +206,7 @@ void amdxdna_hwctx_remove_all(struct amdxdna_client *client);
int amdxdna_hwctx_walk(struct amdxdna_client *client, void *arg,
int (*walk)(struct amdxdna_hwctx *hwctx, void *arg));
int amdxdna_hwctx_sync_debug_bo(struct amdxdna_client *client, u32 debug_bo_hdl);
+int amdxdna_update_heap(struct amdxdna_client *client, struct amdxdna_hwctx *hwctx);
int amdxdna_cmd_submit(struct amdxdna_client *client,
struct amdxdna_drv_cmd *drv_cmd, u32 cmd_bo_hdls,
diff --git a/drivers/accel/amdxdna/amdxdna_gem.c b/drivers/accel/amdxdna/amdxdna_gem.c
index 6087264ba1b5..97a521401a46 100644
--- a/drivers/accel/amdxdna/amdxdna_gem.c
+++ b/drivers/accel/amdxdna/amdxdna_gem.c
@@ -24,6 +24,77 @@
MODULE_IMPORT_NS("DMA_BUF");
+/*
+ * The dev BO could be across multiple heap BO chunks. The heap chunks should
+ * be mapped to userspace and the userspace virtual address should be
+ * contiguous.
+ */
+static int
+amdxdna_init_dev_bo(struct amdxdna_gem_obj *dev_bo)
+{
+ struct amdxdna_client *client = dev_bo->client;
+ struct amdxdna_dev *xdna = client->xdna;
+ struct amdxdna_gem_obj *heap;
+ u64 heap_addr, exp_heap_uva;
+ u32 heap_id;
+
+ if (xa_empty(&client->dev_heap_xa)) {
+ XDNA_DBG(xdna, "Empty heap xa");
+ return -EAGAIN;
+ }
+
+ for (heap_id = 0; heap_id < client->dev_heap_nid; heap_id++) {
+ heap = xa_load(&client->dev_heap_xa, heap_id);
+ if (!heap) {
+ XDNA_ERR(xdna, "Failed to load heap %d", heap_id);
+ return -EINVAL;
+ }
+ heap_addr = amdxdna_gem_dev_addr(heap);
+ if (heap_addr > dev_bo->mm_node.start)
+ break;
+ }
+
+ heap_id--;
+ heap = xa_load(&client->dev_heap_xa, heap_id);
+ exp_heap_uva = amdxdna_gem_uva(heap);
+ heap_addr = amdxdna_gem_dev_addr(heap);
+ dev_bo->heap_start_id = heap_id;
+ dev_bo->mem.uva = dev_bo->mm_node.start - heap_addr + exp_heap_uva;
+
+ for (; heap_id < client->dev_heap_nid; heap_id++) {
+ heap = xa_load(&client->dev_heap_xa, heap_id);
+ if (!heap) {
+ XDNA_ERR(xdna, "Failed to load heap %d", heap_id);
+ return -EINVAL;
+ }
+ heap_addr = amdxdna_gem_uva(heap);
+ if (heap_addr == AMDXDNA_INVALID_ADDR) {
+ XDNA_ERR(xdna, "Heap %d is not mapped", heap_id);
+ return -EAGAIN;
+ }
+
+ if (heap_addr != exp_heap_uva) {
+ XDNA_ERR(xdna, "Heap %d uva is not contiguous", heap_id);
+ return -EINVAL;
+ }
+
+ if (heap->dev_addr + heap->mem.size >=
+ dev_bo->mm_node.start + dev_bo->mem.size)
+ break;
+
+ exp_heap_uva += heap->mem.size;
+ }
+
+ if (heap_id == client->dev_heap_nid) {
+ XDNA_DBG(xdna, "Can not find heap end");
+ return -EAGAIN;
+ }
+
+ dev_bo->heap_end_id = heap_id;
+
+ return 0;
+}
+
static int
amdxdna_gem_heap_alloc(struct amdxdna_gem_obj *abo)
{
@@ -31,32 +102,22 @@ amdxdna_gem_heap_alloc(struct amdxdna_gem_obj *abo)
struct amdxdna_dev *xdna = client->xdna;
struct amdxdna_mem *mem = &abo->mem;
struct amdxdna_gem_obj *heap;
+ unsigned long heap_id;
u32 align;
int ret;
mutex_lock(&client->mm_lock);
- heap = client->dev_heap;
- if (!heap) {
- ret = -EINVAL;
- goto unlock_out;
- }
-
- if (amdxdna_gem_uva(heap) == AMDXDNA_INVALID_ADDR) {
- XDNA_ERR(xdna, "Invalid dev heap userptr");
- ret = -EINVAL;
- goto unlock_out;
- }
-
- if (mem->size == 0 || mem->size > heap->mem.size) {
- XDNA_ERR(xdna, "Invalid dev bo size 0x%lx, limit 0x%lx",
- mem->size, heap->mem.size);
+ if (!mem->size || mem->size > xdna->dev_info->dev_heap_max_size) {
+ XDNA_ERR(xdna, "Invalid dev bo size 0x%lx, max heap 0x%lx",
+ mem->size, xdna->dev_info->dev_heap_max_size);
ret = -EINVAL;
goto unlock_out;
}
align = 1 << max(PAGE_SHIFT, xdna->dev_info->dev_mem_buf_shift);
- ret = drm_mm_insert_node_generic(&heap->mm, &abo->mm_node,
+ ret = drm_mm_insert_node_generic(&client->dev_heap_mm,
+ &abo->mm_node,
mem->size, align,
0, DRM_MM_INSERT_BEST);
if (ret) {
@@ -64,9 +125,16 @@ amdxdna_gem_heap_alloc(struct amdxdna_gem_obj *abo)
goto unlock_out;
}
- client->heap_usage += mem->size;
+ ret = amdxdna_init_dev_bo(abo);
+ if (ret) {
+ drm_mm_remove_node(&abo->mm_node);
+ goto unlock_out;
+ }
- drm_gem_object_get(to_gobj(heap));
+ client->heap_usage += mem->size;
+ xa_for_each_range(&client->dev_heap_xa, heap_id, heap,
+ abo->heap_start_id, abo->heap_end_id)
+ drm_gem_object_get(to_gobj(heap));
unlock_out:
mutex_unlock(&client->mm_lock);
@@ -79,13 +147,16 @@ amdxdna_gem_heap_free(struct amdxdna_gem_obj *abo)
{
struct amdxdna_client *client = abo->client;
struct amdxdna_gem_obj *heap;
+ unsigned long heap_id;
mutex_lock(&client->mm_lock);
drm_mm_remove_node(&abo->mm_node);
client->heap_usage -= abo->mem.size;
- heap = client->dev_heap;
- drm_gem_object_put(to_gobj(heap));
+
+ xa_for_each_range(&client->dev_heap_xa, heap_id, heap,
+ abo->heap_start_id, abo->heap_end_id)
+ drm_gem_object_put(to_gobj(heap));
mutex_unlock(&client->mm_lock);
}
@@ -161,31 +232,13 @@ static void amdxdna_gem_vunmap(struct amdxdna_gem_obj *abo)
}
}
-/*
- * Obtain the user virtual address for accessing the BO.
- * It can be used for device to access the BO when PASID is enabled.
- */
-u64 amdxdna_gem_uva(struct amdxdna_gem_obj *abo)
-{
- if (abo->type == AMDXDNA_BO_DEV) {
- struct amdxdna_gem_obj *heap = abo->client->dev_heap;
- u64 off = amdxdna_dev_bo_offset(abo);
-
- if (amdxdna_gem_uva(heap) != AMDXDNA_INVALID_ADDR)
- return amdxdna_gem_uva(heap) + off;
- return AMDXDNA_INVALID_ADDR;
- }
-
- return abo->mem.uva;
-}
-
/*
* Obtain the address for device to access the BO.
*/
u64 amdxdna_gem_dev_addr(struct amdxdna_gem_obj *abo)
{
if (abo->type == AMDXDNA_BO_DEV_HEAP)
- return abo->client->xdna->dev_info->dev_mem_base;
+ return abo->dev_addr;
if (abo->type == AMDXDNA_BO_DEV)
return abo->mm_node.start;
return amdxdna_obj_dma_addr(abo);
@@ -569,9 +622,6 @@ static void amdxdna_gem_obj_free(struct drm_gem_object *gobj)
if (abo->pinned)
amdxdna_gem_unpin(abo);
- if (abo->type == AMDXDNA_BO_DEV_HEAP)
- drm_mm_takedown(&abo->mm);
-
amdxdna_dma_unmap_bo(xdna, abo);
amdxdna_gem_vunmap(abo);
mutex_destroy(&abo->lock);
@@ -657,11 +707,23 @@ static void amdxdna_gem_obj_vunmap(struct drm_gem_object *obj, struct iosys_map
static int amdxdna_gem_dev_obj_vmap(struct drm_gem_object *obj, struct iosys_map *map)
{
struct amdxdna_gem_obj *abo = to_xdna_obj(obj);
- void *base = amdxdna_gem_vmap(abo->client->dev_heap);
- u64 offset = amdxdna_dev_bo_offset(abo);
+ struct amdxdna_gem_obj *heap;
+ void *base;
+ u64 offset;
+
+ /* vmap dev bo which is across more than 1 heap is not allowed */
+ if (abo->heap_start_id != abo->heap_end_id)
+ return -ENOMEM;
+ heap = xa_load(&abo->client->dev_heap_xa, abo->heap_start_id);
+ if (!heap)
+ return -ENOMEM;
+
+ base = amdxdna_gem_vmap(heap);
if (!base)
return -ENOMEM;
+
+ offset = amdxdna_gem_dev_addr(abo) - amdxdna_gem_dev_addr(heap);
iosys_map_set_vaddr(map, base + offset);
return 0;
}
@@ -880,15 +942,25 @@ amdxdna_drm_create_dev_heap_bo(struct drm_device *dev,
/* Set up heap for this client. */
mutex_lock(&client->mm_lock);
- if (client->dev_heap) {
- XDNA_DBG(client->xdna, "dev heap is already created");
- ret = -EBUSY;
+ if (client->total_heap_size + abo->mem.size >
+ xdna->dev_info->dev_heap_max_size) {
+ XDNA_ERR(xdna, "Heap size 0x%lx + 0x%lx exceeds max 0x%lx",
+ client->total_heap_size, abo->mem.size,
+ xdna->dev_info->dev_heap_max_size);
+ ret = -ENOSPC;
goto mm_unlock;
}
- client->dev_heap = abo;
- drm_gem_object_get(to_gobj(abo));
- drm_mm_init(&abo->mm, xdna->dev_info->dev_mem_base, abo->mem.size);
+ ret = xa_insert(&client->dev_heap_xa, client->dev_heap_nid, abo, GFP_KERNEL);
+ if (ret) {
+ XDNA_ERR(xdna, "Add heap failed %d", ret);
+ goto mm_unlock;
+ }
+
+ abo->dev_addr = xdna->dev_info->dev_mem_base + client->total_heap_size;
+ client->total_heap_size += abo->mem.size;
+ client->dev_heap_nid++;
+ drm_gem_object_get(to_gobj(abo));
mutex_unlock(&client->mm_lock);
@@ -931,10 +1003,10 @@ amdxdna_drm_create_dev_bo(struct drm_device *dev,
ret = amdxdna_gem_heap_alloc(abo);
if (ret) {
- XDNA_ERR(xdna, "Failed to alloc dev bo memory, ret %d", ret);
amdxdna_gem_destroy_obj(abo);
return ERR_PTR(ret);
}
+
drm_gem_private_object_init(dev, gobj, aligned_sz);
return abo;
@@ -942,6 +1014,7 @@ amdxdna_drm_create_dev_bo(struct drm_device *dev,
int amdxdna_drm_create_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
{
+ struct amdxdna_client *client = filp->driver_priv;
struct amdxdna_dev *xdna = to_xdna_dev(dev);
struct amdxdna_drm_create_bo *args = data;
struct amdxdna_gem_obj *abo;
@@ -962,6 +1035,13 @@ int amdxdna_drm_create_bo_ioctl(struct drm_device *dev, void *data, struct drm_f
break;
case AMDXDNA_BO_DEV:
abo = amdxdna_drm_create_dev_bo(dev, args, filp);
+ if (!IS_ERR(abo)) {
+ mutex_lock(&xdna->dev_lock);
+ ret = amdxdna_update_heap(client, NULL);
+ mutex_unlock(&xdna->dev_lock);
+ if (ret)
+ goto put_obj;
+ }
break;
default:
return -EINVAL;
@@ -985,14 +1065,11 @@ int amdxdna_drm_create_bo_ioctl(struct drm_device *dev, void *data, struct drm_f
return ret;
}
-int amdxdna_gem_pin_nolock(struct amdxdna_gem_obj *abo)
+static int amdxdna_bo_pin(struct amdxdna_gem_obj *abo)
{
struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev);
int ret;
- if (abo->type == AMDXDNA_BO_DEV)
- abo = abo->client->dev_heap;
-
if (is_import_bo(abo))
return 0;
@@ -1002,6 +1079,45 @@ int amdxdna_gem_pin_nolock(struct amdxdna_gem_obj *abo)
return ret;
}
+static void amdxdna_bo_unpin(struct amdxdna_gem_obj *abo)
+{
+ struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev);
+
+ if (is_import_bo(abo))
+ return;
+
+ drm_gem_shmem_unpin(&abo->base);
+
+ XDNA_DBG(xdna, "BO type %d", abo->type);
+}
+
+int amdxdna_gem_pin_nolock(struct amdxdna_gem_obj *abo)
+{
+ struct amdxdna_client *client = abo->client;
+ struct amdxdna_gem_obj *heap;
+ unsigned long heap_id, last = ULONG_MAX;
+ int ret = 0;
+
+ if (abo->type != AMDXDNA_BO_DEV)
+ return amdxdna_bo_pin(abo);
+
+ xa_for_each_range(&client->dev_heap_xa, heap_id, heap,
+ abo->heap_start_id, abo->heap_end_id) {
+ ret = amdxdna_bo_pin(heap);
+ if (ret)
+ break;
+ last = heap_id;
+ }
+
+ if (ret && last <= abo->heap_end_id) {
+ xa_for_each_range(&client->dev_heap_xa, heap_id, heap,
+ abo->heap_start_id, last)
+ amdxdna_bo_unpin(heap);
+ }
+
+ return ret;
+}
+
int amdxdna_gem_pin(struct amdxdna_gem_obj *abo)
{
int ret;
@@ -1015,14 +1131,18 @@ int amdxdna_gem_pin(struct amdxdna_gem_obj *abo)
void amdxdna_gem_unpin(struct amdxdna_gem_obj *abo)
{
- if (abo->type == AMDXDNA_BO_DEV)
- abo = abo->client->dev_heap;
+ mutex_lock(&abo->lock);
+ if (abo->type == AMDXDNA_BO_DEV) {
+ struct amdxdna_gem_obj *heap;
+ unsigned long heap_id;
- if (is_import_bo(abo))
- return;
+ xa_for_each_range(&abo->client->dev_heap_xa, heap_id, heap,
+ abo->heap_start_id, abo->heap_end_id)
+ amdxdna_bo_unpin(heap);
+ } else {
+ amdxdna_bo_unpin(abo);
+ }
- mutex_lock(&abo->lock);
- drm_gem_shmem_unpin(&abo->base);
mutex_unlock(&abo->lock);
}
@@ -1079,6 +1199,29 @@ int amdxdna_drm_get_bo_info_ioctl(struct drm_device *dev, void *data, struct drm
return ret;
}
+static int amdxdna_flush_bo(struct amdxdna_gem_obj *abo, u64 offset, u64 size)
+{
+ u64 end;
+
+ if (offset >= abo->mem.size)
+ return -EINVAL;
+
+ if (check_add_overflow(offset, size, &end))
+ return -EINVAL;
+
+ size = min(abo->mem.size, end) - offset;
+ if (is_import_bo(abo))
+ drm_clflush_sg(abo->base.sgt);
+ else if (amdxdna_gem_vmap(abo))
+ drm_clflush_virt_range(amdxdna_gem_vmap(abo) + offset, size);
+ else if (abo->base.pages)
+ drm_clflush_pages(abo->base.pages, abo->mem.size >> PAGE_SHIFT);
+ else
+ return -EINVAL;
+
+ return 0;
+}
+
/*
* The sync bo ioctl is to make sure the CPU cache is in sync with memory.
* This is required because NPU is not cache coherent device. CPU cache
@@ -1089,11 +1232,12 @@ int amdxdna_drm_get_bo_info_ioctl(struct drm_device *dev, void *data, struct drm
int amdxdna_drm_sync_bo_ioctl(struct drm_device *dev,
void *data, struct drm_file *filp)
{
+ struct amdxdna_client *client = filp->driver_priv;
struct amdxdna_dev *xdna = to_xdna_dev(dev);
struct amdxdna_drm_sync_bo *args = data;
struct amdxdna_gem_obj *abo;
struct drm_gem_object *gobj;
- int ret;
+ int ret = 0;
gobj = drm_gem_object_lookup(filp, args->handle);
if (!gobj) {
@@ -1102,22 +1246,45 @@ int amdxdna_drm_sync_bo_ioctl(struct drm_device *dev,
}
abo = to_xdna_obj(gobj);
- ret = amdxdna_gem_pin(abo);
- if (ret) {
- XDNA_ERR(xdna, "Pin BO %d failed, ret %d", args->handle, ret);
- goto put_obj;
- }
+ if (abo->type == AMDXDNA_BO_DEV) {
+ struct amdxdna_gem_obj *heap;
+ unsigned long heap_id;
+ u64 bo_start = amdxdna_gem_dev_addr(abo);
+ u64 flush_start = bo_start + args->offset;
+ u64 flush_end = flush_start + args->size;
+
+ xa_for_each_range(&client->dev_heap_xa, heap_id, heap,
+ abo->heap_start_id, abo->heap_end_id) {
+ u64 heap_start = amdxdna_gem_dev_addr(heap);
+ u64 heap_end = heap_start + heap->mem.size;
+ u64 start = max(flush_start, heap_start);
+ u64 end = min(flush_end, heap_end);
+
+ if (start >= end)
+ continue;
+
+ ret = amdxdna_flush_bo(heap, start - heap_start, end - start);
+ if (ret) {
+ XDNA_ERR(xdna, "Failed to flush heap %ld ret %d",
+ heap_id, ret);
+ goto put_obj;
+ }
+ }
+ } else {
+ ret = amdxdna_gem_pin(abo);
+ if (ret) {
+ XDNA_ERR(xdna, "Pin BO %d failed, ret %d", args->handle, ret);
+ goto put_obj;
+ }
- if (is_import_bo(abo))
- drm_clflush_sg(abo->base.sgt);
- else if (amdxdna_gem_vmap(abo))
- drm_clflush_virt_range(amdxdna_gem_vmap(abo) + args->offset, args->size);
- else if (abo->base.pages)
- drm_clflush_pages(abo->base.pages, gobj->size >> PAGE_SHIFT);
- else
- drm_WARN(&xdna->ddev, 1, "Can not get flush memory");
+ ret = amdxdna_flush_bo(abo, args->offset, args->size);
+ amdxdna_gem_unpin(abo);
- amdxdna_gem_unpin(abo);
+ if (ret) {
+ drm_WARN(&xdna->ddev, 1, "Can not get flush memory");
+ goto put_obj;
+ }
+ }
XDNA_DBG(xdna, "Sync bo %d offset 0x%llx, size 0x%llx\n",
args->handle, args->offset, args->size);
diff --git a/drivers/accel/amdxdna/amdxdna_gem.h b/drivers/accel/amdxdna/amdxdna_gem.h
index 162e5499f5e0..ffdb18395a54 100644
--- a/drivers/accel/amdxdna/amdxdna_gem.h
+++ b/drivers/accel/amdxdna/amdxdna_gem.h
@@ -46,8 +46,10 @@ struct amdxdna_gem_obj {
int open_ref;
/* Below members are initialized when needed */
- struct drm_mm mm; /* For AMDXDNA_BO_DEV_HEAP */
struct drm_mm_node mm_node; /* For AMDXDNA_BO_DEV */
+ u32 heap_start_id;
+ u32 heap_end_id;
+ u64 dev_addr; /* For heap bo */
u32 assigned_hwctx;
struct dma_buf *dma_buf;
struct dma_buf_attachment *attach;
@@ -73,13 +75,21 @@ static inline void amdxdna_gem_put_obj(struct amdxdna_gem_obj *abo)
drm_gem_object_put(to_gobj(abo));
}
+/*
+ * Obtain the user virtual address for accessing the BO.
+ * It can be used for device to access the BO when PASID is enabled.
+ */
+static inline u64 amdxdna_gem_uva(struct amdxdna_gem_obj *abo)
+{
+ return abo->mem.uva;
+}
+
void *amdxdna_gem_vmap(struct amdxdna_gem_obj *abo);
-u64 amdxdna_gem_uva(struct amdxdna_gem_obj *abo);
u64 amdxdna_gem_dev_addr(struct amdxdna_gem_obj *abo);
static inline u64 amdxdna_dev_bo_offset(struct amdxdna_gem_obj *abo)
{
- return amdxdna_gem_dev_addr(abo) - amdxdna_gem_dev_addr(abo->client->dev_heap);
+ return amdxdna_gem_dev_addr(abo) - abo->client->xdna->dev_info->dev_mem_base;
}
static inline u64 amdxdna_obj_dma_addr(struct amdxdna_gem_obj *abo)
diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.c b/drivers/accel/amdxdna/amdxdna_pci_drv.c
index a6e9be7960c2..c677293c1ae7 100644
--- a/drivers/accel/amdxdna/amdxdna_pci_drv.c
+++ b/drivers/accel/amdxdna/amdxdna_pci_drv.c
@@ -126,6 +126,9 @@ static int amdxdna_drm_open(struct drm_device *ddev, struct drm_file *filp)
mmgrab(client->mm);
init_srcu_struct(&client->hwctx_srcu);
xa_init_flags(&client->hwctx_xa, XA_FLAGS_ALLOC);
+ xa_init_flags(&client->dev_heap_xa, XA_FLAGS_ALLOC);
+ drm_mm_init(&client->dev_heap_mm, xdna->dev_info->dev_mem_base,
+ xdna->dev_info->dev_heap_max_size);
mutex_init(&client->mm_lock);
mutex_lock(&xdna->dev_lock);
@@ -141,13 +144,18 @@ static int amdxdna_drm_open(struct drm_device *ddev, struct drm_file *filp)
static void amdxdna_client_cleanup(struct amdxdna_client *client)
{
+ struct amdxdna_gem_obj *heap;
+ unsigned long heap_id;
+
list_del(&client->node);
amdxdna_hwctx_remove_all(client);
xa_destroy(&client->hwctx_xa);
cleanup_srcu_struct(&client->hwctx_srcu);
- if (client->dev_heap)
- drm_gem_object_put(to_gobj(client->dev_heap));
+ xa_for_each(&client->dev_heap_xa, heap_id, heap)
+ drm_gem_object_put(to_gobj(heap));
+ xa_destroy(&client->dev_heap_xa);
+ drm_mm_takedown(&client->dev_heap_mm);
mutex_destroy(&client->mm_lock);
mmdrop(client->mm);
diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.h b/drivers/accel/amdxdna/amdxdna_pci_drv.h
index 471b72299aee..34271c14d359 100644
--- a/drivers/accel/amdxdna/amdxdna_pci_drv.h
+++ b/drivers/accel/amdxdna/amdxdna_pci_drv.h
@@ -7,6 +7,7 @@
#define _AMDXDNA_PCI_DRV_H_
#include <drm/amdxdna_accel.h>
+#include <drm/drm_mm.h>
#include <drm/drm_print.h>
#include <linux/iommu.h>
#include <linux/iova.h>
@@ -61,6 +62,7 @@ struct amdxdna_dev_ops {
void (*hwctx_fini)(struct amdxdna_hwctx *hwctx);
int (*hwctx_config)(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *buf, u32 size);
int (*hwctx_sync_debug_bo)(struct amdxdna_hwctx *hwctx, u32 debug_bo_hdl);
+ int (*hwctx_heap_expand)(struct amdxdna_hwctx *hwctx, struct amdxdna_gem_obj *heap);
void (*hmm_invalidate)(struct amdxdna_gem_obj *abo, unsigned long cur_seq);
int (*cmd_submit)(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, u64 *seq);
int (*cmd_wait)(struct amdxdna_hwctx *hwctx, u64 seq, u32 timeout);
@@ -95,6 +97,7 @@ struct amdxdna_dev_info {
size_t dev_mem_size;
const char *default_vbnv;
const struct amdxdna_rev_vbnv *rev_vbnv_tbl;
+ size_t dev_heap_max_size;
const struct amdxdna_dev_priv *dev_priv;
const struct amdxdna_fw_feature_tbl *fw_feature_tbl;
const struct amdxdna_dev_ops *ops;
@@ -153,7 +156,10 @@ struct amdxdna_client {
struct drm_file *filp;
struct mutex mm_lock; /* protect memory related */
- struct amdxdna_gem_obj *dev_heap;
+ struct xarray dev_heap_xa;
+ struct drm_mm dev_heap_mm;
+ u32 dev_heap_nid;
+ size_t total_heap_size;
struct iommu_sva *sva;
int pasid;
diff --git a/drivers/accel/amdxdna/npu1_regs.c b/drivers/accel/amdxdna/npu1_regs.c
index 4e48c030a69f..ca779674017a 100644
--- a/drivers/accel/amdxdna/npu1_regs.c
+++ b/drivers/accel/amdxdna/npu1_regs.c
@@ -139,6 +139,7 @@ const struct amdxdna_dev_info dev_npu1_info = {
.dev_mem_base = AIE2_DEVM_BASE,
.dev_mem_size = AIE2_DEVM_SIZE,
.default_vbnv = "RyzenAI-npu1",
+ .dev_heap_max_size = AIE2_DEVM_SIZE,
.device_type = AMDXDNA_DEV_TYPE_KMQ,
.dev_priv = &npu1_dev_priv,
.fw_feature_tbl = npu1_fw_feature_table,
diff --git a/drivers/accel/amdxdna/npu4_regs.c b/drivers/accel/amdxdna/npu4_regs.c
index eddc31803a50..15a161384625 100644
--- a/drivers/accel/amdxdna/npu4_regs.c
+++ b/drivers/accel/amdxdna/npu4_regs.c
@@ -98,6 +98,7 @@ const struct amdxdna_fw_feature_tbl npu4_fw_feature_table[] = {
{ .features = BIT_U64(AIE2_NPU_COMMAND), .major = 6, .min_minor = 15 },
{ .features = BIT_U64(AIE2_UPDATE_PROPERTY), .major = 6, .min_minor = 15 },
{ .features = BIT_U64(AIE2_APP_HEALTH), .major = 6, .min_minor = 18 },
+ { .features = BIT_U64(AIE2_ADD_HOST_BUFFER), .major = 6, .min_minor = 18 },
{ .features = BIT_U64(AIE2_GET_DEV_REVISION), .major = 6, .min_minor = 24 },
{ .features = AIE2_ALL_FEATURES, .major = 7 },
{ 0 }
@@ -200,6 +201,7 @@ const struct amdxdna_dev_info dev_npu4_info = {
.dev_mem_base = AIE2_DEVM_BASE,
.dev_mem_size = AIE2_DEVM_SIZE,
.default_vbnv = "RyzenAI-npu4",
+ .dev_heap_max_size = AIE2_DEVM_MAX_SIZE,
.device_type = AMDXDNA_DEV_TYPE_KMQ,
.rev_vbnv_tbl = npu4_rev_vbnv_tbl,
.dev_priv = &npu4_dev_priv,
diff --git a/drivers/accel/amdxdna/npu5_regs.c b/drivers/accel/amdxdna/npu5_regs.c
index a9102978e4a8..306b359d0cd3 100644
--- a/drivers/accel/amdxdna/npu5_regs.c
+++ b/drivers/accel/amdxdna/npu5_regs.c
@@ -107,6 +107,7 @@ const struct amdxdna_dev_info dev_npu5_info = {
.dev_mem_base = AIE2_DEVM_BASE,
.dev_mem_size = AIE2_DEVM_SIZE,
.default_vbnv = "RyzenAI-npu5",
+ .dev_heap_max_size = AIE2_DEVM_MAX_SIZE,
.device_type = AMDXDNA_DEV_TYPE_KMQ,
.rev_vbnv_tbl = npu4_rev_vbnv_tbl,
.dev_priv = &npu5_dev_priv,
diff --git a/drivers/accel/amdxdna/npu6_regs.c b/drivers/accel/amdxdna/npu6_regs.c
index e0db3a09740b..e68637d2a228 100644
--- a/drivers/accel/amdxdna/npu6_regs.c
+++ b/drivers/accel/amdxdna/npu6_regs.c
@@ -108,6 +108,7 @@ const struct amdxdna_dev_info dev_npu6_info = {
.dev_mem_base = AIE2_DEVM_BASE,
.dev_mem_size = AIE2_DEVM_SIZE,
.default_vbnv = "RyzenAI-npu6",
+ .dev_heap_max_size = AIE2_DEVM_MAX_SIZE,
.device_type = AMDXDNA_DEV_TYPE_KMQ,
.rev_vbnv_tbl = npu4_rev_vbnv_tbl,
.dev_priv = &npu6_dev_priv,
--
2.34.1
^ permalink raw reply related [flat|nested] 4+ messages in thread* Re: [PATCH V2] accel/amdxdna: Add expandable device heap support
2026-05-15 16:19 [PATCH V2] accel/amdxdna: Add expandable device heap support Lizhi Hou
@ 2026-05-15 21:01 ` Mario Limonciello
2026-05-15 22:50 ` Claude review: " Claude Code Review Bot
2026-05-15 22:50 ` Claude Code Review Bot
2 siblings, 0 replies; 4+ messages in thread
From: Mario Limonciello @ 2026-05-15 21:01 UTC (permalink / raw)
To: Lizhi Hou, ogabbay, quic_jhugo, dri-devel, karol.wachowski
Cc: linux-kernel, max.zhen, sonal.santan, Wendy Liang
On 5/15/26 11:19, Lizhi Hou wrote:
> Introduce an expandable device heap to avoid allocating a large heap
> upfront. Start with a smaller initial heap and grow it on demand.
> Return -EAGAIN when BO allocation fails due to insufficient heap space,
> allowing userspace to trigger heap expansion via a heap BO creation
> IOCTL and retry the allocation.
>
> Manage heap chunks using an xarray. On expansion, register new chunks
> with the firmware via MSG_OP_ADD_HOST_BUFFER.
>
> Since heap shrinking is not supported by the firmware, release all heap
> chunks on device close.
>
> Co-developed-by: Wendy Liang <wendy.liang@amd.com>
> Signed-off-by: Wendy Liang <wendy.liang@amd.com>
> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
> ---
> v2:
> Fix memory leak when initializing hardware context failed.
> For flush BO, only flush the requested area.
I have some minor feedback below that can be follow ups.
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
>
> drivers/accel/amdxdna/aie2_ctx.c | 45 +++-
> drivers/accel/amdxdna/aie2_message.c | 52 +++-
> drivers/accel/amdxdna/aie2_msg_priv.h | 1 +
> drivers/accel/amdxdna/aie2_pci.c | 1 +
> drivers/accel/amdxdna/aie2_pci.h | 8 +-
> drivers/accel/amdxdna/amdxdna_ctx.c | 88 ++++++-
> drivers/accel/amdxdna/amdxdna_ctx.h | 2 +
> drivers/accel/amdxdna/amdxdna_gem.c | 319 ++++++++++++++++++------
> drivers/accel/amdxdna/amdxdna_gem.h | 16 +-
> drivers/accel/amdxdna/amdxdna_pci_drv.c | 12 +-
> drivers/accel/amdxdna/amdxdna_pci_drv.h | 8 +-
> drivers/accel/amdxdna/npu1_regs.c | 1 +
> drivers/accel/amdxdna/npu4_regs.c | 2 +
> drivers/accel/amdxdna/npu5_regs.c | 1 +
> drivers/accel/amdxdna/npu6_regs.c | 1 +
> 15 files changed, 460 insertions(+), 97 deletions(-)
>
> diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c
> index 7d6094aefb6f..658a5fb1fda6 100644
> --- a/drivers/accel/amdxdna/aie2_ctx.c
> +++ b/drivers/accel/amdxdna/aie2_ctx.c
> @@ -91,6 +91,7 @@ static void aie2_hwctx_stop(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hwct
> static int aie2_hwctx_restart(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hwctx)
> {
> struct amdxdna_gem_obj *heap = hwctx->priv->heap;
> + unsigned long heap_id;
> int ret;
>
> ret = aie2_create_context(xdna->dev_handle, hwctx);
> @@ -107,6 +108,17 @@ static int aie2_hwctx_restart(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hw
> goto out;
> }
>
> + xa_for_each_range(&hwctx->client->dev_heap_xa, heap_id, heap, 1,
> + hwctx->last_attached_heap) {
> + ret = aie2_add_host_buf(xdna->dev_handle, hwctx->fw_ctx_id,
> + amdxdna_obj_dma_addr(heap),
> + heap->mem.size);
> + if (ret) {
> + XDNA_ERR(xdna, "Add heap %ld failed ret %d", heap_id, ret);
> + goto out;
> + }
> + }
> +
> ret = aie2_config_cu(hwctx, NULL);
> if (ret) {
> XDNA_ERR(xdna, "Config cu failed, ret %d", ret);
> @@ -650,7 +662,7 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
> hwctx->priv = priv;
>
> mutex_lock(&client->mm_lock);
> - heap = client->dev_heap;
> + heap = xa_load(&client->dev_heap_xa, 0);
> if (!heap) {
> XDNA_ERR(xdna, "The client dev heap object not exist");
> mutex_unlock(&client->mm_lock);
> @@ -732,6 +744,12 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
> goto release_resource;
> }
>
> + ret = amdxdna_update_heap(client, hwctx);
> + if (ret) {
> + XDNA_ERR(xdna, "Update heap failed, ret %d", ret);
> + goto release_resource;
> + }
> +
> ret = aie2_ctx_syncobj_create(hwctx);
> if (ret) {
> XDNA_ERR(xdna, "Create syncobj failed, ret %d", ret);
> @@ -1161,3 +1179,28 @@ void aie2_hmm_invalidate(struct amdxdna_gem_obj *abo,
> else if (ret == -ERESTARTSYS)
> XDNA_DBG(xdna, "Wait for bo interrupted by signal");
> }
> +
> +int aie2_hwctx_heap_expand(struct amdxdna_hwctx *hwctx,
> + struct amdxdna_gem_obj *heap)
> +{
> + struct amdxdna_client *client = hwctx->client;
> + struct amdxdna_dev *xdna = client->xdna;
> + u64 addr;
> + int ret;
> +
> + ret = amdxdna_pm_resume_get_locked(xdna);
> + if (ret)
> + return ret;
> +
> + addr = amdxdna_obj_dma_addr(heap);
> + ret = aie2_add_host_buf(xdna->dev_handle, hwctx->fw_ctx_id,
> + addr, heap->mem.size);
> + if (ret) {
> + XDNA_ERR(xdna, "Add heap failed hwctx %s 0x%lx ret %d",
> + hwctx->name, heap->mem.size, ret);
> + }
> +
> + amdxdna_pm_suspend_put(xdna);
> +
> + return ret;
> +}
> diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/amdxdna/aie2_message.c
> index 1504295a21e4..c4b364801cc0 100644
> --- a/drivers/accel/amdxdna/aie2_message.c
> +++ b/drivers/accel/amdxdna/aie2_message.c
> @@ -301,25 +301,59 @@ int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwc
> return ret;
> }
>
> -int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size)
> +static int aie2_send_host_buf_msgs(struct amdxdna_dev_hdl *ndev, u32 context_id,
> + u64 addr, u64 size, u32 initial_opcode)
> {
> DECLARE_AIE_MSG(map_host_buffer, MSG_OP_MAP_HOST_BUFFER);
> struct amdxdna_dev *xdna = ndev->aie.xdna;
> + size_t chunk_size;
> int ret;
>
> - req.context_id = context_id;
> - req.buf_addr = addr;
> - req.buf_size = size;
> - ret = aie_send_mgmt_msg_wait(&ndev->aie, &msg);
> - if (ret)
> - return ret;
> + chunk_size = xdna->dev_info->dev_mem_size;
> + if (!size || !IS_ALIGNED(size, chunk_size)) {
> + XDNA_ERR(xdna, "Invalid size 0x%llx for chunk 0x%lx",
> + size, chunk_size);
> + return -EINVAL;
> + }
>
> - XDNA_DBG(xdna, "fw ctx %d map host buf addr 0x%llx size 0x%llx",
> - context_id, addr, size);
> + msg.opcode = initial_opcode;
> + do {
> + req.context_id = context_id;
> + req.buf_addr = addr;
> + req.buf_size = chunk_size;
> + ret = aie_send_mgmt_msg_wait(&ndev->aie, &msg);
> + if (ret) {
> + XDNA_ERR(xdna, "fw ctx %d addr 0x%llx size 0x%lx",
> + context_id, addr, chunk_size);
> + return ret;
> + }
> +
> + XDNA_DBG(xdna, "fw ctx %d host buf op 0x%x addr 0x%llx size 0x%lx",
> + context_id, msg.opcode, addr, chunk_size);
> +
> + addr += chunk_size;
> + size -= chunk_size;
> + msg.opcode = MSG_OP_ADD_HOST_BUFFER;
> + } while (size);
>
> return 0;
> }
>
> +int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size)
> +{
> + return aie2_send_host_buf_msgs(ndev, context_id, addr, size,
> + MSG_OP_MAP_HOST_BUFFER);
> +}
> +
> +int aie2_add_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size)
> +{
> + if (!AIE_FEATURE_ON(&ndev->aie, AIE2_ADD_HOST_BUFFER))
> + return -EOPNOTSUPP;
> +
> + return aie2_send_host_buf_msgs(ndev, context_id, addr, size,
> + MSG_OP_ADD_HOST_BUFFER);
> +}
> +
> static int amdxdna_hwctx_col_map(struct amdxdna_hwctx *hwctx, void *arg)
> {
> u32 *bitmap = arg;
> diff --git a/drivers/accel/amdxdna/aie2_msg_priv.h b/drivers/accel/amdxdna/aie2_msg_priv.h
> index a41c9797e265..fd65a4236d49 100644
> --- a/drivers/accel/amdxdna/aie2_msg_priv.h
> +++ b/drivers/accel/amdxdna/aie2_msg_priv.h
> @@ -33,6 +33,7 @@ enum aie2_msg_opcode {
> MSG_OP_REGISTER_ASYNC_EVENT_MSG = 0x10C,
> MSG_OP_UPDATE_PROPERTY = 0x113,
> MSG_OP_GET_APP_HEALTH = 0x114,
> + MSG_OP_ADD_HOST_BUFFER = 0x115,
> MSG_OP_GET_DEV_REVISION = 0x117,
> MSG_OP_MAX_DRV_OPCODE,
> MSG_OP_GET_PROTOCOL_VERSION = 0x301,
> diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c
> index 6c8a0f70b73d..c4d345d4c76b 100644
> --- a/drivers/accel/amdxdna/aie2_pci.c
> +++ b/drivers/accel/amdxdna/aie2_pci.c
> @@ -1241,4 +1241,5 @@ const struct amdxdna_dev_ops aie2_ops = {
> .hmm_invalidate = aie2_hmm_invalidate,
> .get_array = aie2_get_array,
> .get_dev_revision = aie2_get_dev_rev,
> + .hwctx_heap_expand = aie2_hwctx_heap_expand,
> };
> diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h
> index 84bdb3f8b8f9..77648cc548b6 100644
> --- a/drivers/accel/amdxdna/aie2_pci.h
> +++ b/drivers/accel/amdxdna/aie2_pci.h
> @@ -15,8 +15,9 @@
> #include "amdxdna_mailbox.h"
>
> /* Firmware determines device memory base address and size */
> -#define AIE2_DEVM_BASE 0x4000000
> -#define AIE2_DEVM_SIZE SZ_64M
> +#define AIE2_DEVM_BASE 0x4000000
> +#define AIE2_DEVM_SIZE SZ_64M
> +#define AIE2_DEVM_MAX_SIZE SZ_512M
>
> #define NDEV2PDEV(ndev) (to_pci_dev((ndev)->aie.xdna->ddev.dev))
>
> @@ -198,6 +199,7 @@ enum aie2_fw_feature {
> AIE2_PREEMPT,
> AIE2_TEMPORAL_ONLY,
> AIE2_APP_HEALTH,
> + AIE2_ADD_HOST_BUFFER,
> AIE2_UPDATE_PROPERTY,
> AIE2_GET_DEV_REVISION,
> AIE2_FEATURE_MAX
> @@ -271,6 +273,7 @@ int aie2_get_dev_revision(struct amdxdna_dev_hdl *ndev, enum aie2_dev_revision *
> int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
> int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
> int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size);
> +int aie2_add_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size);
> int aie2_query_status(struct amdxdna_dev_hdl *ndev, char __user *buf, u32 size, u32 *cols_filled);
> int aie2_query_telemetry(struct amdxdna_dev_hdl *ndev,
> char __user *buf, u32 size,
> @@ -302,5 +305,6 @@ void aie2_hwctx_suspend(struct amdxdna_client *client);
> int aie2_hwctx_resume(struct amdxdna_client *client);
> int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, u64 *seq);
> void aie2_hmm_invalidate(struct amdxdna_gem_obj *abo, unsigned long cur_seq);
> +int aie2_hwctx_heap_expand(struct amdxdna_hwctx *hwctx, struct amdxdna_gem_obj *heap);
>
> #endif /* _AIE2_PCI_H_ */
> diff --git a/drivers/accel/amdxdna/amdxdna_ctx.c b/drivers/accel/amdxdna/amdxdna_ctx.c
> index b79229a63af3..34466916a6ab 100644
> --- a/drivers/accel/amdxdna/amdxdna_ctx.c
> +++ b/drivers/accel/amdxdna/amdxdna_ctx.c
> @@ -61,16 +61,35 @@ static struct dma_fence *amdxdna_fence_create(struct amdxdna_hwctx *hwctx)
> return &fence->base;
> }
>
> +static void amdxdna_hwctx_release_expanded_heap(struct amdxdna_hwctx *hwctx)
> +{
> + struct amdxdna_client *client = hwctx->client;
> + struct amdxdna_gem_obj *heap;
> + unsigned long heap_id;
> +
> + mutex_lock(&client->mm_lock);
> + if (hwctx->last_attached_heap) {
> + xa_for_each_range(&client->dev_heap_xa, heap_id, heap, 1,
> + hwctx->last_attached_heap) {
> + amdxdna_gem_unpin(heap);
> + drm_gem_object_put(to_gobj(heap));
> + }
> + }
> + mutex_unlock(&client->mm_lock);
> +}
> +
> static void amdxdna_hwctx_destroy_rcu(struct amdxdna_hwctx *hwctx,
> struct srcu_struct *ss)
> {
> - struct amdxdna_dev *xdna = hwctx->client->xdna;
> + struct amdxdna_client *client = hwctx->client;
> + struct amdxdna_dev *xdna = client->xdna;
>
> synchronize_srcu(ss);
>
> /* At this point, user is not able to submit new commands */
> xdna->dev_info->ops->hwctx_fini(hwctx);
>
> + amdxdna_hwctx_release_expanded_heap(hwctx);
> kfree(hwctx->name);
> kfree(hwctx);
> }
> @@ -238,7 +257,7 @@ int amdxdna_drm_create_hwctx_ioctl(struct drm_device *dev, void *data, struct dr
> ret = xdna->dev_info->ops->hwctx_init(hwctx);
> if (ret) {
> XDNA_ERR(xdna, "Init hwctx failed, ret %d", ret);
> - goto dev_exit;
> + goto release_expanded_heap;
> }
There's an existing issue here, Sahiko found it too. You can't use goto
with guard(mutex). Because of how the guard() macro works you break
cleanups. You either need to do the release with at __free macro or
cleanup in each path or switch away from guard.
>
> hwctx->name = kasprintf(GFP_KERNEL, "hwctx.%d.%d", client->pid, hwctx->fw_ctx_id);
> @@ -269,7 +288,8 @@ int amdxdna_drm_create_hwctx_ioctl(struct drm_device *dev, void *data, struct dr
> kfree(hwctx->name);
> fini_hwctx:
> xdna->dev_info->ops->hwctx_fini(hwctx);
> -dev_exit:
> +release_expanded_heap:
> + amdxdna_hwctx_release_expanded_heap(hwctx);
> drm_dev_exit(idx);
> free_hwctx:
> kfree(hwctx);
> @@ -407,6 +427,68 @@ int amdxdna_hwctx_sync_debug_bo(struct amdxdna_client *client, u32 debug_bo_hdl)
> return ret;
> }
>
> +static int amdxdna_hwctx_expand_heap(struct amdxdna_hwctx *hwctx)
> +{
> + struct amdxdna_client *client = hwctx->client;
> + struct amdxdna_dev *xdna = client->xdna;
> + struct amdxdna_gem_obj *heap;
> + unsigned long heap_id, nid;
> + int ret = 0;
> +
> + nid = hwctx->last_attached_heap + 1;
> + if (nid == client->dev_heap_nid)
> + goto out;
> +
> + if (!xdna->dev_info->ops->hwctx_heap_expand) {
> + ret = -EOPNOTSUPP;
> + goto out;
> + }
I would re-order these.
If hwctx_heap_expand isn't available for the HW then the last attached
heap should never have been set right?
> +
> + xa_for_each_range(&client->dev_heap_xa, heap_id, heap,
> + nid, client->dev_heap_nid) {
> + drm_gem_object_get(to_gobj(heap));
> + ret = amdxdna_gem_pin(heap);
> + if (ret) {
> + drm_gem_object_put(to_gobj(heap));
> + break;
> + }
> +
> + mutex_unlock(&client->mm_lock);
> + ret = xdna->dev_info->ops->hwctx_heap_expand(hwctx, heap);
> + mutex_lock(&client->mm_lock);
> + if (ret) {
> + amdxdna_gem_unpin(heap);
> + drm_gem_object_put(to_gobj(heap));
> + break;
> + }
> +
> + hwctx->last_attached_heap = heap_id;
> + }
> +
> +out:
> + return ret;
> +}
> +
> +int amdxdna_update_heap(struct amdxdna_client *client, struct amdxdna_hwctx *hwctx)
> +{
> + unsigned long hwctx_id;
> + int ret;
> +
> + if (hwctx) {
> + guard(mutex)(&client->mm_lock);
Since both paths in this function use the guard, just uplevel it so you
only need it once.
> + return amdxdna_hwctx_expand_heap(hwctx);
> + }
> +
> + guard(mutex)(&client->mm_lock);
> + amdxdna_for_each_hwctx(client, hwctx_id, hwctx) {
> + ret = amdxdna_hwctx_expand_heap(hwctx);
> + if (ret)
> + return ret;
> + }
> +
> + return 0;
> +}
> +
> static void
> amdxdna_arg_bos_put(struct amdxdna_sched_job *job)
> {
> diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h b/drivers/accel/amdxdna/amdxdna_ctx.h
> index 6e3c6371a088..aaae16430466 100644
> --- a/drivers/accel/amdxdna/amdxdna_ctx.h
> +++ b/drivers/accel/amdxdna/amdxdna_ctx.h
> @@ -109,6 +109,7 @@ struct amdxdna_hwctx {
> u32 umq_bo_hdl;
> u32 doorbell_offset;
> u32 num_unused_col;
> + u32 last_attached_heap;
>
> struct amdxdna_qos_info qos;
> struct amdxdna_hwctx_param_config_cu *cus;
> @@ -205,6 +206,7 @@ void amdxdna_hwctx_remove_all(struct amdxdna_client *client);
> int amdxdna_hwctx_walk(struct amdxdna_client *client, void *arg,
> int (*walk)(struct amdxdna_hwctx *hwctx, void *arg));
> int amdxdna_hwctx_sync_debug_bo(struct amdxdna_client *client, u32 debug_bo_hdl);
> +int amdxdna_update_heap(struct amdxdna_client *client, struct amdxdna_hwctx *hwctx);
>
> int amdxdna_cmd_submit(struct amdxdna_client *client,
> struct amdxdna_drv_cmd *drv_cmd, u32 cmd_bo_hdls,
> diff --git a/drivers/accel/amdxdna/amdxdna_gem.c b/drivers/accel/amdxdna/amdxdna_gem.c
> index 6087264ba1b5..97a521401a46 100644
> --- a/drivers/accel/amdxdna/amdxdna_gem.c
> +++ b/drivers/accel/amdxdna/amdxdna_gem.c
> @@ -24,6 +24,77 @@
>
> MODULE_IMPORT_NS("DMA_BUF");
>
> +/*
> + * The dev BO could be across multiple heap BO chunks. The heap chunks should
> + * be mapped to userspace and the userspace virtual address should be
> + * contiguous.
> + */
> +static int
> +amdxdna_init_dev_bo(struct amdxdna_gem_obj *dev_bo)
> +{
> + struct amdxdna_client *client = dev_bo->client;
> + struct amdxdna_dev *xdna = client->xdna;
> + struct amdxdna_gem_obj *heap;
> + u64 heap_addr, exp_heap_uva;
> + u32 heap_id;
> +
> + if (xa_empty(&client->dev_heap_xa)) {
> + XDNA_DBG(xdna, "Empty heap xa");
> + return -EAGAIN;
> + }
> +
> + for (heap_id = 0; heap_id < client->dev_heap_nid; heap_id++) {
> + heap = xa_load(&client->dev_heap_xa, heap_id);
> + if (!heap) {
> + XDNA_ERR(xdna, "Failed to load heap %d", heap_id);
> + return -EINVAL;
> + }
> + heap_addr = amdxdna_gem_dev_addr(heap);
> + if (heap_addr > dev_bo->mm_node.start)
> + break;
> + }
> +
> + heap_id--;
> + heap = xa_load(&client->dev_heap_xa, heap_id);
> + exp_heap_uva = amdxdna_gem_uva(heap);
> + heap_addr = amdxdna_gem_dev_addr(heap);
> + dev_bo->heap_start_id = heap_id;
> + dev_bo->mem.uva = dev_bo->mm_node.start - heap_addr + exp_heap_uva;
> +
> + for (; heap_id < client->dev_heap_nid; heap_id++) {
> + heap = xa_load(&client->dev_heap_xa, heap_id);
> + if (!heap) {
> + XDNA_ERR(xdna, "Failed to load heap %d", heap_id);
> + return -EINVAL;
> + }
> + heap_addr = amdxdna_gem_uva(heap);
> + if (heap_addr == AMDXDNA_INVALID_ADDR) {
> + XDNA_ERR(xdna, "Heap %d is not mapped", heap_id);
> + return -EAGAIN;
> + }
> +
> + if (heap_addr != exp_heap_uva) {
> + XDNA_ERR(xdna, "Heap %d uva is not contiguous", heap_id);
> + return -EINVAL;
> + }
> +
> + if (heap->dev_addr + heap->mem.size >=
> + dev_bo->mm_node.start + dev_bo->mem.size)
> + break;
> +
> + exp_heap_uva += heap->mem.size;
> + }
> +
> + if (heap_id == client->dev_heap_nid) {
> + XDNA_DBG(xdna, "Can not find heap end");
> + return -EAGAIN;
> + }
> +
> + dev_bo->heap_end_id = heap_id;
> +
> + return 0;
> +}
> +
> static int
> amdxdna_gem_heap_alloc(struct amdxdna_gem_obj *abo)
> {
> @@ -31,32 +102,22 @@ amdxdna_gem_heap_alloc(struct amdxdna_gem_obj *abo)
> struct amdxdna_dev *xdna = client->xdna;
> struct amdxdna_mem *mem = &abo->mem;
> struct amdxdna_gem_obj *heap;
> + unsigned long heap_id;
> u32 align;
> int ret;
>
> mutex_lock(&client->mm_lock);
>
> - heap = client->dev_heap;
> - if (!heap) {
> - ret = -EINVAL;
> - goto unlock_out;
> - }
> -
> - if (amdxdna_gem_uva(heap) == AMDXDNA_INVALID_ADDR) {
> - XDNA_ERR(xdna, "Invalid dev heap userptr");
> - ret = -EINVAL;
> - goto unlock_out;
> - }
> -
> - if (mem->size == 0 || mem->size > heap->mem.size) {
> - XDNA_ERR(xdna, "Invalid dev bo size 0x%lx, limit 0x%lx",
> - mem->size, heap->mem.size);
> + if (!mem->size || mem->size > xdna->dev_info->dev_heap_max_size) {
> + XDNA_ERR(xdna, "Invalid dev bo size 0x%lx, max heap 0x%lx",
> + mem->size, xdna->dev_info->dev_heap_max_size);
> ret = -EINVAL;
> goto unlock_out;
> }
>
> align = 1 << max(PAGE_SHIFT, xdna->dev_info->dev_mem_buf_shift);
> - ret = drm_mm_insert_node_generic(&heap->mm, &abo->mm_node,
> + ret = drm_mm_insert_node_generic(&client->dev_heap_mm,
> + &abo->mm_node,
> mem->size, align,
> 0, DRM_MM_INSERT_BEST);
> if (ret) {
> @@ -64,9 +125,16 @@ amdxdna_gem_heap_alloc(struct amdxdna_gem_obj *abo)
> goto unlock_out;
> }
>
> - client->heap_usage += mem->size;
> + ret = amdxdna_init_dev_bo(abo);
> + if (ret) {
> + drm_mm_remove_node(&abo->mm_node);
> + goto unlock_out;
> + }
>
> - drm_gem_object_get(to_gobj(heap));
> + client->heap_usage += mem->size;
> + xa_for_each_range(&client->dev_heap_xa, heap_id, heap,
> + abo->heap_start_id, abo->heap_end_id)
> + drm_gem_object_get(to_gobj(heap));
>
> unlock_out:
> mutex_unlock(&client->mm_lock);
> @@ -79,13 +147,16 @@ amdxdna_gem_heap_free(struct amdxdna_gem_obj *abo)
> {
> struct amdxdna_client *client = abo->client;
> struct amdxdna_gem_obj *heap;
> + unsigned long heap_id;
>
> mutex_lock(&client->mm_lock);
>
> drm_mm_remove_node(&abo->mm_node);
> client->heap_usage -= abo->mem.size;
> - heap = client->dev_heap;
> - drm_gem_object_put(to_gobj(heap));
> +
> + xa_for_each_range(&client->dev_heap_xa, heap_id, heap,
> + abo->heap_start_id, abo->heap_end_id)
> + drm_gem_object_put(to_gobj(heap));
>
> mutex_unlock(&client->mm_lock);
> }
> @@ -161,31 +232,13 @@ static void amdxdna_gem_vunmap(struct amdxdna_gem_obj *abo)
> }
> }
>
> -/*
> - * Obtain the user virtual address for accessing the BO.
> - * It can be used for device to access the BO when PASID is enabled.
> - */
> -u64 amdxdna_gem_uva(struct amdxdna_gem_obj *abo)
> -{
> - if (abo->type == AMDXDNA_BO_DEV) {
> - struct amdxdna_gem_obj *heap = abo->client->dev_heap;
> - u64 off = amdxdna_dev_bo_offset(abo);
> -
> - if (amdxdna_gem_uva(heap) != AMDXDNA_INVALID_ADDR)
> - return amdxdna_gem_uva(heap) + off;
> - return AMDXDNA_INVALID_ADDR;
> - }
> -
> - return abo->mem.uva;
> -}
> -
> /*
> * Obtain the address for device to access the BO.
> */
> u64 amdxdna_gem_dev_addr(struct amdxdna_gem_obj *abo)
> {
> if (abo->type == AMDXDNA_BO_DEV_HEAP)
> - return abo->client->xdna->dev_info->dev_mem_base;
> + return abo->dev_addr;
> if (abo->type == AMDXDNA_BO_DEV)
> return abo->mm_node.start;
> return amdxdna_obj_dma_addr(abo);
> @@ -569,9 +622,6 @@ static void amdxdna_gem_obj_free(struct drm_gem_object *gobj)
> if (abo->pinned)
> amdxdna_gem_unpin(abo);
>
> - if (abo->type == AMDXDNA_BO_DEV_HEAP)
> - drm_mm_takedown(&abo->mm);
> -
> amdxdna_dma_unmap_bo(xdna, abo);
> amdxdna_gem_vunmap(abo);
> mutex_destroy(&abo->lock);
> @@ -657,11 +707,23 @@ static void amdxdna_gem_obj_vunmap(struct drm_gem_object *obj, struct iosys_map
> static int amdxdna_gem_dev_obj_vmap(struct drm_gem_object *obj, struct iosys_map *map)
> {
> struct amdxdna_gem_obj *abo = to_xdna_obj(obj);
> - void *base = amdxdna_gem_vmap(abo->client->dev_heap);
> - u64 offset = amdxdna_dev_bo_offset(abo);
> + struct amdxdna_gem_obj *heap;
> + void *base;
> + u64 offset;
> +
> + /* vmap dev bo which is across more than 1 heap is not allowed */
> + if (abo->heap_start_id != abo->heap_end_id)
> + return -ENOMEM;
>
> + heap = xa_load(&abo->client->dev_heap_xa, abo->heap_start_id);
> + if (!heap)
> + return -ENOMEM;
> +
> + base = amdxdna_gem_vmap(heap);
> if (!base)
> return -ENOMEM;
> +
> + offset = amdxdna_gem_dev_addr(abo) - amdxdna_gem_dev_addr(heap);
> iosys_map_set_vaddr(map, base + offset);
> return 0;
> }
> @@ -880,15 +942,25 @@ amdxdna_drm_create_dev_heap_bo(struct drm_device *dev,
> /* Set up heap for this client. */
> mutex_lock(&client->mm_lock);
>
> - if (client->dev_heap) {
> - XDNA_DBG(client->xdna, "dev heap is already created");
> - ret = -EBUSY;
> + if (client->total_heap_size + abo->mem.size >
> + xdna->dev_info->dev_heap_max_size) {
> + XDNA_ERR(xdna, "Heap size 0x%lx + 0x%lx exceeds max 0x%lx",
> + client->total_heap_size, abo->mem.size,
> + xdna->dev_info->dev_heap_max_size);
> + ret = -ENOSPC;
> goto mm_unlock;
> }
> - client->dev_heap = abo;
> - drm_gem_object_get(to_gobj(abo));
>
> - drm_mm_init(&abo->mm, xdna->dev_info->dev_mem_base, abo->mem.size);
> + ret = xa_insert(&client->dev_heap_xa, client->dev_heap_nid, abo, GFP_KERNEL);
> + if (ret) {
> + XDNA_ERR(xdna, "Add heap failed %d", ret);
> + goto mm_unlock;
> + }
> +
> + abo->dev_addr = xdna->dev_info->dev_mem_base + client->total_heap_size;
> + client->total_heap_size += abo->mem.size;
> + client->dev_heap_nid++;
> + drm_gem_object_get(to_gobj(abo));
>
> mutex_unlock(&client->mm_lock);
>
> @@ -931,10 +1003,10 @@ amdxdna_drm_create_dev_bo(struct drm_device *dev,
>
> ret = amdxdna_gem_heap_alloc(abo);
> if (ret) {
> - XDNA_ERR(xdna, "Failed to alloc dev bo memory, ret %d", ret);
> amdxdna_gem_destroy_obj(abo);
> return ERR_PTR(ret);
> }
> +
> drm_gem_private_object_init(dev, gobj, aligned_sz);
>
> return abo;
> @@ -942,6 +1014,7 @@ amdxdna_drm_create_dev_bo(struct drm_device *dev,
>
> int amdxdna_drm_create_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
> {
> + struct amdxdna_client *client = filp->driver_priv;
> struct amdxdna_dev *xdna = to_xdna_dev(dev);
> struct amdxdna_drm_create_bo *args = data;
> struct amdxdna_gem_obj *abo;
> @@ -962,6 +1035,13 @@ int amdxdna_drm_create_bo_ioctl(struct drm_device *dev, void *data, struct drm_f
> break;
> case AMDXDNA_BO_DEV:
> abo = amdxdna_drm_create_dev_bo(dev, args, filp);
> + if (!IS_ERR(abo)) {
> + mutex_lock(&xdna->dev_lock);
> + ret = amdxdna_update_heap(client, NULL);
> + mutex_unlock(&xdna->dev_lock);
> + if (ret)
> + goto put_obj;
> + }
> break;
> default:
> return -EINVAL;
> @@ -985,14 +1065,11 @@ int amdxdna_drm_create_bo_ioctl(struct drm_device *dev, void *data, struct drm_f
> return ret;
> }
>
> -int amdxdna_gem_pin_nolock(struct amdxdna_gem_obj *abo)
> +static int amdxdna_bo_pin(struct amdxdna_gem_obj *abo)
> {
> struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev);
> int ret;
>
> - if (abo->type == AMDXDNA_BO_DEV)
> - abo = abo->client->dev_heap;
> -
> if (is_import_bo(abo))
> return 0;
>
> @@ -1002,6 +1079,45 @@ int amdxdna_gem_pin_nolock(struct amdxdna_gem_obj *abo)
> return ret;
> }
>
> +static void amdxdna_bo_unpin(struct amdxdna_gem_obj *abo)
> +{
> + struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev);
> +
> + if (is_import_bo(abo))
> + return;
> +
> + drm_gem_shmem_unpin(&abo->base);
> +
> + XDNA_DBG(xdna, "BO type %d", abo->type);
> +}
> +
> +int amdxdna_gem_pin_nolock(struct amdxdna_gem_obj *abo)
> +{
> + struct amdxdna_client *client = abo->client;
> + struct amdxdna_gem_obj *heap;
> + unsigned long heap_id, last = ULONG_MAX;
> + int ret = 0;
> +
> + if (abo->type != AMDXDNA_BO_DEV)
> + return amdxdna_bo_pin(abo);
> +
> + xa_for_each_range(&client->dev_heap_xa, heap_id, heap,
> + abo->heap_start_id, abo->heap_end_id) {
> + ret = amdxdna_bo_pin(heap);
> + if (ret)
> + break;
> + last = heap_id;
> + }
> +
> + if (ret && last <= abo->heap_end_id) {
> + xa_for_each_range(&client->dev_heap_xa, heap_id, heap,
> + abo->heap_start_id, last)
> + amdxdna_bo_unpin(heap);
> + }
> +
> + return ret;
> +}
> +
> int amdxdna_gem_pin(struct amdxdna_gem_obj *abo)
> {
> int ret;
> @@ -1015,14 +1131,18 @@ int amdxdna_gem_pin(struct amdxdna_gem_obj *abo)
>
> void amdxdna_gem_unpin(struct amdxdna_gem_obj *abo)
> {
> - if (abo->type == AMDXDNA_BO_DEV)
> - abo = abo->client->dev_heap;
> + mutex_lock(&abo->lock);
> + if (abo->type == AMDXDNA_BO_DEV) {
> + struct amdxdna_gem_obj *heap;
> + unsigned long heap_id;
>
> - if (is_import_bo(abo))
> - return;
> + xa_for_each_range(&abo->client->dev_heap_xa, heap_id, heap,
> + abo->heap_start_id, abo->heap_end_id)
> + amdxdna_bo_unpin(heap);
> + } else {
> + amdxdna_bo_unpin(abo);
> + }
>
> - mutex_lock(&abo->lock);
> - drm_gem_shmem_unpin(&abo->base);
> mutex_unlock(&abo->lock);
> }
>
> @@ -1079,6 +1199,29 @@ int amdxdna_drm_get_bo_info_ioctl(struct drm_device *dev, void *data, struct drm
> return ret;
> }
>
> +static int amdxdna_flush_bo(struct amdxdna_gem_obj *abo, u64 offset, u64 size)
> +{
> + u64 end;
> +
> + if (offset >= abo->mem.size)
> + return -EINVAL;
> +
> + if (check_add_overflow(offset, size, &end))
> + return -EINVAL;
> +
> + size = min(abo->mem.size, end) - offset;
> + if (is_import_bo(abo))
> + drm_clflush_sg(abo->base.sgt);
> + else if (amdxdna_gem_vmap(abo))
> + drm_clflush_virt_range(amdxdna_gem_vmap(abo) + offset, size);
> + else if (abo->base.pages)
> + drm_clflush_pages(abo->base.pages, abo->mem.size >> PAGE_SHIFT);
> + else
> + return -EINVAL;
> +
> + return 0;
> +}
> +
> /*
> * The sync bo ioctl is to make sure the CPU cache is in sync with memory.
> * This is required because NPU is not cache coherent device. CPU cache
> @@ -1089,11 +1232,12 @@ int amdxdna_drm_get_bo_info_ioctl(struct drm_device *dev, void *data, struct drm
> int amdxdna_drm_sync_bo_ioctl(struct drm_device *dev,
> void *data, struct drm_file *filp)
> {
> + struct amdxdna_client *client = filp->driver_priv;
> struct amdxdna_dev *xdna = to_xdna_dev(dev);
> struct amdxdna_drm_sync_bo *args = data;
> struct amdxdna_gem_obj *abo;
> struct drm_gem_object *gobj;
> - int ret;
> + int ret = 0;
>
> gobj = drm_gem_object_lookup(filp, args->handle);
> if (!gobj) {
> @@ -1102,22 +1246,45 @@ int amdxdna_drm_sync_bo_ioctl(struct drm_device *dev,
> }
> abo = to_xdna_obj(gobj);
>
> - ret = amdxdna_gem_pin(abo);
> - if (ret) {
> - XDNA_ERR(xdna, "Pin BO %d failed, ret %d", args->handle, ret);
> - goto put_obj;
> - }
> + if (abo->type == AMDXDNA_BO_DEV) {
> + struct amdxdna_gem_obj *heap;
> + unsigned long heap_id;
> + u64 bo_start = amdxdna_gem_dev_addr(abo);
> + u64 flush_start = bo_start + args->offset;
> + u64 flush_end = flush_start + args->size;
> +
> + xa_for_each_range(&client->dev_heap_xa, heap_id, heap,
> + abo->heap_start_id, abo->heap_end_id) {
> + u64 heap_start = amdxdna_gem_dev_addr(heap);
> + u64 heap_end = heap_start + heap->mem.size;
> + u64 start = max(flush_start, heap_start);
> + u64 end = min(flush_end, heap_end);
> +
> + if (start >= end)
> + continue;
> +
> + ret = amdxdna_flush_bo(heap, start - heap_start, end - start);
> + if (ret) {
> + XDNA_ERR(xdna, "Failed to flush heap %ld ret %d",
> + heap_id, ret);
> + goto put_obj;
> + }
> + }
> + } else {
> + ret = amdxdna_gem_pin(abo);
> + if (ret) {
> + XDNA_ERR(xdna, "Pin BO %d failed, ret %d", args->handle, ret);
> + goto put_obj;
> + }
>
> - if (is_import_bo(abo))
> - drm_clflush_sg(abo->base.sgt);
> - else if (amdxdna_gem_vmap(abo))
> - drm_clflush_virt_range(amdxdna_gem_vmap(abo) + args->offset, args->size);
> - else if (abo->base.pages)
> - drm_clflush_pages(abo->base.pages, gobj->size >> PAGE_SHIFT);
> - else
> - drm_WARN(&xdna->ddev, 1, "Can not get flush memory");
> + ret = amdxdna_flush_bo(abo, args->offset, args->size);
> + amdxdna_gem_unpin(abo);
>
> - amdxdna_gem_unpin(abo);
> + if (ret) {
> + drm_WARN(&xdna->ddev, 1, "Can not get flush memory");
> + goto put_obj;
> + }
> + }
>
> XDNA_DBG(xdna, "Sync bo %d offset 0x%llx, size 0x%llx\n",
> args->handle, args->offset, args->size);
> diff --git a/drivers/accel/amdxdna/amdxdna_gem.h b/drivers/accel/amdxdna/amdxdna_gem.h
> index 162e5499f5e0..ffdb18395a54 100644
> --- a/drivers/accel/amdxdna/amdxdna_gem.h
> +++ b/drivers/accel/amdxdna/amdxdna_gem.h
> @@ -46,8 +46,10 @@ struct amdxdna_gem_obj {
> int open_ref;
>
> /* Below members are initialized when needed */
> - struct drm_mm mm; /* For AMDXDNA_BO_DEV_HEAP */
> struct drm_mm_node mm_node; /* For AMDXDNA_BO_DEV */
> + u32 heap_start_id;
> + u32 heap_end_id;
> + u64 dev_addr; /* For heap bo */
> u32 assigned_hwctx;
> struct dma_buf *dma_buf;
> struct dma_buf_attachment *attach;
> @@ -73,13 +75,21 @@ static inline void amdxdna_gem_put_obj(struct amdxdna_gem_obj *abo)
> drm_gem_object_put(to_gobj(abo));
> }
>
> +/*
> + * Obtain the user virtual address for accessing the BO.
> + * It can be used for device to access the BO when PASID is enabled.
> + */
> +static inline u64 amdxdna_gem_uva(struct amdxdna_gem_obj *abo)
> +{
> + return abo->mem.uva;
> +}
> +
> void *amdxdna_gem_vmap(struct amdxdna_gem_obj *abo);
> -u64 amdxdna_gem_uva(struct amdxdna_gem_obj *abo);
> u64 amdxdna_gem_dev_addr(struct amdxdna_gem_obj *abo);
>
> static inline u64 amdxdna_dev_bo_offset(struct amdxdna_gem_obj *abo)
> {
> - return amdxdna_gem_dev_addr(abo) - amdxdna_gem_dev_addr(abo->client->dev_heap);
> + return amdxdna_gem_dev_addr(abo) - abo->client->xdna->dev_info->dev_mem_base;
> }
>
> static inline u64 amdxdna_obj_dma_addr(struct amdxdna_gem_obj *abo)
> diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.c b/drivers/accel/amdxdna/amdxdna_pci_drv.c
> index a6e9be7960c2..c677293c1ae7 100644
> --- a/drivers/accel/amdxdna/amdxdna_pci_drv.c
> +++ b/drivers/accel/amdxdna/amdxdna_pci_drv.c
> @@ -126,6 +126,9 @@ static int amdxdna_drm_open(struct drm_device *ddev, struct drm_file *filp)
> mmgrab(client->mm);
> init_srcu_struct(&client->hwctx_srcu);
> xa_init_flags(&client->hwctx_xa, XA_FLAGS_ALLOC);
> + xa_init_flags(&client->dev_heap_xa, XA_FLAGS_ALLOC);
> + drm_mm_init(&client->dev_heap_mm, xdna->dev_info->dev_mem_base,
> + xdna->dev_info->dev_heap_max_size);
> mutex_init(&client->mm_lock);
>
> mutex_lock(&xdna->dev_lock);
> @@ -141,13 +144,18 @@ static int amdxdna_drm_open(struct drm_device *ddev, struct drm_file *filp)
>
> static void amdxdna_client_cleanup(struct amdxdna_client *client)
> {
> + struct amdxdna_gem_obj *heap;
> + unsigned long heap_id;
> +
> list_del(&client->node);
> amdxdna_hwctx_remove_all(client);
> xa_destroy(&client->hwctx_xa);
> cleanup_srcu_struct(&client->hwctx_srcu);
>
> - if (client->dev_heap)
> - drm_gem_object_put(to_gobj(client->dev_heap));
> + xa_for_each(&client->dev_heap_xa, heap_id, heap)
> + drm_gem_object_put(to_gobj(heap));
> + xa_destroy(&client->dev_heap_xa);
> + drm_mm_takedown(&client->dev_heap_mm);
>
> mutex_destroy(&client->mm_lock);
> mmdrop(client->mm);
> diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.h b/drivers/accel/amdxdna/amdxdna_pci_drv.h
> index 471b72299aee..34271c14d359 100644
> --- a/drivers/accel/amdxdna/amdxdna_pci_drv.h
> +++ b/drivers/accel/amdxdna/amdxdna_pci_drv.h
> @@ -7,6 +7,7 @@
> #define _AMDXDNA_PCI_DRV_H_
>
> #include <drm/amdxdna_accel.h>
> +#include <drm/drm_mm.h>
> #include <drm/drm_print.h>
> #include <linux/iommu.h>
> #include <linux/iova.h>
> @@ -61,6 +62,7 @@ struct amdxdna_dev_ops {
> void (*hwctx_fini)(struct amdxdna_hwctx *hwctx);
> int (*hwctx_config)(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *buf, u32 size);
> int (*hwctx_sync_debug_bo)(struct amdxdna_hwctx *hwctx, u32 debug_bo_hdl);
> + int (*hwctx_heap_expand)(struct amdxdna_hwctx *hwctx, struct amdxdna_gem_obj *heap);
> void (*hmm_invalidate)(struct amdxdna_gem_obj *abo, unsigned long cur_seq);
> int (*cmd_submit)(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, u64 *seq);
> int (*cmd_wait)(struct amdxdna_hwctx *hwctx, u64 seq, u32 timeout);
> @@ -95,6 +97,7 @@ struct amdxdna_dev_info {
> size_t dev_mem_size;
> const char *default_vbnv;
> const struct amdxdna_rev_vbnv *rev_vbnv_tbl;
> + size_t dev_heap_max_size;
> const struct amdxdna_dev_priv *dev_priv;
> const struct amdxdna_fw_feature_tbl *fw_feature_tbl;
> const struct amdxdna_dev_ops *ops;
> @@ -153,7 +156,10 @@ struct amdxdna_client {
> struct drm_file *filp;
>
> struct mutex mm_lock; /* protect memory related */
> - struct amdxdna_gem_obj *dev_heap;
> + struct xarray dev_heap_xa;
> + struct drm_mm dev_heap_mm;
> + u32 dev_heap_nid;
> + size_t total_heap_size;
>
> struct iommu_sva *sva;
> int pasid;
> diff --git a/drivers/accel/amdxdna/npu1_regs.c b/drivers/accel/amdxdna/npu1_regs.c
> index 4e48c030a69f..ca779674017a 100644
> --- a/drivers/accel/amdxdna/npu1_regs.c
> +++ b/drivers/accel/amdxdna/npu1_regs.c
> @@ -139,6 +139,7 @@ const struct amdxdna_dev_info dev_npu1_info = {
> .dev_mem_base = AIE2_DEVM_BASE,
> .dev_mem_size = AIE2_DEVM_SIZE,
> .default_vbnv = "RyzenAI-npu1",
> + .dev_heap_max_size = AIE2_DEVM_SIZE,
> .device_type = AMDXDNA_DEV_TYPE_KMQ,
> .dev_priv = &npu1_dev_priv,
> .fw_feature_tbl = npu1_fw_feature_table,
> diff --git a/drivers/accel/amdxdna/npu4_regs.c b/drivers/accel/amdxdna/npu4_regs.c
> index eddc31803a50..15a161384625 100644
> --- a/drivers/accel/amdxdna/npu4_regs.c
> +++ b/drivers/accel/amdxdna/npu4_regs.c
> @@ -98,6 +98,7 @@ const struct amdxdna_fw_feature_tbl npu4_fw_feature_table[] = {
> { .features = BIT_U64(AIE2_NPU_COMMAND), .major = 6, .min_minor = 15 },
> { .features = BIT_U64(AIE2_UPDATE_PROPERTY), .major = 6, .min_minor = 15 },
> { .features = BIT_U64(AIE2_APP_HEALTH), .major = 6, .min_minor = 18 },
> + { .features = BIT_U64(AIE2_ADD_HOST_BUFFER), .major = 6, .min_minor = 18 },
> { .features = BIT_U64(AIE2_GET_DEV_REVISION), .major = 6, .min_minor = 24 },
> { .features = AIE2_ALL_FEATURES, .major = 7 },
> { 0 }
> @@ -200,6 +201,7 @@ const struct amdxdna_dev_info dev_npu4_info = {
> .dev_mem_base = AIE2_DEVM_BASE,
> .dev_mem_size = AIE2_DEVM_SIZE,
> .default_vbnv = "RyzenAI-npu4",
> + .dev_heap_max_size = AIE2_DEVM_MAX_SIZE,
> .device_type = AMDXDNA_DEV_TYPE_KMQ,
> .rev_vbnv_tbl = npu4_rev_vbnv_tbl,
> .dev_priv = &npu4_dev_priv,
> diff --git a/drivers/accel/amdxdna/npu5_regs.c b/drivers/accel/amdxdna/npu5_regs.c
> index a9102978e4a8..306b359d0cd3 100644
> --- a/drivers/accel/amdxdna/npu5_regs.c
> +++ b/drivers/accel/amdxdna/npu5_regs.c
> @@ -107,6 +107,7 @@ const struct amdxdna_dev_info dev_npu5_info = {
> .dev_mem_base = AIE2_DEVM_BASE,
> .dev_mem_size = AIE2_DEVM_SIZE,
> .default_vbnv = "RyzenAI-npu5",
> + .dev_heap_max_size = AIE2_DEVM_MAX_SIZE,
> .device_type = AMDXDNA_DEV_TYPE_KMQ,
> .rev_vbnv_tbl = npu4_rev_vbnv_tbl,
> .dev_priv = &npu5_dev_priv,
> diff --git a/drivers/accel/amdxdna/npu6_regs.c b/drivers/accel/amdxdna/npu6_regs.c
> index e0db3a09740b..e68637d2a228 100644
> --- a/drivers/accel/amdxdna/npu6_regs.c
> +++ b/drivers/accel/amdxdna/npu6_regs.c
> @@ -108,6 +108,7 @@ const struct amdxdna_dev_info dev_npu6_info = {
> .dev_mem_base = AIE2_DEVM_BASE,
> .dev_mem_size = AIE2_DEVM_SIZE,
> .default_vbnv = "RyzenAI-npu6",
> + .dev_heap_max_size = AIE2_DEVM_MAX_SIZE,
> .device_type = AMDXDNA_DEV_TYPE_KMQ,
> .rev_vbnv_tbl = npu4_rev_vbnv_tbl,
> .dev_priv = &npu6_dev_priv,
^ permalink raw reply [flat|nested] 4+ messages in thread* Claude review: accel/amdxdna: Add expandable device heap support
2026-05-15 16:19 [PATCH V2] accel/amdxdna: Add expandable device heap support Lizhi Hou
2026-05-15 21:01 ` Mario Limonciello
@ 2026-05-15 22:50 ` Claude Code Review Bot
2026-05-15 22:50 ` Claude Code Review Bot
2 siblings, 0 replies; 4+ messages in thread
From: Claude Code Review Bot @ 2026-05-15 22:50 UTC (permalink / raw)
To: dri-devel-reviews
Patch Review
**Locking: `amdxdna_hwctx_expand_heap` drops and re-takes `mm_lock` mid-iteration**
In `amdxdna_ctx.c`, `amdxdna_hwctx_expand_heap()` iterates the `dev_heap_xa` under `mm_lock`, but drops the lock to call `hwctx_heap_expand`:
```c
xa_for_each_range(&client->dev_heap_xa, heap_id, heap,
nid, client->dev_heap_nid) {
drm_gem_object_get(to_gobj(heap));
ret = amdxdna_gem_pin(heap);
...
mutex_unlock(&client->mm_lock);
ret = xdna->dev_info->ops->hwctx_heap_expand(hwctx, heap);
mutex_lock(&client->mm_lock);
...
```
Dropping `mm_lock` while iterating the xarray is dangerous. `client->dev_heap_nid` could change while the lock is dropped (another thread could add a new heap chunk). The `xa_for_each_range` macro caches its iteration state, but the upper bound `client->dev_heap_nid` is evaluated once. More importantly, if another concurrent call to `amdxdna_update_heap(client, NULL)` runs, you could get concurrent iteration + expansion on the same hwctx. The `dev_lock` is held in the `amdxdna_drm_create_bo_ioctl` path but not in the `aie2_hwctx_init` path, so this is a real concern.
**Off-by-one in `xa_for_each_range` upper bound**
In `aie2_hwctx_restart`:
```c
xa_for_each_range(&hwctx->client->dev_heap_xa, heap_id, heap, 1,
hwctx->last_attached_heap) {
```
And in `amdxdna_hwctx_release_expanded_heap`:
```c
xa_for_each_range(&client->dev_heap_xa, heap_id, heap, 1,
hwctx->last_attached_heap) {
```
These iterate from index 1 to `last_attached_heap` inclusive. But in `amdxdna_hwctx_expand_heap`, `last_attached_heap` is set to `heap_id` on success:
```c
hwctx->last_attached_heap = heap_id;
```
This looks correct for the restart/release paths — they replay from 1 (skipping the initial heap at 0) to the last expanded heap inclusive. However, `last_attached_heap` is initialized to 0 (by kzalloc), and the conditional in `amdxdna_hwctx_release_expanded_heap` checks `if (hwctx->last_attached_heap)` — this works but is fragile. If `last_attached_heap` were ever set to 0 for a different reason, the release path would skip cleanup.
**`amdxdna_update_heap` called from `amdxdna_drm_create_bo_ioctl` with wrong locking**
```c
case AMDXDNA_BO_DEV:
abo = amdxdna_drm_create_dev_bo(dev, args, filp);
if (!IS_ERR(abo)) {
mutex_lock(&xdna->dev_lock);
ret = amdxdna_update_heap(client, NULL);
mutex_unlock(&xdna->dev_lock);
if (ret)
goto put_obj;
}
```
`amdxdna_update_heap(client, NULL)` iterates all hwctxs with `amdxdna_for_each_hwctx`, which is `xa_for_each(&client->hwctx_xa, ...)`. In other callsites this xarray is iterated under SRCU read lock (`client->hwctx_srcu`), but here it's only under `dev_lock`. While `dev_lock` may prevent concurrent hwctx creation/destruction, the existing code consistently uses SRCU for hwctx xarray iteration. This is at minimum inconsistent and may be incorrect if other paths remove hwctxs without holding `dev_lock`.
**Missing pin in sync BO path for dev BOs**
The original `amdxdna_drm_sync_bo_ioctl` pinned the BO before flushing. In the new code, the AMDXDNA_BO_DEV path skips pinning entirely:
```c
if (abo->type == AMDXDNA_BO_DEV) {
...
xa_for_each_range(&client->dev_heap_xa, heap_id, heap, ...) {
...
ret = amdxdna_flush_bo(heap, start - heap_start, end - start);
```
The heap chunks are flushed without pinning them first. The original code pinned the dev BO (which redirected to pinning the heap). Without pinning, `amdxdna_gem_vmap()` or `abo->base.pages` may not be valid. The flush helper `amdxdna_flush_bo` tries `amdxdna_gem_vmap()` and `abo->base.pages`, but these may be NULL if the heap isn't pinned.
**`amdxdna_flush_bo` silently clamps size but logs no warning**
```c
static int amdxdna_flush_bo(struct amdxdna_gem_obj *abo, u64 offset, u64 size)
{
...
size = min(abo->mem.size, end) - offset;
if (is_import_bo(abo))
drm_clflush_sg(abo->base.sgt);
else if (amdxdna_gem_vmap(abo))
drm_clflush_virt_range(amdxdna_gem_vmap(abo) + offset, size);
else if (abo->base.pages)
drm_clflush_pages(abo->base.pages, abo->mem.size >> PAGE_SHIFT);
```
When the `pages` path is taken, it flushes `abo->mem.size >> PAGE_SHIFT` pages — the entire BO, ignoring `offset` and `size`. This was the same in the original code and is technically a pre-existing issue, but it's now been refactored into a helper that receives offset/size parameters suggesting it should respect them.
Also, the original code had `drm_WARN` for the "can not flush" case. The new `amdxdna_flush_bo` returns `-EINVAL` for that case, which changes the behavior from a warning to an ioctl error. The caller in the non-dev-BO path then issues `drm_WARN` on the error, which is fine. But the dev-BO path doesn't issue a `drm_WARN`, just logs an error — inconsistent.
**`aie2_send_host_buf_msgs` reuses `DECLARE_AIE_MSG` stack variables across loop iterations**
```c
static int aie2_send_host_buf_msgs(...)
{
DECLARE_AIE_MSG(map_host_buffer, MSG_OP_MAP_HOST_BUFFER);
...
msg.opcode = initial_opcode;
do {
req.context_id = context_id;
req.buf_addr = addr;
req.buf_size = chunk_size;
ret = aie_send_mgmt_msg_wait(&ndev->aie, &msg);
...
msg.opcode = MSG_OP_ADD_HOST_BUFFER;
} while (size);
```
The `DECLARE_AIE_MSG` macro declares `req` and `resp` as stack-initialized structures. This should be fine since `req` fields are fully re-assigned each iteration. However, the `msg` struct's `opcode` is overridden to `initial_opcode` before the loop, and set to `MSG_OP_ADD_HOST_BUFFER` after the first iteration. The `resp.status` is initialized to -1 by the macro, but after the first `aie_send_mgmt_msg_wait`, it may contain a different value — this should be re-initialized for subsequent iterations if the message handling checks it. Worth verifying.
**Error log removed from `amdxdna_drm_create_dev_bo`**
```c
ret = amdxdna_gem_heap_alloc(abo);
if (ret) {
- XDNA_ERR(xdna, "Failed to alloc dev bo memory, ret %d", ret);
amdxdna_gem_destroy_obj(abo);
return ERR_PTR(ret);
}
```
The error log was removed. Since `-EAGAIN` is now an expected return (signaling the need for heap expansion), this makes sense — but it means real allocation failures are also silent. Consider logging at `XDNA_DBG` level or only suppressing for `-EAGAIN`.
**`amdxdna_gem_uva` changed from function to inline but semantics changed**
The old `amdxdna_gem_uva` computed the UVA for `AMDXDNA_BO_DEV` objects dynamically from the heap's UVA + offset. The new version is a simple inline returning `abo->mem.uva`:
```c
static inline u64 amdxdna_gem_uva(struct amdxdna_gem_obj *abo)
{
return abo->mem.uva;
}
```
This means `mem.uva` must be correctly set for DEV BOs during `amdxdna_init_dev_bo`. Looking at that function:
```c
dev_bo->mem.uva = dev_bo->mm_node.start - heap_addr + exp_heap_uva;
```
This is set once at allocation time. If the heap's UVA changes after the dev BO is created (e.g., remap), the dev BO's UVA would be stale. In the old code, it was always computed live. This may be acceptable if heap UVA is immutable after mmap, but it's a subtle semantic change worth noting.
**`amdxdna_init_dev_bo` has a subtle underflow risk**
```c
for (heap_id = 0; heap_id < client->dev_heap_nid; heap_id++) {
heap = xa_load(&client->dev_heap_xa, heap_id);
...
heap_addr = amdxdna_gem_dev_addr(heap);
if (heap_addr > dev_bo->mm_node.start)
break;
}
heap_id--;
heap = xa_load(&client->dev_heap_xa, heap_id);
```
If `heap_id` is 0 and the first heap's `dev_addr` is already greater than `mm_node.start`, `heap_id` will be 0 when it's decremented, wrapping to `UINT_MAX` (it's `u32`). This would be a bug. This scenario shouldn't happen in practice since the first heap starts at `dev_mem_base` and the mm allocator also starts at `dev_mem_base`, but it's not defended against.
**`amdxdna_gem_pin_nolock` for DEV BOs doesn't hold `abo->lock`**
The new `amdxdna_gem_pin_nolock` for `AMDXDNA_BO_DEV` type iterates heap chunks and calls `amdxdna_bo_pin` on each. But `amdxdna_gem_pin` (the caller) takes `abo->lock` — the lock of the dev BO, not the heap BO. The underlying `drm_gem_shmem_pin` on the heap BO is called without the heap BO's lock. The old code redirected to `client->dev_heap` which was always the same object, and `amdxdna_gem_pin` would take its lock. Now the lock discipline has changed — the dev BO's lock is held but the individual heap BOs' locks are not.
**npu1 gets `dev_heap_max_size = AIE2_DEVM_SIZE` (64M) while others get `AIE2_DEVM_MAX_SIZE` (512M)**
This is intentional — npu1 is an older device that doesn't support heap expansion. But the code path that creates additional heap BOs doesn't check whether the firmware supports `MSG_OP_ADD_HOST_BUFFER` until `aie2_add_host_buf` is called. On npu1, `dev_heap_max_size` is 64M, same as `dev_mem_size`, so only one 64M chunk fits. This means expansion is effectively blocked by size, not by feature check. This works but is fragile — if someone changed the sizes without understanding this coupling, npu1 could try unsupported firmware commands.
**Summary**: The overall design is sound, but the locking around `amdxdna_hwctx_expand_heap` dropping `mm_lock` mid-xarray iteration needs careful scrutiny. The missing pin in the sync-BO path for dev BOs is likely a bug. The `heap_id--` underflow in `amdxdna_init_dev_bo` should be defended. The inconsistent lock discipline between `amdxdna_gem_pin_nolock` (holds dev BO lock but not heap BO locks) and the old code (which held the heap's lock) is a correctness concern.
---
Generated by Claude Code Patch Reviewer
^ permalink raw reply [flat|nested] 4+ messages in thread* Claude review: accel/amdxdna: Add expandable device heap support
2026-05-15 16:19 [PATCH V2] accel/amdxdna: Add expandable device heap support Lizhi Hou
2026-05-15 21:01 ` Mario Limonciello
2026-05-15 22:50 ` Claude review: " Claude Code Review Bot
@ 2026-05-15 22:50 ` Claude Code Review Bot
2 siblings, 0 replies; 4+ messages in thread
From: Claude Code Review Bot @ 2026-05-15 22:50 UTC (permalink / raw)
To: dri-devel-reviews
Overall Series Review
Subject: accel/amdxdna: Add expandable device heap support
Author: Lizhi Hou <lizhi.hou@amd.com>
Patches: 2
Reviewed: 2026-05-16T08:50:25.638886
---
This single patch converts the amdxdna driver from a single, fixed-size device heap per client to an expandable heap backed by an xarray of heap chunks. The design is reasonable: start with a small heap, grow on demand via userspace IOCTL, and register new chunks with firmware via `MSG_OP_ADD_HOST_BUFFER`. However, there are several locking concerns, a potential off-by-one in the xarray iteration range, missing pin/unpin symmetry for the sync BO path, and some questionable error handling changes that deserve attention.
The patch is a V2 fixing a memory leak and flush-BO scoping, but introduces new issues in those fixes and in the core heap expansion logic. It needs another revision.
---
---
Generated by Claude Code Patch Reviewer
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2026-05-15 22:50 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-05-15 16:19 [PATCH V2] accel/amdxdna: Add expandable device heap support Lizhi Hou
2026-05-15 21:01 ` Mario Limonciello
2026-05-15 22:50 ` Claude review: " Claude Code Review Bot
2026-05-15 22:50 ` Claude Code Review Bot
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox