public inbox for drm-ai-reviews@public-inbox.freedesktop.org
 help / color / mirror / Atom feed
From: Zhiping Zhang <zhipingz@meta.com>
To: Alex Williamson <alex@shazbot.org>,
	Jason Gunthorpe <jgg@ziepe.ca>, Leon Romanovsky <leon@kernel.org>
Cc: Bjorn Helgaas <helgaas@kernel.org>, <kvm@vger.kernel.org>,
	<linux-rdma@vger.kernel.org>, <linux-pci@vger.kernel.org>,
	<netdev@vger.kernel.org>, <dri-devel@lists.freedesktop.org>,
	Keith Busch <kbusch@kernel.org>, Yochai Cohen <yochai@nvidia.com>,
	Yishai Hadas <yishaih@nvidia.com>,
	Zhiping Zhang <zhipingz@meta.com>
Subject: [PATCH v3 2/2] RDMA/mlx5: get tph for p2p access when registering dma-buf mr
Date: Tue, 12 May 2026 11:47:49 -0700	[thread overview]
Message-ID: <20260512184755.4137227-3-zhipingz@meta.com> (raw)
In-Reply-To: <20260512184755.4137227-1-zhipingz@meta.com>

Query dma-buf TPH metadata when registering a dma-buf MR for peer-to-peer
access and translate the raw steering tag into an mlx5 steering tag
index. Factor mlx5_st_alloc_index_by_tag() out of mlx5_st_alloc_index()
so callers that already have a raw steering tag can allocate the
corresponding mlx5 index directly. Keep the DMAH path as the first
priority and only fall back to dma-buf metadata when no DMAH is supplied.

Add pcie_tph_get_st_width() so the mlx5 IB driver can query the
device's negotiated ST width without poking pci_dev::tph_req_type
directly (that field is gated by CONFIG_PCIE_TPH and would otherwise
break !CONFIG_PCIE_TPH builds). Pass the width to the dma-buf
get_tph() callback so the exporter can return the value that matches
the consumer's capability.

Pass the dma_buf pointer that the umem already resolved into
get_tph_mr_dmabuf() instead of re-resolving the user-supplied fd.
Re-resolving opens a TOCTOU where a concurrent dup2() can substitute a
different dma_buf between umem creation and TPH lookup.

Track the per-MR ownership of the allocated mlx5 ST index on
mlx5_ib_mr (dmabuf_st_index / dmabuf_st_owned) and release it once the
firmware mkey no longer references it. Both the cached path
(mlx5r_umr_revoke_mr_with_lock + ib_frmr_pool_push) and the
destroy_mkey path call mlx5_ib_mr_put_dmabuf_st() so the ST index does
not leak when the MR is reused from the FRMR pool.

Initialize ret in mlx5_st_create() so the cached steering-tag path
returns success cleanly under clang builds.

Signed-off-by: Zhiping Zhang <zhipingz@meta.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h          |  6 ++
 drivers/infiniband/hw/mlx5/mr.c               | 72 ++++++++++++++++++-
 .../net/ethernet/mellanox/mlx5/core/lib/st.c  | 27 ++++---
 drivers/pci/tph.c                             | 20 ++++++
 include/linux/mlx5/driver.h                   |  7 ++
 include/linux/pci-tph.h                       |  2 +
 6 files changed, 124 insertions(+), 10 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index e156dc4d7529..4ab867392267 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -721,6 +721,12 @@ struct mlx5_ib_mr {
 			u8 revoked :1;
 			/* Indicates previous dmabuf page fault occurred */
 			u8 dmabuf_faulted:1;
+			/* Set when the MR owns dmabuf_st_index and must
+			 * release it via mlx5_st_dealloc_index() once the
+			 * firmware mkey is no longer referencing it.
+			 */
+			u8 dmabuf_st_owned:1;
+			u16 dmabuf_st_index;
 			struct mlx5_ib_mkey null_mmkey;
 		};
 	};
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 3b6da45061a5..84d570f7cafb 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -38,6 +38,7 @@
 #include <linux/delay.h>
 #include <linux/dma-buf.h>
 #include <linux/dma-resv.h>
+#include <linux/pci-tph.h>
 #include <rdma/frmr_pools.h>
 #include <rdma/ib_umem_odp.h>
 #include "dm.h"
@@ -46,6 +47,8 @@
 #include "data_direct.h"
 #include "dmah.h"
 
+MODULE_IMPORT_NS("DMA_BUF");
+
 static int mkey_max_umr_order(struct mlx5_ib_dev *dev)
 {
 	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
@@ -899,6 +902,54 @@ static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
 	.invalidate_mappings = mlx5_ib_dmabuf_invalidate_cb,
 };
 
+/*
+ * Query TPH metadata from @dmabuf and translate the raw steering tag into
+ * an mlx5 ST index. On success *@st_index and *@ph are filled in and the
+ * caller owns *@st_index (release it with mlx5_st_dealloc_index() once the
+ * firmware mkey no longer references it). On any failure *@st_index and
+ * *@ph are left as the no-TPH defaults set by the caller.
+ *
+ * @dmabuf must already be referenced by the caller (e.g. via the umem's
+ * attachment) so we don't re-resolve the user's fd here and avoid a
+ * dup2() TOCTOU between umem creation and TPH lookup.
+ */
+static void get_tph_mr_dmabuf(struct mlx5_ib_dev *dev, struct dma_buf *dmabuf,
+			      u16 *st_index, u8 *ph)
+{
+	u16 steering_tag;
+	u8 st_width;
+	int ret;
+
+	if (!dmabuf->ops->get_tph)
+		return;
+
+	st_width = pcie_tph_get_st_width(dev->mdev->pdev);
+	if (!st_width)
+		return;
+
+	ret = dmabuf->ops->get_tph(dmabuf, &steering_tag, ph, st_width);
+	if (ret) {
+		mlx5_ib_dbg(dev, "get_tph failed (%d)\n", ret);
+		*ph = MLX5_IB_NO_PH;
+		return;
+	}
+
+	ret = mlx5_st_alloc_index_by_tag(dev->mdev, steering_tag, st_index);
+	if (ret) {
+		*ph = MLX5_IB_NO_PH;
+		mlx5_ib_dbg(dev, "st_alloc_index_by_tag failed (%d)\n", ret);
+	}
+}
+
+static void mlx5_ib_mr_put_dmabuf_st(struct mlx5_ib_mr *mr)
+{
+	if (mr->umem && mr->dmabuf_st_owned) {
+		mlx5_st_dealloc_index(mr_to_mdev(mr)->mdev,
+				      mr->dmabuf_st_index);
+		mr->dmabuf_st_owned = 0;
+	}
+}
+
 static struct ib_mr *
 reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
 		   u64 offset, u64 length, u64 virt_addr,
@@ -941,16 +992,26 @@ reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
 		ph = dmah->ph;
 		if (dmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS))
 			st_index = mdmah->st_index;
+	} else {
+		get_tph_mr_dmabuf(dev, umem_dmabuf->attach->dmabuf,
+				  &st_index, &ph);
 	}
 
 	mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
 				access_flags, access_mode,
 				st_index, ph);
 	if (IS_ERR(mr)) {
+		if (!dmah && st_index != MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX)
+			mlx5_st_dealloc_index(dev->mdev, st_index);
 		ib_umem_release(&umem_dmabuf->umem);
 		return ERR_CAST(mr);
 	}
 
+	if (!dmah && st_index != MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX) {
+		mr->dmabuf_st_index = st_index;
+		mr->dmabuf_st_owned = 1;
+	}
+
 	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
 
 	atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
@@ -1378,8 +1439,15 @@ static int mlx5r_handle_mkey_cleanup(struct mlx5_ib_mr *mr)
 	int ret;
 
 	if (mr->ibmr.frmr.pool && !mlx5_umr_revoke_mr_with_lock(mr) &&
-	    !ib_frmr_pool_push(mr->ibmr.device, &mr->ibmr))
+	    !ib_frmr_pool_push(mr->ibmr.device, &mr->ibmr)) {
+		/*
+		 * The mkey has been revoked and pushed back to the FRMR
+		 * pool: firmware no longer references dmabuf_st_index, so
+		 * release it now instead of leaking it across reuse.
+		 */
+		mlx5_ib_mr_put_dmabuf_st(mr);
 		return 0;
+	}
 
 	if (is_odp)
 		mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);
@@ -1400,6 +1468,8 @@ static int mlx5r_handle_mkey_cleanup(struct mlx5_ib_mr *mr)
 		dma_resv_unlock(
 			to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
 	}
+	if (!ret)
+		mlx5_ib_mr_put_dmabuf_st(mr);
 	return ret;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c
index 997be91f0a13..c5058557c7f0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c
@@ -29,7 +29,7 @@ struct mlx5_st *mlx5_st_create(struct mlx5_core_dev *dev)
 	u8 direct_mode = 0;
 	u16 num_entries;
 	u32 tbl_loc;
-	int ret;
+	int ret = 0;
 
 	if (!MLX5_CAP_GEN(dev, mkey_pcie_tph))
 		return NULL;
@@ -92,23 +92,18 @@ void mlx5_st_destroy(struct mlx5_core_dev *dev)
 	kfree(st);
 }
 
-int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
-			unsigned int cpu_uid, u16 *st_index)
+int mlx5_st_alloc_index_by_tag(struct mlx5_core_dev *dev, u16 tag,
+			       u16 *st_index)
 {
 	struct mlx5_st_idx_data *idx_data;
 	struct mlx5_st *st = dev->st;
 	unsigned long index;
 	u32 xa_id;
-	u16 tag;
-	int ret;
+	int ret = 0;
 
 	if (!st)
 		return -EOPNOTSUPP;
 
-	ret = pcie_tph_get_cpu_st(dev->pdev, mem_type, cpu_uid, &tag);
-	if (ret)
-		return ret;
-
 	if (st->direct_mode) {
 		*st_index = tag;
 		return 0;
@@ -152,6 +147,20 @@ int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
 	mutex_unlock(&st->lock);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(mlx5_st_alloc_index_by_tag);
+
+int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
+			unsigned int cpu_uid, u16 *st_index)
+{
+	u16 tag;
+	int ret;
+
+	ret = pcie_tph_get_cpu_st(dev->pdev, mem_type, cpu_uid, &tag);
+	if (ret)
+		return ret;
+
+	return mlx5_st_alloc_index_by_tag(dev, tag, st_index);
+}
 EXPORT_SYMBOL_GPL(mlx5_st_alloc_index);
 
 int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index)
diff --git a/drivers/pci/tph.c b/drivers/pci/tph.c
index 91145e8d9d95..644fb5b1f27c 100644
--- a/drivers/pci/tph.c
+++ b/drivers/pci/tph.c
@@ -174,6 +174,26 @@ u32 pcie_tph_get_st_table_loc(struct pci_dev *pdev)
 }
 EXPORT_SYMBOL(pcie_tph_get_st_table_loc);
 
+/**
+ * pcie_tph_get_st_width - Return the device's negotiated Steering Tag width
+ * @pdev: PCI device to query
+ *
+ * Return: 16 if the TPH Requester is enabled in Extended TPH mode, 8 if
+ * enabled in regular TPH mode, 0 if TPH is not enabled or supported.
+ */
+u8 pcie_tph_get_st_width(struct pci_dev *pdev)
+{
+	switch (pdev->tph_req_type) {
+	case PCI_TPH_REQ_TPH_ONLY:
+		return 8;
+	case PCI_TPH_REQ_EXT_TPH:
+		return 16;
+	default:
+		return 0;
+	}
+}
+EXPORT_SYMBOL(pcie_tph_get_st_width);
+
 /*
  * Return the size of ST table. If ST table is not in TPH Requester Extended
  * Capability space, return 0. Otherwise return the ST Table Size + 1.
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 04b96c5abb57..523a9ab0ae1e 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1166,10 +1166,17 @@ int mlx5_dm_sw_icm_dealloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type
 			   u64 length, u16 uid, phys_addr_t addr, u32 obj_id);
 
 #ifdef CONFIG_PCIE_TPH
+int mlx5_st_alloc_index_by_tag(struct mlx5_core_dev *dev, u16 tag,
+			       u16 *st_index);
 int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
 			unsigned int cpu_uid, u16 *st_index);
 int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index);
 #else
+static inline int mlx5_st_alloc_index_by_tag(struct mlx5_core_dev *dev,
+					     u16 tag, u16 *st_index)
+{
+	return -EOPNOTSUPP;
+}
 static inline int mlx5_st_alloc_index(struct mlx5_core_dev *dev,
 				      enum tph_mem_type mem_type,
 				      unsigned int cpu_uid, u16 *st_index)
diff --git a/include/linux/pci-tph.h b/include/linux/pci-tph.h
index be68cd17f2f8..679f94f68cef 100644
--- a/include/linux/pci-tph.h
+++ b/include/linux/pci-tph.h
@@ -30,6 +30,7 @@ void pcie_disable_tph(struct pci_dev *pdev);
 int pcie_enable_tph(struct pci_dev *pdev, int mode);
 u16 pcie_tph_get_st_table_size(struct pci_dev *pdev);
 u32 pcie_tph_get_st_table_loc(struct pci_dev *pdev);
+u8 pcie_tph_get_st_width(struct pci_dev *pdev);
 #else
 static inline int pcie_tph_set_st_entry(struct pci_dev *pdev,
 					unsigned int index, u16 tag)
@@ -41,6 +42,7 @@ static inline int pcie_tph_get_cpu_st(struct pci_dev *dev,
 static inline void pcie_disable_tph(struct pci_dev *pdev) { }
 static inline int pcie_enable_tph(struct pci_dev *pdev, int mode)
 { return -EINVAL; }
+static inline u8 pcie_tph_get_st_width(struct pci_dev *pdev) { return 0; }
 #endif
 
 #endif /* LINUX_PCI_TPH_H */
-- 
2.52.0


  parent reply	other threads:[~2026-05-12 18:59 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-12 18:47 [PATCH v3 0/2] vfio/dma-buf: add TPH support for peer-to-peer access Zhiping Zhang
2026-05-12 18:47 ` [PATCH v3 1/2] vfio: add dma-buf get_tph callback and DMA_BUF_TPH feature Zhiping Zhang
2026-05-13  1:33   ` fengchengwen
2026-05-14  6:08     ` Zhiping Zhang
2026-05-16  3:07   ` Claude review: " Claude Code Review Bot
2026-05-12 18:47 ` Zhiping Zhang [this message]
2026-05-13  1:49   ` [PATCH v3 2/2] RDMA/mlx5: get tph for p2p access when registering dma-buf mr fengchengwen
2026-05-13  6:37     ` Zhiping Zhang
2026-05-16  3:07   ` Claude review: " Claude Code Review Bot
2026-05-16  3:07 ` Claude review: vfio/dma-buf: add TPH support for peer-to-peer access Claude Code Review Bot

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260512184755.4137227-3-zhipingz@meta.com \
    --to=zhipingz@meta.com \
    --cc=alex@shazbot.org \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=helgaas@kernel.org \
    --cc=jgg@ziepe.ca \
    --cc=kbusch@kernel.org \
    --cc=kvm@vger.kernel.org \
    --cc=leon@kernel.org \
    --cc=linux-pci@vger.kernel.org \
    --cc=linux-rdma@vger.kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=yishaih@nvidia.com \
    --cc=yochai@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox