* [PATCH v4 1/3] vfio: add dma-buf get_tph callback and DMA_BUF_TPH feature
2026-05-19 20:13 [PATCH v4 0/3] vfio/dma-buf: add TPH support for peer-to-peer access Zhiping Zhang
@ 2026-05-19 20:13 ` Zhiping Zhang
2026-05-21 22:04 ` Alex Williamson
2026-05-25 12:27 ` Claude review: " Claude Code Review Bot
2026-05-19 20:13 ` [PATCH v4 2/3] PCI/TPH: expose the enabled TPH requester type Zhiping Zhang
` (2 subsequent siblings)
3 siblings, 2 replies; 14+ messages in thread
From: Zhiping Zhang @ 2026-05-19 20:13 UTC (permalink / raw)
To: Alex Williamson, Jason Gunthorpe, Leon Romanovsky
Cc: Bjorn Helgaas, kvm, linux-rdma, linux-pci, netdev, dri-devel,
Keith Busch, Yochai Cohen, Yishai Hadas, Zhiping Zhang
Add a dma-buf get_tph callback for exporters to return TPH
(TLP Processing Hints) metadata, and add VFIO_DEVICE_FEATURE_DMA_BUF_TPH
so userspace can attach that metadata to a VFIO-exported dma-buf.
8-bit ST and 16-bit Extended ST are distinct PCIe TPH namespaces; the
uAPI carries both with explicit validity flags so importers get the
value matching their requested width. SET is write-once per dma-buf;
the existing VFIO_DEVICE_FEATURE_DMA_BUF uAPI is unchanged.
Signed-off-by: Zhiping Zhang <zhipingz@meta.com>
---
drivers/vfio/pci/vfio_pci_core.c | 3 +
drivers/vfio/pci/vfio_pci_dmabuf.c | 134 +++++++++++++++++++++++++++--
drivers/vfio/pci/vfio_pci_priv.h | 12 +++
include/linux/dma-buf.h | 21 +++++
include/uapi/linux/vfio.h | 35 ++++++++
5 files changed, 198 insertions(+), 7 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 3f8d093aacf8..94aa6dd95701 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1534,6 +1534,9 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
case VFIO_DEVICE_FEATURE_DMA_BUF:
return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
+ case VFIO_DEVICE_FEATURE_DMA_BUF_TPH:
+ return vfio_pci_core_feature_dma_buf_tph(vdev, flags, arg,
+ argsz);
default:
return -ENOTTY;
}
diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
index f87fd32e4a01..be1c65385670 100644
--- a/drivers/vfio/pci/vfio_pci_dmabuf.c
+++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
@@ -19,7 +19,24 @@ struct vfio_pci_dma_buf {
u32 nr_ranges;
struct kref kref;
struct completion comp;
- u8 revoked : 1;
+ /*
+ * TPH metadata published by VFIO_DEVICE_FEATURE_DMA_BUF_TPH and
+ * consumed by the @get_tph dma-buf callback.
+ *
+ * @tph_flags is the publish/consume gate: writers populate
+ * @steering_tag, @steering_tag_ext and @ph first, then store
+ * @tph_flags with smp_store_release(); readers do
+ * smp_load_acquire(&tph_flags) before accessing the value fields.
+ * @tph_flags == 0 means "TPH not set". Writers publish a non-zero
+ * value only once per dma-buf and serialize via vdev->memory_lock;
+ * readers stay lockless to avoid AB-BA against the dma_resv_lock held
+ * by importers.
+ */
+ u32 tph_flags;
+ u16 steering_tag_ext;
+ u8 steering_tag;
+ u8 ph;
+ bool revoked;
};
static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
@@ -69,6 +86,36 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
return ret;
}
+static int vfio_pci_dma_buf_get_tph(struct dma_buf *dmabuf, u16 *steering_tag,
+ u8 *ph, u8 st_width)
+{
+ struct vfio_pci_dma_buf *priv = dmabuf->priv;
+ u32 flags;
+
+ /* Pair with the smp_store_release() in VFIO_DEVICE_FEATURE_DMA_BUF_TPH. */
+ flags = smp_load_acquire(&priv->tph_flags);
+ if (!flags)
+ return -EOPNOTSUPP;
+
+ switch (st_width) {
+ case 8:
+ if (!(flags & VFIO_DMA_BUF_TPH_ST))
+ return -EOPNOTSUPP;
+ *steering_tag = priv->steering_tag;
+ break;
+ case 16:
+ if (!(flags & VFIO_DMA_BUF_TPH_ST_EXT))
+ return -EOPNOTSUPP;
+ *steering_tag = priv->steering_tag_ext;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ *ph = priv->ph;
+ return 0;
+}
+
static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
struct sg_table *sgt,
enum dma_data_direction dir)
@@ -84,16 +131,17 @@ static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
{
struct vfio_pci_dma_buf *priv = dmabuf->priv;
+ struct vfio_pci_core_device *vdev = READ_ONCE(priv->vdev);
/*
* Either this or vfio_pci_dma_buf_cleanup() will remove from the list.
* The refcount prevents both.
*/
- if (priv->vdev) {
- down_write(&priv->vdev->memory_lock);
+ if (vdev) {
+ down_write(&vdev->memory_lock);
list_del_init(&priv->dmabufs_elm);
- up_write(&priv->vdev->memory_lock);
- vfio_device_put_registration(&priv->vdev->vdev);
+ up_write(&vdev->memory_lock);
+ vfio_device_put_registration(&vdev->vdev);
}
kfree(priv->phys_vec);
kfree(priv);
@@ -101,6 +149,7 @@ static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
.attach = vfio_pci_dma_buf_attach,
+ .get_tph = vfio_pci_dma_buf_get_tph,
.map_dma_buf = vfio_pci_dma_buf_map,
.unmap_dma_buf = vfio_pci_dma_buf_unmap,
.release = vfio_pci_dma_buf_release,
@@ -269,7 +318,7 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
goto err_free_priv;
}
- priv->vdev = vdev;
+ WRITE_ONCE(priv->vdev, vdev);
priv->nr_ranges = get_dma_buf.nr_ranges;
priv->size = length;
ret = vdev->pci_ops->get_dmabuf_phys(vdev, &priv->provider,
@@ -331,6 +380,77 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
return ret;
}
+int vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev,
+ u32 flags,
+ struct vfio_device_feature_dma_buf_tph __user *arg,
+ size_t argsz)
+{
+ struct vfio_device_feature_dma_buf_tph set_tph;
+ struct vfio_pci_dma_buf *priv;
+ struct dma_buf *dmabuf;
+ int ret;
+
+ ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
+ sizeof(set_tph));
+ if (ret != 1)
+ return ret;
+
+ if (copy_from_user(&set_tph, arg, sizeof(set_tph)))
+ return -EFAULT;
+
+ if (set_tph.flags & ~(VFIO_DMA_BUF_TPH_ST | VFIO_DMA_BUF_TPH_ST_EXT))
+ return -EINVAL;
+
+ if (!set_tph.flags)
+ return -EINVAL;
+
+ /* PCIe TLP Processing Hint is a 2-bit field. */
+ if (set_tph.ph & ~0x3)
+ return -EINVAL;
+
+ dmabuf = dma_buf_get(set_tph.dmabuf_fd);
+ if (IS_ERR(dmabuf))
+ return PTR_ERR(dmabuf);
+
+ if (dmabuf->ops != &vfio_pci_dmabuf_ops) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ priv = dmabuf->priv;
+ down_write(&vdev->memory_lock);
+ if (READ_ONCE(priv->vdev) != vdev) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /*
+ * TPH metadata is write-once per dma-buf so that lockless readers only
+ * have to observe a single release-published transition from 0 -> flags.
+ */
+ if (READ_ONCE(priv->tph_flags)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ priv->steering_tag = set_tph.steering_tag;
+ priv->steering_tag_ext = set_tph.steering_tag_ext;
+ priv->ph = set_tph.ph;
+ /*
+ * Publish the TPH values before the gate flag, so that lockless
+ * readers in vfio_pci_dma_buf_get_tph() see fully-initialized
+ * fields once they observe a non-zero tph_flags.
+ */
+ smp_store_release(&priv->tph_flags, set_tph.flags);
+ ret = 0;
+
+out_unlock:
+ up_write(&vdev->memory_lock);
+out_put:
+ dma_buf_put(dmabuf);
+ return ret;
+}
+
void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
{
struct vfio_pci_dma_buf *priv;
@@ -388,7 +508,7 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
dma_resv_lock(priv->dmabuf->resv, NULL);
list_del_init(&priv->dmabufs_elm);
- priv->vdev = NULL;
+ WRITE_ONCE(priv->vdev, NULL);
priv->revoked = true;
dma_buf_invalidate_mappings(priv->dmabuf);
dma_resv_wait_timeout(priv->dmabuf->resv,
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index fca9d0dfac90..c58f369be4b3 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -118,6 +118,10 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
struct vfio_device_feature_dma_buf __user *arg,
size_t argsz);
+int vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev,
+ u32 flags,
+ struct vfio_device_feature_dma_buf_tph __user *arg,
+ size_t argsz);
void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev);
void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked);
#else
@@ -128,6 +132,14 @@ vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
{
return -ENOTTY;
}
+
+static inline int
+vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev, u32 flags,
+ struct vfio_device_feature_dma_buf_tph __user *arg,
+ size_t argsz)
+{
+ return -ENOTTY;
+}
static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
{
}
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index d1203da56fc5..49eb6ad644a2 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -113,6 +113,27 @@ struct dma_buf_ops {
*/
void (*unpin)(struct dma_buf_attachment *attach);
+ /**
+ * @get_tph:
+ * @dmabuf: DMA buffer for which to retrieve TPH metadata
+ * @steering_tag: Returns the raw TPH steering tag for @st_width
+ * @ph: Returns the TPH processing hint (2-bit value)
+ * @st_width: Consumer's supported steering tag width in bits (8 or 16)
+ *
+ * Return the TPH (TLP Processing Hints) metadata associated with this
+ * DMA buffer for the requested steering-tag width. 8-bit ST and 16-bit
+ * Extended ST are distinct namespaces in the PCIe TPH ST table and may
+ * both be present with different values, so the exporter must select the
+ * value that matches @st_width and must not substitute one for the other.
+ *
+ * Return 0 on success, -EOPNOTSUPP if no metadata is available for the
+ * requested width, or -EINVAL if @st_width is not 8 or 16.
+ *
+ * This callback is optional.
+ */
+ int (*get_tph)(struct dma_buf *dmabuf, u16 *steering_tag, u8 *ph,
+ u8 st_width);
+
/**
* @map_dma_buf:
*
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 5de618a3a5ee..a9cb6cbc6ade 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1534,6 +1534,41 @@ struct vfio_device_feature_dma_buf {
*/
#define VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2 12
+/**
+ * Upon VFIO_DEVICE_FEATURE_SET associate TPH (TLP Processing Hints) metadata
+ * with a vfio-exported dma-buf. The dma-buf must have been created by
+ * VFIO_DEVICE_FEATURE_DMA_BUF on this device.
+ *
+ * dmabuf_fd is the file descriptor returned by VFIO_DEVICE_FEATURE_DMA_BUF.
+ *
+ * 8-bit ST (steering_tag) and 16-bit Extended ST (steering_tag_ext) are
+ * distinct namespaces in the PCIe TPH ST table and may both be present with
+ * different values. Userspace should populate the value(s) it has from the
+ * firmware ST table for this device and set the matching VFIO_DMA_BUF_TPH_ST /
+ * VFIO_DMA_BUF_TPH_ST_EXT bit in @flags. An importer requests a specific
+ * width and receives the matching value; if the requested width is not
+ * present, the importer is told TPH is unavailable for this dma-buf.
+ *
+ * ph is the 2-bit TLP Processing Hint and must be in the range [0, 3].
+ *
+ * The user must set TPH on the dma-buf before the importer consumes it.
+ * TPH metadata is write-once per dma-buf; a second SET returns -EBUSY.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+#define VFIO_DEVICE_FEATURE_DMA_BUF_TPH 13
+
+#define VFIO_DMA_BUF_TPH_ST (1 << 0) /* steering_tag valid */
+#define VFIO_DMA_BUF_TPH_ST_EXT (1 << 1) /* steering_tag_ext valid */
+
+struct vfio_device_feature_dma_buf_tph {
+ __s32 dmabuf_fd;
+ __u32 flags;
+ __u8 steering_tag;
+ __u8 ph;
+ __u16 steering_tag_ext;
+};
+
/* -------- API for Type1 VFIO IOMMU -------- */
/**
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 14+ messages in thread* Re: [PATCH v4 1/3] vfio: add dma-buf get_tph callback and DMA_BUF_TPH feature
2026-05-19 20:13 ` [PATCH v4 1/3] vfio: add dma-buf get_tph callback and DMA_BUF_TPH feature Zhiping Zhang
@ 2026-05-21 22:04 ` Alex Williamson
2026-05-21 22:24 ` Alex Williamson
2026-05-22 23:53 ` Zhiping Zhang
2026-05-25 12:27 ` Claude review: " Claude Code Review Bot
1 sibling, 2 replies; 14+ messages in thread
From: Alex Williamson @ 2026-05-21 22:04 UTC (permalink / raw)
To: Zhiping Zhang
Cc: Jason Gunthorpe, Leon Romanovsky, Bjorn Helgaas, kvm, linux-rdma,
linux-pci, netdev, dri-devel, Keith Busch, Yochai Cohen,
Yishai Hadas, alex
On Tue, 19 May 2026 13:13:49 -0700
Zhiping Zhang <zhipingz@meta.com> wrote:
> Add a dma-buf get_tph callback for exporters to return TPH
> (TLP Processing Hints) metadata, and add VFIO_DEVICE_FEATURE_DMA_BUF_TPH
> so userspace can attach that metadata to a VFIO-exported dma-buf.
This should be two patches, the first extending the dma-buf framework
for the get_tph callback for explicit approval from dma-buf maintainers
(who are not even copied here). The second the vfio-pci implementation
of get_tph.
> 8-bit ST and 16-bit Extended ST are distinct PCIe TPH namespaces; the
> uAPI carries both with explicit validity flags so importers get the
> value matching their requested width. SET is write-once per dma-buf;
> the existing VFIO_DEVICE_FEATURE_DMA_BUF uAPI is unchanged.
I didn't see what motivated this write-once change, I thought we
understood that it was a userspace problem that the tph values need to
be set before providing the dma-buf fd to the importer and that races
relative to that are a userspace ordering problem. Write-once seems
unnecessarily restrictive and there's no justification provided here.
> Signed-off-by: Zhiping Zhang <zhipingz@meta.com>
> ---
> drivers/vfio/pci/vfio_pci_core.c | 3 +
> drivers/vfio/pci/vfio_pci_dmabuf.c | 134 +++++++++++++++++++++++++++--
> drivers/vfio/pci/vfio_pci_priv.h | 12 +++
> include/linux/dma-buf.h | 21 +++++
> include/uapi/linux/vfio.h | 35 ++++++++
> 5 files changed, 198 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 3f8d093aacf8..94aa6dd95701 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -1534,6 +1534,9 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
> return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
> case VFIO_DEVICE_FEATURE_DMA_BUF:
> return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
> + case VFIO_DEVICE_FEATURE_DMA_BUF_TPH:
> + return vfio_pci_core_feature_dma_buf_tph(vdev, flags, arg,
> + argsz);
> default:
> return -ENOTTY;
> }
> diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
> index f87fd32e4a01..be1c65385670 100644
> --- a/drivers/vfio/pci/vfio_pci_dmabuf.c
> +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> @@ -19,7 +19,24 @@ struct vfio_pci_dma_buf {
> u32 nr_ranges;
> struct kref kref;
> struct completion comp;
> - u8 revoked : 1;
> + /*
> + * TPH metadata published by VFIO_DEVICE_FEATURE_DMA_BUF_TPH and
> + * consumed by the @get_tph dma-buf callback.
> + *
> + * @tph_flags is the publish/consume gate: writers populate
> + * @steering_tag, @steering_tag_ext and @ph first, then store
> + * @tph_flags with smp_store_release(); readers do
> + * smp_load_acquire(&tph_flags) before accessing the value fields.
> + * @tph_flags == 0 means "TPH not set". Writers publish a non-zero
> + * value only once per dma-buf and serialize via vdev->memory_lock;
> + * readers stay lockless to avoid AB-BA against the dma_resv_lock held
> + * by importers.
> + */
Can you outline the ABBA hazard, I'm not seeing it. You're acquiring
memory_lock in the feature SET and dma_resv_lock doesn't appear to be
held when calling .get_tph(). There's a lot of lockless complication
here balanced on this claim of avoiding a hazard that doesn't appear
present.
> + u32 tph_flags;
> + u16 steering_tag_ext;
> + u8 steering_tag;
> + u8 ph;
> + bool revoked;
If we still used memory_lock for tph, these could be:
u8 tph_st_valid:1; /* memory_lock */
u8 tph_st_ext_valid:1; /* memory_lock */
u8 tph_ph:2; /* memory_lock */
u8 tph_st;
u16 tph_st_ext;
u8 revoked:1; /* dma_resv_lock */
The existing change of @revoked from bitfield to bool has no rationale
noted for it in the commit log.
> };
>
> static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
> @@ -69,6 +86,36 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
> return ret;
> }
>
> +static int vfio_pci_dma_buf_get_tph(struct dma_buf *dmabuf, u16 *steering_tag,
> + u8 *ph, u8 st_width)
> +{
> + struct vfio_pci_dma_buf *priv = dmabuf->priv;
> + u32 flags;
> +
> + /* Pair with the smp_store_release() in VFIO_DEVICE_FEATURE_DMA_BUF_TPH. */
> + flags = smp_load_acquire(&priv->tph_flags);
> + if (!flags)
> + return -EOPNOTSUPP;
> +
> + switch (st_width) {
> + case 8:
> + if (!(flags & VFIO_DMA_BUF_TPH_ST))
> + return -EOPNOTSUPP;
> + *steering_tag = priv->steering_tag;
> + break;
> + case 16:
> + if (!(flags & VFIO_DMA_BUF_TPH_ST_EXT))
> + return -EOPNOTSUPP;
> + *steering_tag = priv->steering_tag_ext;
> + break;
> + default:
> + return -EINVAL;
> + }
> +
> + *ph = priv->ph;
> + return 0;
> +}
> +
> static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
> struct sg_table *sgt,
> enum dma_data_direction dir)
> @@ -84,16 +131,17 @@ static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
> static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
> {
> struct vfio_pci_dma_buf *priv = dmabuf->priv;
> + struct vfio_pci_core_device *vdev = READ_ONCE(priv->vdev);
>
> /*
> * Either this or vfio_pci_dma_buf_cleanup() will remove from the list.
> * The refcount prevents both.
> */
> - if (priv->vdev) {
> - down_write(&priv->vdev->memory_lock);
> + if (vdev) {
> + down_write(&vdev->memory_lock);
> list_del_init(&priv->dmabufs_elm);
> - up_write(&priv->vdev->memory_lock);
> - vfio_device_put_registration(&priv->vdev->vdev);
> + up_write(&vdev->memory_lock);
> + vfio_device_put_registration(&vdev->vdev);
> }
> kfree(priv->phys_vec);
> kfree(priv);
This seems unnecessary. I think this is just because priv->vdev is now
(unnecessarily) set via WRITE_ONCE, right? These are very well ordered
paths, prior to exposing the dma-buf, while the device is opened, during
release, after release. They don't seem to need the READ/WRITE_ONCE
treatment. This looks like noise from trying to make it lockless.
> @@ -101,6 +149,7 @@ static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
>
> static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
> .attach = vfio_pci_dma_buf_attach,
> + .get_tph = vfio_pci_dma_buf_get_tph,
> .map_dma_buf = vfio_pci_dma_buf_map,
> .unmap_dma_buf = vfio_pci_dma_buf_unmap,
> .release = vfio_pci_dma_buf_release,
> @@ -269,7 +318,7 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> goto err_free_priv;
> }
>
> - priv->vdev = vdev;
> + WRITE_ONCE(priv->vdev, vdev);
> priv->nr_ranges = get_dma_buf.nr_ranges;
> priv->size = length;
> ret = vdev->pci_ops->get_dmabuf_phys(vdev, &priv->provider,
> @@ -331,6 +380,77 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> return ret;
> }
>
> +int vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev,
> + u32 flags,
> + struct vfio_device_feature_dma_buf_tph __user *arg,
> + size_t argsz)
> +{
> + struct vfio_device_feature_dma_buf_tph set_tph;
> + struct vfio_pci_dma_buf *priv;
> + struct dma_buf *dmabuf;
> + int ret;
> +
> + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
> + sizeof(set_tph));
> + if (ret != 1)
> + return ret;
> +
> + if (copy_from_user(&set_tph, arg, sizeof(set_tph)))
> + return -EFAULT;
> +
> + if (set_tph.flags & ~(VFIO_DMA_BUF_TPH_ST | VFIO_DMA_BUF_TPH_ST_EXT))
> + return -EINVAL;
> +
> + if (!set_tph.flags)
> + return -EINVAL;
> +
> + /* PCIe TLP Processing Hint is a 2-bit field. */
> + if (set_tph.ph & ~0x3)
> + return -EINVAL;
> +
> + dmabuf = dma_buf_get(set_tph.dmabuf_fd);
> + if (IS_ERR(dmabuf))
> + return PTR_ERR(dmabuf);
> +
> + if (dmabuf->ops != &vfio_pci_dmabuf_ops) {
> + ret = -EINVAL;
> + goto out_put;
> + }
> +
> + priv = dmabuf->priv;
> + down_write(&vdev->memory_lock);
> + if (READ_ONCE(priv->vdev) != vdev) {
> + ret = -EINVAL;
> + goto out_unlock;
> + }
> +
> + /*
> + * TPH metadata is write-once per dma-buf so that lockless readers only
> + * have to observe a single release-published transition from 0 -> flags.
> + */
> + if (READ_ONCE(priv->tph_flags)) {
> + ret = -EBUSY;
> + goto out_unlock;
> + }
> +
> + priv->steering_tag = set_tph.steering_tag;
> + priv->steering_tag_ext = set_tph.steering_tag_ext;
> + priv->ph = set_tph.ph;
> + /*
> + * Publish the TPH values before the gate flag, so that lockless
> + * readers in vfio_pci_dma_buf_get_tph() see fully-initialized
> + * fields once they observe a non-zero tph_flags.
> + */
> + smp_store_release(&priv->tph_flags, set_tph.flags);
> + ret = 0;
> +
> +out_unlock:
> + up_write(&vdev->memory_lock);
> +out_put:
> + dma_buf_put(dmabuf);
> + return ret;
> +}
> +
> void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
> {
> struct vfio_pci_dma_buf *priv;
> @@ -388,7 +508,7 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
>
> dma_resv_lock(priv->dmabuf->resv, NULL);
> list_del_init(&priv->dmabufs_elm);
> - priv->vdev = NULL;
> + WRITE_ONCE(priv->vdev, NULL);
> priv->revoked = true;
> dma_buf_invalidate_mappings(priv->dmabuf);
> dma_resv_wait_timeout(priv->dmabuf->resv,
> diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
> index fca9d0dfac90..c58f369be4b3 100644
> --- a/drivers/vfio/pci/vfio_pci_priv.h
> +++ b/drivers/vfio/pci/vfio_pci_priv.h
> @@ -118,6 +118,10 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
> int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> struct vfio_device_feature_dma_buf __user *arg,
> size_t argsz);
> +int vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev,
> + u32 flags,
> + struct vfio_device_feature_dma_buf_tph __user *arg,
> + size_t argsz);
> void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev);
> void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked);
> #else
> @@ -128,6 +132,14 @@ vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> {
> return -ENOTTY;
> }
> +
> +static inline int
> +vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev, u32 flags,
> + struct vfio_device_feature_dma_buf_tph __user *arg,
> + size_t argsz)
> +{
> + return -ENOTTY;
> +}
> static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
> {
> }
> diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
> index d1203da56fc5..49eb6ad644a2 100644
> --- a/include/linux/dma-buf.h
> +++ b/include/linux/dma-buf.h
> @@ -113,6 +113,27 @@ struct dma_buf_ops {
> */
> void (*unpin)(struct dma_buf_attachment *attach);
>
> + /**
> + * @get_tph:
> + * @dmabuf: DMA buffer for which to retrieve TPH metadata
> + * @steering_tag: Returns the raw TPH steering tag for @st_width
> + * @ph: Returns the TPH processing hint (2-bit value)
> + * @st_width: Consumer's supported steering tag width in bits (8 or 16)
> + *
> + * Return the TPH (TLP Processing Hints) metadata associated with this
> + * DMA buffer for the requested steering-tag width. 8-bit ST and 16-bit
> + * Extended ST are distinct namespaces in the PCIe TPH ST table and may
> + * both be present with different values, so the exporter must select the
> + * value that matches @st_width and must not substitute one for the other.
> + *
> + * Return 0 on success, -EOPNOTSUPP if no metadata is available for the
> + * requested width, or -EINVAL if @st_width is not 8 or 16.
> + *
> + * This callback is optional.
> + */
> + int (*get_tph)(struct dma_buf *dmabuf, u16 *steering_tag, u8 *ph,
> + u8 st_width);
> +
> /**
> * @map_dma_buf:
> *
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 5de618a3a5ee..a9cb6cbc6ade 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1534,6 +1534,41 @@ struct vfio_device_feature_dma_buf {
> */
> #define VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2 12
>
> +/**
> + * Upon VFIO_DEVICE_FEATURE_SET associate TPH (TLP Processing Hints) metadata
> + * with a vfio-exported dma-buf. The dma-buf must have been created by
> + * VFIO_DEVICE_FEATURE_DMA_BUF on this device.
> + *
> + * dmabuf_fd is the file descriptor returned by VFIO_DEVICE_FEATURE_DMA_BUF.
> + *
> + * 8-bit ST (steering_tag) and 16-bit Extended ST (steering_tag_ext) are
> + * distinct namespaces in the PCIe TPH ST table and may both be present with
> + * different values. Userspace should populate the value(s) it has from the
> + * firmware ST table for this device and set the matching VFIO_DMA_BUF_TPH_ST /
> + * VFIO_DMA_BUF_TPH_ST_EXT bit in @flags. An importer requests a specific
> + * width and receives the matching value; if the requested width is not
> + * present, the importer is told TPH is unavailable for this dma-buf.
> + *
> + * ph is the 2-bit TLP Processing Hint and must be in the range [0, 3].
> + *
> + * The user must set TPH on the dma-buf before the importer consumes it.
> + * TPH metadata is write-once per dma-buf; a second SET returns -EBUSY.
> + *
> + * Return: 0 on success, -errno on failure.
> + */
> +#define VFIO_DEVICE_FEATURE_DMA_BUF_TPH 13
> +
> +#define VFIO_DMA_BUF_TPH_ST (1 << 0) /* steering_tag valid */
> +#define VFIO_DMA_BUF_TPH_ST_EXT (1 << 1) /* steering_tag_ext valid */
> +
> +struct vfio_device_feature_dma_buf_tph {
> + __s32 dmabuf_fd;
> + __u32 flags;
> + __u8 steering_tag;
> + __u8 ph;
> + __u16 steering_tag_ext;
> +};
Sure is tempting to make the ph field the first 2-bits of u8 flags.
Thanks,
Alex
> +
> /* -------- API for Type1 VFIO IOMMU -------- */
>
> /**
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [PATCH v4 1/3] vfio: add dma-buf get_tph callback and DMA_BUF_TPH feature
2026-05-21 22:04 ` Alex Williamson
@ 2026-05-21 22:24 ` Alex Williamson
2026-05-23 1:03 ` Zhiping Zhang
2026-05-22 23:53 ` Zhiping Zhang
1 sibling, 1 reply; 14+ messages in thread
From: Alex Williamson @ 2026-05-21 22:24 UTC (permalink / raw)
To: Zhiping Zhang
Cc: Jason Gunthorpe, Leon Romanovsky, Bjorn Helgaas, kvm, linux-rdma,
linux-pci, netdev, dri-devel, Keith Busch, Yochai Cohen,
Yishai Hadas, alex
On Thu, 21 May 2026 16:04:12 -0600
Alex Williamson <alex@shazbot.org> wrote:
> On Tue, 19 May 2026 13:13:49 -0700
> Zhiping Zhang <zhipingz@meta.com> wrote:
>
> > Add a dma-buf get_tph callback for exporters to return TPH
> > (TLP Processing Hints) metadata, and add VFIO_DEVICE_FEATURE_DMA_BUF_TPH
> > so userspace can attach that metadata to a VFIO-exported dma-buf.
>
> This should be two patches, the first extending the dma-buf framework
> for the get_tph callback for explicit approval from dma-buf maintainers
> (who are not even copied here). The second the vfio-pci implementation
> of get_tph.
>
> > 8-bit ST and 16-bit Extended ST are distinct PCIe TPH namespaces; the
> > uAPI carries both with explicit validity flags so importers get the
> > value matching their requested width. SET is write-once per dma-buf;
> > the existing VFIO_DEVICE_FEATURE_DMA_BUF uAPI is unchanged.
>
> I didn't see what motivated this write-once change, I thought we
> understood that it was a userspace problem that the tph values need to
> be set before providing the dma-buf fd to the importer and that races
> relative to that are a userspace ordering problem. Write-once seems
> unnecessarily restrictive and there's no justification provided here.
>
> > Signed-off-by: Zhiping Zhang <zhipingz@meta.com>
> > ---
> > drivers/vfio/pci/vfio_pci_core.c | 3 +
> > drivers/vfio/pci/vfio_pci_dmabuf.c | 134 +++++++++++++++++++++++++++--
> > drivers/vfio/pci/vfio_pci_priv.h | 12 +++
> > include/linux/dma-buf.h | 21 +++++
> > include/uapi/linux/vfio.h | 35 ++++++++
> > 5 files changed, 198 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> > index 3f8d093aacf8..94aa6dd95701 100644
> > --- a/drivers/vfio/pci/vfio_pci_core.c
> > +++ b/drivers/vfio/pci/vfio_pci_core.c
> > @@ -1534,6 +1534,9 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
> > return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
> > case VFIO_DEVICE_FEATURE_DMA_BUF:
> > return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
> > + case VFIO_DEVICE_FEATURE_DMA_BUF_TPH:
> > + return vfio_pci_core_feature_dma_buf_tph(vdev, flags, arg,
> > + argsz);
> > default:
> > return -ENOTTY;
> > }
> > diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
> > index f87fd32e4a01..be1c65385670 100644
> > --- a/drivers/vfio/pci/vfio_pci_dmabuf.c
> > +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> > @@ -19,7 +19,24 @@ struct vfio_pci_dma_buf {
> > u32 nr_ranges;
> > struct kref kref;
> > struct completion comp;
> > - u8 revoked : 1;
> > + /*
> > + * TPH metadata published by VFIO_DEVICE_FEATURE_DMA_BUF_TPH and
> > + * consumed by the @get_tph dma-buf callback.
> > + *
> > + * @tph_flags is the publish/consume gate: writers populate
> > + * @steering_tag, @steering_tag_ext and @ph first, then store
> > + * @tph_flags with smp_store_release(); readers do
> > + * smp_load_acquire(&tph_flags) before accessing the value fields.
> > + * @tph_flags == 0 means "TPH not set". Writers publish a non-zero
> > + * value only once per dma-buf and serialize via vdev->memory_lock;
> > + * readers stay lockless to avoid AB-BA against the dma_resv_lock held
> > + * by importers.
> > + */
>
> Can you outline the ABBA hazard, I'm not seeing it. You're acquiring
> memory_lock in the feature SET and dma_resv_lock doesn't appear to be
> held when calling .get_tph(). There's a lot of lockless complication
> here balanced on this claim of avoiding a hazard that doesn't appear
> present.
>
> > + u32 tph_flags;
> > + u16 steering_tag_ext;
> > + u8 steering_tag;
> > + u8 ph;
> > + bool revoked;
>
> If we still used memory_lock for tph, these could be:
>
> u8 tph_st_valid:1; /* memory_lock */
> u8 tph_st_ext_valid:1; /* memory_lock */
> u8 tph_ph:2; /* memory_lock */
> u8 tph_st;
> u16 tph_st_ext;
> u8 revoked:1; /* dma_resv_lock */
>
> The existing change of @revoked from bitfield to bool has no rationale
> noted for it in the commit log.
On second thought, what dependency does anything here have on
memory_lock? I think we're jumping through hoops to avoid a lock we
don't even need. If we just want to serialize SET vs get_tph we could
have a mutex on the dma-buf structure, or use RCU if we want to manage
it locklessly and make sure get_tph always sees a fully consistent set
of values. Thanks,
Alex
> > };
> >
> > static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
> > @@ -69,6 +86,36 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
> > return ret;
> > }
> >
> > +static int vfio_pci_dma_buf_get_tph(struct dma_buf *dmabuf, u16 *steering_tag,
> > + u8 *ph, u8 st_width)
> > +{
> > + struct vfio_pci_dma_buf *priv = dmabuf->priv;
> > + u32 flags;
> > +
> > + /* Pair with the smp_store_release() in VFIO_DEVICE_FEATURE_DMA_BUF_TPH. */
> > + flags = smp_load_acquire(&priv->tph_flags);
> > + if (!flags)
> > + return -EOPNOTSUPP;
> > +
> > + switch (st_width) {
> > + case 8:
> > + if (!(flags & VFIO_DMA_BUF_TPH_ST))
> > + return -EOPNOTSUPP;
> > + *steering_tag = priv->steering_tag;
> > + break;
> > + case 16:
> > + if (!(flags & VFIO_DMA_BUF_TPH_ST_EXT))
> > + return -EOPNOTSUPP;
> > + *steering_tag = priv->steering_tag_ext;
> > + break;
> > + default:
> > + return -EINVAL;
> > + }
> > +
> > + *ph = priv->ph;
> > + return 0;
> > +}
> > +
> > static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
> > struct sg_table *sgt,
> > enum dma_data_direction dir)
> > @@ -84,16 +131,17 @@ static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
> > static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
> > {
> > struct vfio_pci_dma_buf *priv = dmabuf->priv;
> > + struct vfio_pci_core_device *vdev = READ_ONCE(priv->vdev);
> >
> > /*
> > * Either this or vfio_pci_dma_buf_cleanup() will remove from the list.
> > * The refcount prevents both.
> > */
> > - if (priv->vdev) {
> > - down_write(&priv->vdev->memory_lock);
> > + if (vdev) {
> > + down_write(&vdev->memory_lock);
> > list_del_init(&priv->dmabufs_elm);
> > - up_write(&priv->vdev->memory_lock);
> > - vfio_device_put_registration(&priv->vdev->vdev);
> > + up_write(&vdev->memory_lock);
> > + vfio_device_put_registration(&vdev->vdev);
> > }
> > kfree(priv->phys_vec);
> > kfree(priv);
>
>
> This seems unnecessary. I think this is just because priv->vdev is now
> (unnecessarily) set via WRITE_ONCE, right? These are very well ordered
> paths, prior to exposing the dma-buf, while the device is opened, during
> release, after release. They don't seem to need the READ/WRITE_ONCE
> treatment. This looks like noise from trying to make it lockless.
>
>
> > @@ -101,6 +149,7 @@ static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
> >
> > static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
> > .attach = vfio_pci_dma_buf_attach,
> > + .get_tph = vfio_pci_dma_buf_get_tph,
> > .map_dma_buf = vfio_pci_dma_buf_map,
> > .unmap_dma_buf = vfio_pci_dma_buf_unmap,
> > .release = vfio_pci_dma_buf_release,
> > @@ -269,7 +318,7 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> > goto err_free_priv;
> > }
> >
> > - priv->vdev = vdev;
> > + WRITE_ONCE(priv->vdev, vdev);
> > priv->nr_ranges = get_dma_buf.nr_ranges;
> > priv->size = length;
> > ret = vdev->pci_ops->get_dmabuf_phys(vdev, &priv->provider,
> > @@ -331,6 +380,77 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> > return ret;
> > }
> >
> > +int vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev,
> > + u32 flags,
> > + struct vfio_device_feature_dma_buf_tph __user *arg,
> > + size_t argsz)
> > +{
> > + struct vfio_device_feature_dma_buf_tph set_tph;
> > + struct vfio_pci_dma_buf *priv;
> > + struct dma_buf *dmabuf;
> > + int ret;
> > +
> > + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
> > + sizeof(set_tph));
> > + if (ret != 1)
> > + return ret;
> > +
> > + if (copy_from_user(&set_tph, arg, sizeof(set_tph)))
> > + return -EFAULT;
> > +
> > + if (set_tph.flags & ~(VFIO_DMA_BUF_TPH_ST | VFIO_DMA_BUF_TPH_ST_EXT))
> > + return -EINVAL;
> > +
> > + if (!set_tph.flags)
> > + return -EINVAL;
> > +
> > + /* PCIe TLP Processing Hint is a 2-bit field. */
> > + if (set_tph.ph & ~0x3)
> > + return -EINVAL;
> > +
> > + dmabuf = dma_buf_get(set_tph.dmabuf_fd);
> > + if (IS_ERR(dmabuf))
> > + return PTR_ERR(dmabuf);
> > +
> > + if (dmabuf->ops != &vfio_pci_dmabuf_ops) {
> > + ret = -EINVAL;
> > + goto out_put;
> > + }
> > +
> > + priv = dmabuf->priv;
> > + down_write(&vdev->memory_lock);
> > + if (READ_ONCE(priv->vdev) != vdev) {
> > + ret = -EINVAL;
> > + goto out_unlock;
> > + }
> > +
> > + /*
> > + * TPH metadata is write-once per dma-buf so that lockless readers only
> > + * have to observe a single release-published transition from 0 -> flags.
> > + */
> > + if (READ_ONCE(priv->tph_flags)) {
> > + ret = -EBUSY;
> > + goto out_unlock;
> > + }
> > +
> > + priv->steering_tag = set_tph.steering_tag;
> > + priv->steering_tag_ext = set_tph.steering_tag_ext;
> > + priv->ph = set_tph.ph;
> > + /*
> > + * Publish the TPH values before the gate flag, so that lockless
> > + * readers in vfio_pci_dma_buf_get_tph() see fully-initialized
> > + * fields once they observe a non-zero tph_flags.
> > + */
> > + smp_store_release(&priv->tph_flags, set_tph.flags);
> > + ret = 0;
> > +
> > +out_unlock:
> > + up_write(&vdev->memory_lock);
> > +out_put:
> > + dma_buf_put(dmabuf);
> > + return ret;
> > +}
> > +
> > void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
> > {
> > struct vfio_pci_dma_buf *priv;
> > @@ -388,7 +508,7 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
> >
> > dma_resv_lock(priv->dmabuf->resv, NULL);
> > list_del_init(&priv->dmabufs_elm);
> > - priv->vdev = NULL;
> > + WRITE_ONCE(priv->vdev, NULL);
> > priv->revoked = true;
> > dma_buf_invalidate_mappings(priv->dmabuf);
> > dma_resv_wait_timeout(priv->dmabuf->resv,
> > diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
> > index fca9d0dfac90..c58f369be4b3 100644
> > --- a/drivers/vfio/pci/vfio_pci_priv.h
> > +++ b/drivers/vfio/pci/vfio_pci_priv.h
> > @@ -118,6 +118,10 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
> > int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> > struct vfio_device_feature_dma_buf __user *arg,
> > size_t argsz);
> > +int vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev,
> > + u32 flags,
> > + struct vfio_device_feature_dma_buf_tph __user *arg,
> > + size_t argsz);
> > void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev);
> > void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked);
> > #else
> > @@ -128,6 +132,14 @@ vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> > {
> > return -ENOTTY;
> > }
> > +
> > +static inline int
> > +vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev, u32 flags,
> > + struct vfio_device_feature_dma_buf_tph __user *arg,
> > + size_t argsz)
> > +{
> > + return -ENOTTY;
> > +}
> > static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
> > {
> > }
> > diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
> > index d1203da56fc5..49eb6ad644a2 100644
> > --- a/include/linux/dma-buf.h
> > +++ b/include/linux/dma-buf.h
> > @@ -113,6 +113,27 @@ struct dma_buf_ops {
> > */
> > void (*unpin)(struct dma_buf_attachment *attach);
> >
> > + /**
> > + * @get_tph:
> > + * @dmabuf: DMA buffer for which to retrieve TPH metadata
> > + * @steering_tag: Returns the raw TPH steering tag for @st_width
> > + * @ph: Returns the TPH processing hint (2-bit value)
> > + * @st_width: Consumer's supported steering tag width in bits (8 or 16)
> > + *
> > + * Return the TPH (TLP Processing Hints) metadata associated with this
> > + * DMA buffer for the requested steering-tag width. 8-bit ST and 16-bit
> > + * Extended ST are distinct namespaces in the PCIe TPH ST table and may
> > + * both be present with different values, so the exporter must select the
> > + * value that matches @st_width and must not substitute one for the other.
> > + *
> > + * Return 0 on success, -EOPNOTSUPP if no metadata is available for the
> > + * requested width, or -EINVAL if @st_width is not 8 or 16.
> > + *
> > + * This callback is optional.
> > + */
> > + int (*get_tph)(struct dma_buf *dmabuf, u16 *steering_tag, u8 *ph,
> > + u8 st_width);
> > +
> > /**
> > * @map_dma_buf:
> > *
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index 5de618a3a5ee..a9cb6cbc6ade 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -1534,6 +1534,41 @@ struct vfio_device_feature_dma_buf {
> > */
> > #define VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2 12
> >
> > +/**
> > + * Upon VFIO_DEVICE_FEATURE_SET associate TPH (TLP Processing Hints) metadata
> > + * with a vfio-exported dma-buf. The dma-buf must have been created by
> > + * VFIO_DEVICE_FEATURE_DMA_BUF on this device.
> > + *
> > + * dmabuf_fd is the file descriptor returned by VFIO_DEVICE_FEATURE_DMA_BUF.
> > + *
> > + * 8-bit ST (steering_tag) and 16-bit Extended ST (steering_tag_ext) are
> > + * distinct namespaces in the PCIe TPH ST table and may both be present with
> > + * different values. Userspace should populate the value(s) it has from the
> > + * firmware ST table for this device and set the matching VFIO_DMA_BUF_TPH_ST /
> > + * VFIO_DMA_BUF_TPH_ST_EXT bit in @flags. An importer requests a specific
> > + * width and receives the matching value; if the requested width is not
> > + * present, the importer is told TPH is unavailable for this dma-buf.
> > + *
> > + * ph is the 2-bit TLP Processing Hint and must be in the range [0, 3].
> > + *
> > + * The user must set TPH on the dma-buf before the importer consumes it.
> > + * TPH metadata is write-once per dma-buf; a second SET returns -EBUSY.
> > + *
> > + * Return: 0 on success, -errno on failure.
> > + */
> > +#define VFIO_DEVICE_FEATURE_DMA_BUF_TPH 13
> > +
> > +#define VFIO_DMA_BUF_TPH_ST (1 << 0) /* steering_tag valid */
> > +#define VFIO_DMA_BUF_TPH_ST_EXT (1 << 1) /* steering_tag_ext valid */
> > +
> > +struct vfio_device_feature_dma_buf_tph {
> > + __s32 dmabuf_fd;
> > + __u32 flags;
> > + __u8 steering_tag;
> > + __u8 ph;
> > + __u16 steering_tag_ext;
> > +};
>
> Sure is tempting to make the ph field the first 2-bits of u8 flags.
> Thanks,
>
> Alex
>
> > +
> > /* -------- API for Type1 VFIO IOMMU -------- */
> >
> > /**
>
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [PATCH v4 1/3] vfio: add dma-buf get_tph callback and DMA_BUF_TPH feature
2026-05-21 22:24 ` Alex Williamson
@ 2026-05-23 1:03 ` Zhiping Zhang
0 siblings, 0 replies; 14+ messages in thread
From: Zhiping Zhang @ 2026-05-23 1:03 UTC (permalink / raw)
To: Alex Williamson
Cc: Jason Gunthorpe, Leon Romanovsky, Bjorn Helgaas, kvm, linux-rdma,
linux-pci, netdev, dri-devel, Keith Busch, Yochai Cohen,
Yishai Hadas
On Thu, May 21, 2026 at 3:24 PM Alex Williamson <alex@shazbot.org> wrote:
>
> >
> On Thu, 21 May 2026 16:04:12 -0600
> Alex Williamson <alex@shazbot.org> wrote:
>
> > On Tue, 19 May 2026 13:13:49 -0700
> > Zhiping Zhang <zhipingz@meta.com> wrote:
> >
> > > Add a dma-buf get_tph callback for exporters to return TPH
> > > (TLP Processing Hints) metadata, and add VFIO_DEVICE_FEATURE_DMA_BUF_TPH
> > > so userspace can attach that metadata to a VFIO-exported dma-buf.
> >
> > This should be two patches, the first extending the dma-buf framework
> > for the get_tph callback for explicit approval from dma-buf maintainers
> > (who are not even copied here). The second the vfio-pci implementation
> > of get_tph.
> >
> > > 8-bit ST and 16-bit Extended ST are distinct PCIe TPH namespaces; the
> > > uAPI carries both with explicit validity flags so importers get the
> > > value matching their requested width. SET is write-once per dma-buf;
> > > the existing VFIO_DEVICE_FEATURE_DMA_BUF uAPI is unchanged.
> >
> > I didn't see what motivated this write-once change, I thought we
> > understood that it was a userspace problem that the tph values need to
> > be set before providing the dma-buf fd to the importer and that races
> > relative to that are a userspace ordering problem. Write-once seems
> > unnecessarily restrictive and there's no justification provided here.
> >
> > > Signed-off-by: Zhiping Zhang <zhipingz@meta.com>
> > > ---
> > > drivers/vfio/pci/vfio_pci_core.c | 3 +
> > > drivers/vfio/pci/vfio_pci_dmabuf.c | 134 +++++++++++++++++++++++++++--
> > > drivers/vfio/pci/vfio_pci_priv.h | 12 +++
> > > include/linux/dma-buf.h | 21 +++++
> > > include/uapi/linux/vfio.h | 35 ++++++++
> > > 5 files changed, 198 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> > > index 3f8d093aacf8..94aa6dd95701 100644
> > > --- a/drivers/vfio/pci/vfio_pci_core.c
> > > +++ b/drivers/vfio/pci/vfio_pci_core.c
> > > @@ -1534,6 +1534,9 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
> > > return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
> > > case VFIO_DEVICE_FEATURE_DMA_BUF:
> > > return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
> > > + case VFIO_DEVICE_FEATURE_DMA_BUF_TPH:
> > > + return vfio_pci_core_feature_dma_buf_tph(vdev, flags, arg,
> > > + argsz);
> > > default:
> > > return -ENOTTY;
> > > }
> > > diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
> > > index f87fd32e4a01..be1c65385670 100644
> > > --- a/drivers/vfio/pci/vfio_pci_dmabuf.c
> > > +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> > > @@ -19,7 +19,24 @@ struct vfio_pci_dma_buf {
> > > u32 nr_ranges;
> > > struct kref kref;
> > > struct completion comp;
> > > - u8 revoked : 1;
> > > + /*
> > > + * TPH metadata published by VFIO_DEVICE_FEATURE_DMA_BUF_TPH and
> > > + * consumed by the @get_tph dma-buf callback.
> > > + *
> > > + * @tph_flags is the publish/consume gate: writers populate
> > > + * @steering_tag, @steering_tag_ext and @ph first, then store
> > > + * @tph_flags with smp_store_release(); readers do
> > > + * smp_load_acquire(&tph_flags) before accessing the value fields.
> > > + * @tph_flags == 0 means "TPH not set". Writers publish a non-zero
> > > + * value only once per dma-buf and serialize via vdev->memory_lock;
> > > + * readers stay lockless to avoid AB-BA against the dma_resv_lock held
> > > + * by importers.
> > > + */
> >
> > Can you outline the ABBA hazard, I'm not seeing it. You're acquiring
> > memory_lock in the feature SET and dma_resv_lock doesn't appear to be
> > held when calling .get_tph(). There's a lot of lockless complication
> > here balanced on this claim of avoiding a hazard that doesn't appear
> > present.
> >
> > > + u32 tph_flags;
> > > + u16 steering_tag_ext;
> > > + u8 steering_tag;
> > > + u8 ph;
> > > + bool revoked;
> >
> > If we still used memory_lock for tph, these could be:
> >
> > u8 tph_st_valid:1; /* memory_lock */
> > u8 tph_st_ext_valid:1; /* memory_lock */
> > u8 tph_ph:2; /* memory_lock */
> > u8 tph_st;
> > u16 tph_st_ext;
> > u8 revoked:1; /* dma_resv_lock */
> >
> > The existing change of @revoked from bitfield to bool has no rationale
> > noted for it in the commit log.
>
> On second thought, what dependency does anything here have on
> memory_lock? I think we're jumping through hoops to avoid a lock we
> don't even need. If we just want to serialize SET vs get_tph we could
> have a mutex on the dma-buf structure, or use RCU if we want to manage
> it locklessly and make sure get_tph always sees a fully consistent set
> of values. Thanks,
>
> Alex
Agreed, we don't need memory_lock in this path. For v5 I'll instead add a
struct mutex lock to struct vfio_pci_dma_buf and take it in SET,
get_tph,
and around the priv->vdev = NULL store in cleanup.
Thanks,
Zhiping
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v4 1/3] vfio: add dma-buf get_tph callback and DMA_BUF_TPH feature
2026-05-21 22:04 ` Alex Williamson
2026-05-21 22:24 ` Alex Williamson
@ 2026-05-22 23:53 ` Zhiping Zhang
1 sibling, 0 replies; 14+ messages in thread
From: Zhiping Zhang @ 2026-05-22 23:53 UTC (permalink / raw)
To: Alex Williamson
Cc: Jason Gunthorpe, Leon Romanovsky, Bjorn Helgaas, kvm, linux-rdma,
linux-pci, netdev, dri-devel, Keith Busch, Yochai Cohen,
Yishai Hadas
On Thu, May 21, 2026 at 3:04 PM Alex Williamson <alex@shazbot.org> wrote:
>
> >
> On Tue, 19 May 2026 13:13:49 -0700
> Zhiping Zhang <zhipingz@meta.com> wrote:
>
> > Add a dma-buf get_tph callback for exporters to return TPH
> > (TLP Processing Hints) metadata, and add VFIO_DEVICE_FEATURE_DMA_BUF_TPH
> > so userspace can attach that metadata to a VFIO-exported dma-buf.
>
> This should be two patches, the first extending the dma-buf framework
> for the get_tph callback for explicit approval from dma-buf maintainers
> (who are not even copied here). The second the vfio-pci implementation
> of get_tph.
Agreed, let me split. v5 will have:
1/2 dma-buf: add optional get_tph() callback
2/2 vfio/pci: implement get_tph and VFIO_DEVICE_FEATURE_DMA_BUF_TPH
I will also add Sumit Semwal and Christian König, the dma-buf maintainers.
>
> > 8-bit ST and 16-bit Extended ST are distinct PCIe TPH namespaces; the
> > uAPI carries both with explicit validity flags so importers get the
> > value matching their requested width. SET is write-once per dma-buf;
> > the existing VFIO_DEVICE_FEATURE_DMA_BUF uAPI is unchanged.
>
> I didn't see what motivated this write-once change, I thought we
> understood that it was a userspace problem that the tph values need to
> be set before providing the dma-buf fd to the importer and that races
> relative to that are a userspace ordering problem. Write-once seems
> unnecessarily restrictive and there's no justification provided here.
Got it, yes the "set TPH before handing the fd to the importer" contract is a
userspace ordering problem. I'll drop write-once. I'll allow SET to
overwrite and
document the ordering requirement in the uAPI comment instead.
>
> > Signed-off-by: Zhiping Zhang <zhipingz@meta.com>
> > ---
> > drivers/vfio/pci/vfio_pci_core.c | 3 +
> > drivers/vfio/pci/vfio_pci_dmabuf.c | 134 +++++++++++++++++++++++++++--
> > drivers/vfio/pci/vfio_pci_priv.h | 12 +++
> > include/linux/dma-buf.h | 21 +++++
> > include/uapi/linux/vfio.h | 35 ++++++++
> > 5 files changed, 198 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> > index 3f8d093aacf8..94aa6dd95701 100644
> > --- a/drivers/vfio/pci/vfio_pci_core.c
> > +++ b/drivers/vfio/pci/vfio_pci_core.c
> > @@ -1534,6 +1534,9 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
> > return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
> > case VFIO_DEVICE_FEATURE_DMA_BUF:
> > return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
> > + case VFIO_DEVICE_FEATURE_DMA_BUF_TPH:
> > + return vfio_pci_core_feature_dma_buf_tph(vdev, flags, arg,
> > + argsz);
> > default:
> > return -ENOTTY;
> > }
> > diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
> > index f87fd32e4a01..be1c65385670 100644
> > --- a/drivers/vfio/pci/vfio_pci_dmabuf.c
> > +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> > @@ -19,7 +19,24 @@ struct vfio_pci_dma_buf {
> > u32 nr_ranges;
> > struct kref kref;
> > struct completion comp;
> > - u8 revoked : 1;
> > + /*
> > + * TPH metadata published by VFIO_DEVICE_FEATURE_DMA_BUF_TPH and
> > + * consumed by the @get_tph dma-buf callback.
> > + *
> > + * @tph_flags is the publish/consume gate: writers populate
> > + * @steering_tag, @steering_tag_ext and @ph first, then store
> > + * @tph_flags with smp_store_release(); readers do
> > + * smp_load_acquire(&tph_flags) before accessing the value fields.
> > + * @tph_flags == 0 means "TPH not set". Writers publish a non-zero
> > + * value only once per dma-buf and serialize via vdev->memory_lock;
> > + * readers stay lockless to avoid AB-BA against the dma_resv_lock held
> > + * by importers.
> > + */
>
> Can you outline the ABBA hazard, I'm not seeing it. You're acquiring
> memory_lock in the feature SET and dma_resv_lock doesn't appear to be
> held when calling .get_tph(). There's a lot of lockless complication
> here balanced on this claim of avoiding a hazard that doesn't appear
> present.
You're right: the release/acquire scheme is solving a problem that
doesn't exist.
v5 will drop it; see the reply to your follow-up for the replacement.
>
> > + u32 tph_flags;
> > + u16 steering_tag_ext;
> > + u8 steering_tag;
> > + u8 ph;
> > + bool revoked;
>
> If we still used memory_lock for tph, these could be:
>
> u8 tph_st_valid:1; /* memory_lock */
> u8 tph_st_ext_valid:1; /* memory_lock */
> u8 tph_ph:2; /* memory_lock */
> u8 tph_st;
> u16 tph_st_ext;
> u8 revoked:1; /* dma_resv_lock */
>
> The existing change of @revoked from bitfield to bool has no rationale
> noted for it in the commit log.
Will adopt the bitfield layout you suggested in v5, with the lock annotations.
>
> > };
> >
> > static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
> > @@ -69,6 +86,36 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
> > return ret;
> > }
> >
> > +static int vfio_pci_dma_buf_get_tph(struct dma_buf *dmabuf, u16 *steering_tag,
> > + u8 *ph, u8 st_width)
> > +{
> > + struct vfio_pci_dma_buf *priv = dmabuf->priv;
> > + u32 flags;
> > +
> > + /* Pair with the smp_store_release() in VFIO_DEVICE_FEATURE_DMA_BUF_TPH. */
> > + flags = smp_load_acquire(&priv->tph_flags);
> > + if (!flags)
> > + return -EOPNOTSUPP;
> > +
> > + switch (st_width) {
> > + case 8:
> > + if (!(flags & VFIO_DMA_BUF_TPH_ST))
> > + return -EOPNOTSUPP;
> > + *steering_tag = priv->steering_tag;
> > + break;
> > + case 16:
> > + if (!(flags & VFIO_DMA_BUF_TPH_ST_EXT))
> > + return -EOPNOTSUPP;
> > + *steering_tag = priv->steering_tag_ext;
> > + break;
> > + default:
> > + return -EINVAL;
> > + }
> > +
> > + *ph = priv->ph;
> > + return 0;
> > +}
> > +
> > static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
> > struct sg_table *sgt,
> > enum dma_data_direction dir)
> > @@ -84,16 +131,17 @@ static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
> > static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
> > {
> > struct vfio_pci_dma_buf *priv = dmabuf->priv;
> > + struct vfio_pci_core_device *vdev = READ_ONCE(priv->vdev);
> >
> > /*
> > * Either this or vfio_pci_dma_buf_cleanup() will remove from the list.
> > * The refcount prevents both.
> > */
> > - if (priv->vdev) {
> > - down_write(&priv->vdev->memory_lock);
> > + if (vdev) {
> > + down_write(&vdev->memory_lock);
> > list_del_init(&priv->dmabufs_elm);
> > - up_write(&priv->vdev->memory_lock);
> > - vfio_device_put_registration(&priv->vdev->vdev);
> > + up_write(&vdev->memory_lock);
> > + vfio_device_put_registration(&vdev->vdev);
> > }
> > kfree(priv->phys_vec);
> > kfree(priv);
>
>
> This seems unnecessary. I think this is just because priv->vdev is now
> (unnecessarily) set via WRITE_ONCE, right? These are very well ordered
> paths, prior to exposing the dma-buf, while the device is opened, during
> release, after release. They don't seem to need the READ/WRITE_ONCE
> treatment. This looks like noise from trying to make it lockless.
Got it, this is fallout from the lockless attempt. priv->vdev
transitions are already
well-ordered by memory_lock. I'll drop all the READ_ONCE/WRITE_ONCE on
priv->vdev in v5 and leave the existing accesses as they were.
>
>
> > @@ -101,6 +149,7 @@ static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
> >
> > static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
> > .attach = vfio_pci_dma_buf_attach,
> > + .get_tph = vfio_pci_dma_buf_get_tph,
> > .map_dma_buf = vfio_pci_dma_buf_map,
> > .unmap_dma_buf = vfio_pci_dma_buf_unmap,
> > .release = vfio_pci_dma_buf_release,
> > @@ -269,7 +318,7 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> > goto err_free_priv;
> > }
> >
> > - priv->vdev = vdev;
> > + WRITE_ONCE(priv->vdev, vdev);
> > priv->nr_ranges = get_dma_buf.nr_ranges;
> > priv->size = length;
> > ret = vdev->pci_ops->get_dmabuf_phys(vdev, &priv->provider,
> > @@ -331,6 +380,77 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> > return ret;
> > }
> >
> > +int vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev,
> > + u32 flags,
> > + struct vfio_device_feature_dma_buf_tph __user *arg,
> > + size_t argsz)
> > +{
> > + struct vfio_device_feature_dma_buf_tph set_tph;
> > + struct vfio_pci_dma_buf *priv;
> > + struct dma_buf *dmabuf;
> > + int ret;
> > +
> > + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
> > + sizeof(set_tph));
> > + if (ret != 1)
> > + return ret;
> > +
> > + if (copy_from_user(&set_tph, arg, sizeof(set_tph)))
> > + return -EFAULT;
> > +
> > + if (set_tph.flags & ~(VFIO_DMA_BUF_TPH_ST | VFIO_DMA_BUF_TPH_ST_EXT))
> > + return -EINVAL;
> > +
> > + if (!set_tph.flags)
> > + return -EINVAL;
> > +
> > + /* PCIe TLP Processing Hint is a 2-bit field. */
> > + if (set_tph.ph & ~0x3)
> > + return -EINVAL;
> > +
> > + dmabuf = dma_buf_get(set_tph.dmabuf_fd);
> > + if (IS_ERR(dmabuf))
> > + return PTR_ERR(dmabuf);
> > +
> > + if (dmabuf->ops != &vfio_pci_dmabuf_ops) {
> > + ret = -EINVAL;
> > + goto out_put;
> > + }
> > +
> > + priv = dmabuf->priv;
> > + down_write(&vdev->memory_lock);
> > + if (READ_ONCE(priv->vdev) != vdev) {
> > + ret = -EINVAL;
> > + goto out_unlock;
> > + }
> > +
> > + /*
> > + * TPH metadata is write-once per dma-buf so that lockless readers only
> > + * have to observe a single release-published transition from 0 -> flags.
> > + */
> > + if (READ_ONCE(priv->tph_flags)) {
> > + ret = -EBUSY;
> > + goto out_unlock;
> > + }
> > +
> > + priv->steering_tag = set_tph.steering_tag;
> > + priv->steering_tag_ext = set_tph.steering_tag_ext;
> > + priv->ph = set_tph.ph;
> > + /*
> > + * Publish the TPH values before the gate flag, so that lockless
> > + * readers in vfio_pci_dma_buf_get_tph() see fully-initialized
> > + * fields once they observe a non-zero tph_flags.
> > + */
> > + smp_store_release(&priv->tph_flags, set_tph.flags);
> > + ret = 0;
> > +
> > +out_unlock:
> > + up_write(&vdev->memory_lock);
> > +out_put:
> > + dma_buf_put(dmabuf);
> > + return ret;
> > +}
> > +
> > void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
> > {
> > struct vfio_pci_dma_buf *priv;
> > @@ -388,7 +508,7 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
> >
> > dma_resv_lock(priv->dmabuf->resv, NULL);
> > list_del_init(&priv->dmabufs_elm);
> > - priv->vdev = NULL;
> > + WRITE_ONCE(priv->vdev, NULL);
> > priv->revoked = true;
> > dma_buf_invalidate_mappings(priv->dmabuf);
> > dma_resv_wait_timeout(priv->dmabuf->resv,
> > diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
> > index fca9d0dfac90..c58f369be4b3 100644
> > --- a/drivers/vfio/pci/vfio_pci_priv.h
> > +++ b/drivers/vfio/pci/vfio_pci_priv.h
> > @@ -118,6 +118,10 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
> > int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> > struct vfio_device_feature_dma_buf __user *arg,
> > size_t argsz);
> > +int vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev,
> > + u32 flags,
> > + struct vfio_device_feature_dma_buf_tph __user *arg,
> > + size_t argsz);
> > void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev);
> > void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked);
> > #else
> > @@ -128,6 +132,14 @@ vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> > {
> > return -ENOTTY;
> > }
> > +
> > +static inline int
> > +vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev, u32 flags,
> > + struct vfio_device_feature_dma_buf_tph __user *arg,
> > + size_t argsz)
> > +{
> > + return -ENOTTY;
> > +}
> > static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
> > {
> > }
> > diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
> > index d1203da56fc5..49eb6ad644a2 100644
> > --- a/include/linux/dma-buf.h
> > +++ b/include/linux/dma-buf.h
> > @@ -113,6 +113,27 @@ struct dma_buf_ops {
> > */
> > void (*unpin)(struct dma_buf_attachment *attach);
> >
> > + /**
> > + * @get_tph:
> > + * @dmabuf: DMA buffer for which to retrieve TPH metadata
> > + * @steering_tag: Returns the raw TPH steering tag for @st_width
> > + * @ph: Returns the TPH processing hint (2-bit value)
> > + * @st_width: Consumer's supported steering tag width in bits (8 or 16)
> > + *
> > + * Return the TPH (TLP Processing Hints) metadata associated with this
> > + * DMA buffer for the requested steering-tag width. 8-bit ST and 16-bit
> > + * Extended ST are distinct namespaces in the PCIe TPH ST table and may
> > + * both be present with different values, so the exporter must select the
> > + * value that matches @st_width and must not substitute one for the other.
> > + *
> > + * Return 0 on success, -EOPNOTSUPP if no metadata is available for the
> > + * requested width, or -EINVAL if @st_width is not 8 or 16.
> > + *
> > + * This callback is optional.
> > + */
> > + int (*get_tph)(struct dma_buf *dmabuf, u16 *steering_tag, u8 *ph,
> > + u8 st_width);
> > +
> > /**
> > * @map_dma_buf:
> > *
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index 5de618a3a5ee..a9cb6cbc6ade 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -1534,6 +1534,41 @@ struct vfio_device_feature_dma_buf {
> > */
> > #define VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2 12
> >
> > +/**
> > + * Upon VFIO_DEVICE_FEATURE_SET associate TPH (TLP Processing Hints) metadata
> > + * with a vfio-exported dma-buf. The dma-buf must have been created by
> > + * VFIO_DEVICE_FEATURE_DMA_BUF on this device.
> > + *
> > + * dmabuf_fd is the file descriptor returned by VFIO_DEVICE_FEATURE_DMA_BUF.
> > + *
> > + * 8-bit ST (steering_tag) and 16-bit Extended ST (steering_tag_ext) are
> > + * distinct namespaces in the PCIe TPH ST table and may both be present with
> > + * different values. Userspace should populate the value(s) it has from the
> > + * firmware ST table for this device and set the matching VFIO_DMA_BUF_TPH_ST /
> > + * VFIO_DMA_BUF_TPH_ST_EXT bit in @flags. An importer requests a specific
> > + * width and receives the matching value; if the requested width is not
> > + * present, the importer is told TPH is unavailable for this dma-buf.
> > + *
> > + * ph is the 2-bit TLP Processing Hint and must be in the range [0, 3].
> > + *
> > + * The user must set TPH on the dma-buf before the importer consumes it.
> > + * TPH metadata is write-once per dma-buf; a second SET returns -EBUSY.
> > + *
> > + * Return: 0 on success, -errno on failure.
> > + */
> > +#define VFIO_DEVICE_FEATURE_DMA_BUF_TPH 13
> > +
> > +#define VFIO_DMA_BUF_TPH_ST (1 << 0) /* steering_tag valid */
> > +#define VFIO_DMA_BUF_TPH_ST_EXT (1 << 1) /* steering_tag_ext valid */
> > +
> > +struct vfio_device_feature_dma_buf_tph {
> > + __s32 dmabuf_fd;
> > + __u32 flags;
> > + __u8 steering_tag;
> > + __u8 ph;
> > + __u16 steering_tag_ext;
> > +};
>
> Sure is tempting to make the ph field the first 2-bits of u8 flags.
I went back and worked through the layout both ways and I'd actually
like to keep ph as
its own field. I think the separate ph field reads better and costs nothing.
> Thanks,
>
> Alex
Thanks,
Zhiping
^ permalink raw reply [flat|nested] 14+ messages in thread
* Claude review: vfio: add dma-buf get_tph callback and DMA_BUF_TPH feature
2026-05-19 20:13 ` [PATCH v4 1/3] vfio: add dma-buf get_tph callback and DMA_BUF_TPH feature Zhiping Zhang
2026-05-21 22:04 ` Alex Williamson
@ 2026-05-25 12:27 ` Claude Code Review Bot
1 sibling, 0 replies; 14+ messages in thread
From: Claude Code Review Bot @ 2026-05-25 12:27 UTC (permalink / raw)
To: dri-devel-reviews
Patch Review
**dma-buf callback design (include/linux/dma-buf.h)**
The `get_tph` callback API is clean. The documentation clearly specifies that 8-bit and 16-bit ST are distinct namespaces. One observation:
```c
int (*get_tph)(struct dma_buf *dmabuf, u16 *steering_tag, u8 *ph,
u8 st_width);
```
The `steering_tag` output is `u16 *` but for `st_width == 8` the caller only expects an 8-bit value. This is fine since the VFIO implementation stores `u8 steering_tag` and will only write the low byte, but it means the implementation must be careful. Looking at the actual implementation:
```c
case 8:
if (!(flags & VFIO_DMA_BUF_TPH_ST))
return -EOPNOTSUPP;
*steering_tag = priv->steering_tag;
break;
case 16:
if (!(flags & VFIO_DMA_BUF_TPH_ST_EXT))
return -EOPNOTSUPP;
*steering_tag = priv->steering_tag_ext;
break;
```
This correctly returns the right-width value via the `u16` pointer. Good.
**Write-once semantics and memory ordering (vfio_pci_dmabuf.c)**
The release/acquire pair is correctly used:
```c
/* Writer: */
priv->steering_tag = set_tph.steering_tag;
priv->steering_tag_ext = set_tph.steering_tag_ext;
priv->ph = set_tph.ph;
smp_store_release(&priv->tph_flags, set_tph.flags);
/* Reader: */
flags = smp_load_acquire(&priv->tph_flags);
```
The write-once enforcement via `READ_ONCE(priv->tph_flags)` check returning `-EBUSY` under `memory_lock` is correct.
**READ_ONCE/WRITE_ONCE on priv->vdev**
The conversion of plain accesses to `READ_ONCE`/`WRITE_ONCE` for `priv->vdev` is a reasonable annotation improvement, but there's a subtlety: in `vfio_pci_dma_buf_release`, the code does:
```c
struct vfio_pci_core_device *vdev = READ_ONCE(priv->vdev);
if (vdev) {
down_write(&vdev->memory_lock);
```
The original code didn't use `READ_ONCE` here either, and this path races with `vfio_pci_dma_buf_cleanup` which does `WRITE_ONCE(priv->vdev, NULL)` under `dma_resv_lock`. The `READ_ONCE` prevents compiler tearing/re-reading but doesn't add synchronization. The existing `kref` protects against double-cleanup, so this is still correct — just worth noting that `READ_ONCE`/`WRITE_ONCE` here are documentation of the existing design rather than a fix.
**uAPI struct (include/uapi/linux/vfio.h)**
```c
struct vfio_device_feature_dma_buf_tph {
__s32 dmabuf_fd;
__u32 flags;
__u8 steering_tag;
__u8 ph;
__u16 steering_tag_ext;
};
```
This is 12 bytes total with natural alignment (no holes). Feature number 13 is correctly the next available. The struct has no padding or reserved fields — that's fine for a write-only SET-only feature, but if GET support were ever desired, there'd be no room for extension. Given the write-once semantics, this seems intentional.
**Minor**: The `revoked` field changed from `u8 revoked : 1` to `bool revoked`. This is a clean-up unrelated to TPH — consider splitting it out or at least mentioning it in the commit message.
---
Generated by Claude Code Patch Reviewer
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH v4 2/3] PCI/TPH: expose the enabled TPH requester type
2026-05-19 20:13 [PATCH v4 0/3] vfio/dma-buf: add TPH support for peer-to-peer access Zhiping Zhang
2026-05-19 20:13 ` [PATCH v4 1/3] vfio: add dma-buf get_tph callback and DMA_BUF_TPH feature Zhiping Zhang
@ 2026-05-19 20:13 ` Zhiping Zhang
2026-05-25 12:27 ` Claude review: " Claude Code Review Bot
2026-05-19 20:13 ` [PATCH v4 3/3] RDMA/mlx5: get tph for p2p access when registering dma-buf mr Zhiping Zhang
2026-05-25 12:27 ` Claude review: vfio/dma-buf: add TPH support for peer-to-peer access Claude Code Review Bot
3 siblings, 1 reply; 14+ messages in thread
From: Zhiping Zhang @ 2026-05-19 20:13 UTC (permalink / raw)
To: Alex Williamson, Jason Gunthorpe, Leon Romanovsky
Cc: Bjorn Helgaas, kvm, linux-rdma, linux-pci, netdev, dri-devel,
Keith Busch, Yochai Cohen, Yishai Hadas, Zhiping Zhang
Add pcie_tph_enabled_req_type() so drivers can query the enabled TPH
requester mode without reaching into pci_dev internals. A
!CONFIG_PCIE_TPH stub returns PCI_TPH_REQ_DISABLE so callers need no
ifdef.
Signed-off-by: Zhiping Zhang <zhipingz@meta.com>
---
drivers/pci/tph.c | 12 ++++++++++++
include/linux/pci-tph.h | 2 ++
2 files changed, 14 insertions(+)
diff --git a/drivers/pci/tph.c b/drivers/pci/tph.c
index 91145e8d9d95..6c4492075ae9 100644
--- a/drivers/pci/tph.c
+++ b/drivers/pci/tph.c
@@ -174,6 +174,18 @@ u32 pcie_tph_get_st_table_loc(struct pci_dev *pdev)
}
EXPORT_SYMBOL(pcie_tph_get_st_table_loc);
+/**
+ * pcie_tph_enabled_req_type - Return the device's enabled TPH requester type
+ * @pdev: PCI device to query
+ *
+ * Return: PCI_TPH_REQ_DISABLE, PCI_TPH_REQ_TPH_ONLY or PCI_TPH_REQ_EXT_TPH.
+ */
+u8 pcie_tph_enabled_req_type(struct pci_dev *pdev)
+{
+ return pdev->tph_req_type;
+}
+EXPORT_SYMBOL(pcie_tph_enabled_req_type);
+
/*
* Return the size of ST table. If ST table is not in TPH Requester Extended
* Capability space, return 0. Otherwise return the ST Table Size + 1.
diff --git a/include/linux/pci-tph.h b/include/linux/pci-tph.h
index be68cd17f2f8..fe572737b409 100644
--- a/include/linux/pci-tph.h
+++ b/include/linux/pci-tph.h
@@ -30,6 +30,7 @@ void pcie_disable_tph(struct pci_dev *pdev);
int pcie_enable_tph(struct pci_dev *pdev, int mode);
u16 pcie_tph_get_st_table_size(struct pci_dev *pdev);
u32 pcie_tph_get_st_table_loc(struct pci_dev *pdev);
+u8 pcie_tph_enabled_req_type(struct pci_dev *pdev);
#else
static inline int pcie_tph_set_st_entry(struct pci_dev *pdev,
unsigned int index, u16 tag)
@@ -41,6 +42,7 @@ static inline int pcie_tph_get_cpu_st(struct pci_dev *dev,
static inline void pcie_disable_tph(struct pci_dev *pdev) { }
static inline int pcie_enable_tph(struct pci_dev *pdev, int mode)
{ return -EINVAL; }
+static inline u8 pcie_tph_enabled_req_type(struct pci_dev *pdev) { return 0; }
#endif
#endif /* LINUX_PCI_TPH_H */
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 14+ messages in thread* Claude review: PCI/TPH: expose the enabled TPH requester type
2026-05-19 20:13 ` [PATCH v4 2/3] PCI/TPH: expose the enabled TPH requester type Zhiping Zhang
@ 2026-05-25 12:27 ` Claude Code Review Bot
0 siblings, 0 replies; 14+ messages in thread
From: Claude Code Review Bot @ 2026-05-25 12:27 UTC (permalink / raw)
To: dri-devel-reviews
Patch Review
This is a straightforward accessor:
```c
u8 pcie_tph_enabled_req_type(struct pci_dev *pdev)
{
return pdev->tph_req_type;
}
EXPORT_SYMBOL(pcie_tph_enabled_req_type);
```
The `!CONFIG_PCIE_TPH` stub returns 0, which matches `PCI_TPH_REQ_DISABLE` (defined as `0x0` in `pci_regs.h`):
```c
static inline u8 pcie_tph_enabled_req_type(struct pci_dev *pdev) { return 0; }
```
Clean and correct. The `EXPORT_SYMBOL` (not `_GPL`) matches the existing `pcie_tph_get_st_table_loc` export convention.
**Nit**: The kdoc says "Return: PCI_TPH_REQ_DISABLE, PCI_TPH_REQ_TPH_ONLY or PCI_TPH_REQ_EXT_TPH" — note that `PCI_TPH_REQ_EXT_TPH` is `0x3` (not `0x2`), so value `0x2` is a gap. The function can technically return any value stored in `tph_req_type`, but the current `pcie_enable_tph` only sets these three. Fine as-is.
---
Generated by Claude Code Patch Reviewer
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH v4 3/3] RDMA/mlx5: get tph for p2p access when registering dma-buf mr
2026-05-19 20:13 [PATCH v4 0/3] vfio/dma-buf: add TPH support for peer-to-peer access Zhiping Zhang
2026-05-19 20:13 ` [PATCH v4 1/3] vfio: add dma-buf get_tph callback and DMA_BUF_TPH feature Zhiping Zhang
2026-05-19 20:13 ` [PATCH v4 2/3] PCI/TPH: expose the enabled TPH requester type Zhiping Zhang
@ 2026-05-19 20:13 ` Zhiping Zhang
2026-05-25 12:27 ` Claude review: " Claude Code Review Bot
2026-05-25 12:27 ` Claude review: vfio/dma-buf: add TPH support for peer-to-peer access Claude Code Review Bot
3 siblings, 1 reply; 14+ messages in thread
From: Zhiping Zhang @ 2026-05-19 20:13 UTC (permalink / raw)
To: Alex Williamson, Jason Gunthorpe, Leon Romanovsky
Cc: Bjorn Helgaas, kvm, linux-rdma, linux-pci, netdev, dri-devel,
Keith Busch, Yochai Cohen, Yishai Hadas, Zhiping Zhang
Query dma-buf TPH metadata when registering a dma-buf MR for
peer-to-peer access and translate the returned steering tag into an
mlx5 ST index. The DMAH path keeps priority; dma-buf metadata is the
fallback when no DMAH is supplied.
Track per-MR ownership of the allocated ST index and release it on MR
setup failure, destroy, and before re-entering the FRMR pool. Free
mlx5_st_idx_data when its refcount reaches zero to fix a pre-existing
leak in mlx5_st_dealloc_index().
Signed-off-by: Zhiping Zhang <zhipingz@meta.com>
---
drivers/infiniband/hw/mlx5/mlx5_ib.h | 6 ++
drivers/infiniband/hw/mlx5/mr.c | 86 ++++++++++++++++++-
.../net/ethernet/mellanox/mlx5/core/lib/st.c | 28 ++++--
include/linux/mlx5/driver.h | 7 ++
4 files changed, 115 insertions(+), 12 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index e156dc4d7529..4ab867392267 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -721,6 +721,12 @@ struct mlx5_ib_mr {
u8 revoked :1;
/* Indicates previous dmabuf page fault occurred */
u8 dmabuf_faulted:1;
+ /* Set when the MR owns dmabuf_st_index and must
+ * release it via mlx5_st_dealloc_index() once the
+ * firmware mkey is no longer referencing it.
+ */
+ u8 dmabuf_st_owned:1;
+ u16 dmabuf_st_index;
struct mlx5_ib_mkey null_mmkey;
};
};
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 3b6da45061a5..8059b5e4da97 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -38,6 +38,7 @@
#include <linux/delay.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
+#include <linux/pci-tph.h>
#include <rdma/frmr_pools.h>
#include <rdma/ib_umem_odp.h>
#include "dm.h"
@@ -46,6 +47,8 @@
#include "data_direct.h"
#include "dmah.h"
+MODULE_IMPORT_NS("DMA_BUF");
+
static int mkey_max_umr_order(struct mlx5_ib_dev *dev)
{
if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
@@ -899,6 +902,63 @@ static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
.invalidate_mappings = mlx5_ib_dmabuf_invalidate_cb,
};
+/*
+ * Query TPH metadata from @dmabuf and translate the raw steering tag into
+ * an mlx5 ST index. On success, returns 0 and the caller becomes the
+ * owner of *@st_index (must be released with mlx5_st_dealloc_index()
+ * once the firmware mkey no longer references it). On any failure
+ * *@st_index and *@ph are left as the no-TPH defaults set by the caller.
+ *
+ * @dmabuf must already be referenced by the caller (e.g. via the umem's
+ * attachment) so we don't re-resolve the user's fd here and avoid a
+ * dup2() TOCTOU between umem creation and TPH lookup.
+ */
+static void get_tph_mr_dmabuf(struct mlx5_ib_dev *dev, struct dma_buf *dmabuf,
+ u16 *st_index, u8 *ph)
+{
+ u8 req_type;
+ u16 steering_tag;
+ u8 st_width;
+ int ret;
+
+ if (!dmabuf->ops->get_tph)
+ return;
+
+ req_type = pcie_tph_enabled_req_type(dev->mdev->pdev);
+ switch (req_type) {
+ case PCI_TPH_REQ_TPH_ONLY:
+ st_width = 8;
+ break;
+ case PCI_TPH_REQ_EXT_TPH:
+ st_width = 16;
+ break;
+ default:
+ return;
+ }
+
+ ret = dmabuf->ops->get_tph(dmabuf, &steering_tag, ph, st_width);
+ if (ret) {
+ mlx5_ib_dbg(dev, "get_tph failed (%d)\n", ret);
+ *ph = MLX5_IB_NO_PH;
+ return;
+ }
+
+ ret = mlx5_st_alloc_index_by_tag(dev->mdev, steering_tag, st_index);
+ if (ret) {
+ *ph = MLX5_IB_NO_PH;
+ mlx5_ib_dbg(dev, "st_alloc_index_by_tag failed (%d)\n", ret);
+ }
+}
+
+static void mlx5_ib_mr_put_dmabuf_st(struct mlx5_ib_mr *mr)
+{
+ if (mr->umem && mr->dmabuf_st_owned) {
+ mlx5_st_dealloc_index(mr_to_mdev(mr)->mdev,
+ mr->dmabuf_st_index);
+ mr->dmabuf_st_owned = 0;
+ }
+}
+
static struct ib_mr *
reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
u64 offset, u64 length, u64 virt_addr,
@@ -941,16 +1001,26 @@ reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
ph = dmah->ph;
if (dmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS))
st_index = mdmah->st_index;
+ } else {
+ get_tph_mr_dmabuf(dev, umem_dmabuf->attach->dmabuf,
+ &st_index, &ph);
}
mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
access_flags, access_mode,
st_index, ph);
if (IS_ERR(mr)) {
+ if (!dmah && st_index != MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX)
+ mlx5_st_dealloc_index(dev->mdev, st_index);
ib_umem_release(&umem_dmabuf->umem);
return ERR_CAST(mr);
}
+ if (!dmah && st_index != MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX) {
+ mr->dmabuf_st_index = st_index;
+ mr->dmabuf_st_owned = 1;
+ }
+
mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
@@ -1377,9 +1447,17 @@ static int mlx5r_handle_mkey_cleanup(struct mlx5_ib_mr *mr)
bool is_odp = is_odp_mr(mr);
int ret;
- if (mr->ibmr.frmr.pool && !mlx5_umr_revoke_mr_with_lock(mr) &&
- !ib_frmr_pool_push(mr->ibmr.device, &mr->ibmr))
- return 0;
+ if (mr->ibmr.frmr.pool && !mlx5_umr_revoke_mr_with_lock(mr)) {
+ /*
+ * The mkey has been revoked: firmware no longer references
+ * dmabuf_st_index, so release it before this mr can re-enter
+ * the FRMR cache for reuse by another registration.
+ */
+ mlx5_ib_mr_put_dmabuf_st(mr);
+
+ if (!ib_frmr_pool_push(mr->ibmr.device, &mr->ibmr))
+ return 0;
+ }
if (is_odp)
mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);
@@ -1400,6 +1478,8 @@ static int mlx5r_handle_mkey_cleanup(struct mlx5_ib_mr *mr)
dma_resv_unlock(
to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
}
+ if (!ret)
+ mlx5_ib_mr_put_dmabuf_st(mr);
return ret;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c
index 997be91f0a13..8929c17c88bc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c
@@ -29,7 +29,7 @@ struct mlx5_st *mlx5_st_create(struct mlx5_core_dev *dev)
u8 direct_mode = 0;
u16 num_entries;
u32 tbl_loc;
- int ret;
+ int ret = 0;
if (!MLX5_CAP_GEN(dev, mkey_pcie_tph))
return NULL;
@@ -92,23 +92,18 @@ void mlx5_st_destroy(struct mlx5_core_dev *dev)
kfree(st);
}
-int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
- unsigned int cpu_uid, u16 *st_index)
+int mlx5_st_alloc_index_by_tag(struct mlx5_core_dev *dev, u16 tag,
+ u16 *st_index)
{
struct mlx5_st_idx_data *idx_data;
struct mlx5_st *st = dev->st;
unsigned long index;
u32 xa_id;
- u16 tag;
- int ret;
+ int ret = 0;
if (!st)
return -EOPNOTSUPP;
- ret = pcie_tph_get_cpu_st(dev->pdev, mem_type, cpu_uid, &tag);
- if (ret)
- return ret;
-
if (st->direct_mode) {
*st_index = tag;
return 0;
@@ -152,6 +147,20 @@ int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
mutex_unlock(&st->lock);
return ret;
}
+EXPORT_SYMBOL_GPL(mlx5_st_alloc_index_by_tag);
+
+int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
+ unsigned int cpu_uid, u16 *st_index)
+{
+ u16 tag;
+ int ret;
+
+ ret = pcie_tph_get_cpu_st(dev->pdev, mem_type, cpu_uid, &tag);
+ if (ret)
+ return ret;
+
+ return mlx5_st_alloc_index_by_tag(dev, tag, st_index);
+}
EXPORT_SYMBOL_GPL(mlx5_st_alloc_index);
int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index)
@@ -175,6 +184,7 @@ int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index)
if (refcount_dec_and_test(&idx_data->usecount)) {
xa_erase(&st->idx_xa, st_index);
+ kfree(idx_data);
/* We leave PCI config space as was before, no mkey will refer to it */
}
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 04b96c5abb57..523a9ab0ae1e 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1166,10 +1166,17 @@ int mlx5_dm_sw_icm_dealloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type
u64 length, u16 uid, phys_addr_t addr, u32 obj_id);
#ifdef CONFIG_PCIE_TPH
+int mlx5_st_alloc_index_by_tag(struct mlx5_core_dev *dev, u16 tag,
+ u16 *st_index);
int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
unsigned int cpu_uid, u16 *st_index);
int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index);
#else
+static inline int mlx5_st_alloc_index_by_tag(struct mlx5_core_dev *dev,
+ u16 tag, u16 *st_index)
+{
+ return -EOPNOTSUPP;
+}
static inline int mlx5_st_alloc_index(struct mlx5_core_dev *dev,
enum tph_mem_type mem_type,
unsigned int cpu_uid, u16 *st_index)
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 14+ messages in thread* Claude review: RDMA/mlx5: get tph for p2p access when registering dma-buf mr
2026-05-19 20:13 ` [PATCH v4 3/3] RDMA/mlx5: get tph for p2p access when registering dma-buf mr Zhiping Zhang
@ 2026-05-25 12:27 ` Claude Code Review Bot
0 siblings, 0 replies; 14+ messages in thread
From: Claude Code Review Bot @ 2026-05-25 12:27 UTC (permalink / raw)
To: dri-devel-reviews
Patch Review
**kfree leak fix (st.c)**
```c
if (refcount_dec_and_test(&idx_data->usecount)) {
xa_erase(&st->idx_xa, st_index);
+ kfree(idx_data);
/* We leave PCI config space as was before ... */
}
```
Confirmed — the current tree at `st.c:176-179` has `xa_erase` without `kfree`, leaking `idx_data`. This is a real pre-existing bug fix. Should arguably be a separate patch with its own Fixes: tag since it affects existing users of `mlx5_st_dealloc_index`, not just the new dma-buf path.
**Refactoring mlx5_st_alloc_index (st.c)**
The split into `mlx5_st_alloc_index_by_tag` + a thin wrapper `mlx5_st_alloc_index` is clean. The `int ret = 0` initialization silences the clang warning for the `direct_mode` early-return path where `ret` was previously uninitialized. The export is `_GPL`, matching existing conventions.
**get_tph_mr_dmabuf (mr.c)**
```c
req_type = pcie_tph_enabled_req_type(dev->mdev->pdev);
switch (req_type) {
case PCI_TPH_REQ_TPH_ONLY:
st_width = 8;
break;
case PCI_TPH_REQ_EXT_TPH:
st_width = 16;
break;
default:
return;
}
```
This correctly maps the enabled requester type to the ST width for the callback.
**Concern**: After `get_tph` fails, the code does:
```c
if (ret) {
mlx5_ib_dbg(dev, "get_tph failed (%d)\n", ret);
*ph = MLX5_IB_NO_PH;
return;
}
```
This resets `*ph` to `MLX5_IB_NO_PH` but `*st_index` remains `MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX` from the caller's initialization, so the caller won't attempt deallocation. Correct.
**ST index lifecycle (mr.c)**
The `dmabuf_st_owned` flag and `mlx5_ib_mr_put_dmabuf_st` cleanup function correctly handle ownership:
1. On `alloc_cacheable_mr` failure, immediate dealloc before returning error.
2. On success, ownership transferred to MR via `dmabuf_st_owned = 1`.
3. In `mlx5r_handle_mkey_cleanup`, the ST is released after the mkey is revoked but before the MR re-enters the FRMR pool — this ordering is critical and correct.
4. In the non-FRMR cleanup path, `mlx5_ib_mr_put_dmabuf_st` is called after successful mkey cleanup.
```c
if (mr->ibmr.frmr.pool && !mlx5_umr_revoke_mr_with_lock(mr)) {
mlx5_ib_mr_put_dmabuf_st(mr);
if (!ib_frmr_pool_push(mr->ibmr.device, &mr->ibmr))
return 0;
}
```
This ensures the ST index is freed before an MR gets recycled via the FRMR pool, preventing a reused MR from referencing a stale firmware ST entry. Good fix.
**MODULE_IMPORT_NS("DMA_BUF")**
```c
MODULE_IMPORT_NS("DMA_BUF");
```
Required for using dma_buf symbols. Correct.
**Bitfield placement (mlx5_ib.h)**
```c
u8 revoked :1;
u8 dmabuf_faulted:1;
+u8 dmabuf_st_owned:1;
+u16 dmabuf_st_index;
```
Adding these fields inside the anonymous struct within the union is fine, but the `u16 dmabuf_st_index` after three 1-bit `u8` bitfields may introduce padding depending on compiler/ABI. This should be checked — the anonymous struct contains other fields too (`null_mmkey`), so it likely has room, but it's worth verifying that the union size doesn't unexpectedly grow. Not a blocking issue.
**Summary of concerns:**
1. **(Minor)** The `kfree(idx_data)` leak fix in `st.c` is a pre-existing bug — consider splitting it into its own patch with a `Fixes:` tag so it can be backported independently.
2. **(Minor)** The `revoked` type change from bitfield to `bool` in patch 1 is unrelated to TPH and should be mentioned in the commit message.
3. **(Observation)** No `PROBE`/`GET` support for `VFIO_DEVICE_FEATURE_DMA_BUF_TPH` — userspace can't discover whether the feature is available. The `vfio_check_feature` call only allows `SET`. This may be intentional (probe via attempting SET, or probe via the existing DMA_BUF feature), but is worth documenting in the uAPI comment.
---
Generated by Claude Code Patch Reviewer
^ permalink raw reply [flat|nested] 14+ messages in thread
* Claude review: vfio/dma-buf: add TPH support for peer-to-peer access
2026-05-19 20:13 [PATCH v4 0/3] vfio/dma-buf: add TPH support for peer-to-peer access Zhiping Zhang
` (2 preceding siblings ...)
2026-05-19 20:13 ` [PATCH v4 3/3] RDMA/mlx5: get tph for p2p access when registering dma-buf mr Zhiping Zhang
@ 2026-05-25 12:27 ` Claude Code Review Bot
3 siblings, 0 replies; 14+ messages in thread
From: Claude Code Review Bot @ 2026-05-25 12:27 UTC (permalink / raw)
To: dri-devel-reviews
Overall Series Review
Subject: vfio/dma-buf: add TPH support for peer-to-peer access
Author: Zhiping Zhang <zhipingz@meta.com>
Patches: 8
Reviewed: 2026-05-25T22:27:11.474326
---
This is a well-structured v4 series adding TPH (TLP Processing Hints) support to the VFIO dma-buf export path, enabling importing drivers (mlx5) to use steering tags for peer-to-peer DMA. The series is cleanly split: patch 1 adds the dma-buf callback + VFIO uAPI, patch 2 adds a small PCI/TPH accessor, and patch 3 wires up the mlx5 consumer.
The overall design is sound — write-once TPH metadata with release/acquire ordering, proper fallback to DMAH path, and correct ST index lifetime management. The v4 changelog shows good responsiveness to review feedback (write-once semantics, kfree leak fix, ST index release ordering).
One correctness issue: in patch 3's `get_tph_mr_dmabuf`, the 16-bit ST case writes the returned `u16 steering_tag` into `*st_index` but never reports success/failure to the caller via a return value indicating ST ownership. While this is handled by checking `st_index != MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX` downstream, there is a type mismatch in the get_tph callback. See per-patch details below.
---
---
Generated by Claude Code Patch Reviewer
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH v3 1/2] vfio: add dma-buf get_tph callback and DMA_BUF_TPH feature
2026-05-12 18:47 [PATCH v3 0/2] " Zhiping Zhang
@ 2026-05-12 18:47 ` Zhiping Zhang
2026-05-16 3:07 ` Claude review: " Claude Code Review Bot
0 siblings, 1 reply; 14+ messages in thread
From: Zhiping Zhang @ 2026-05-12 18:47 UTC (permalink / raw)
To: Alex Williamson, Jason Gunthorpe, Leon Romanovsky
Cc: Bjorn Helgaas, kvm, linux-rdma, linux-pci, netdev, dri-devel,
Keith Busch, Yochai Cohen, Yishai Hadas, Zhiping Zhang
Add a dma-buf callback that returns raw TPH metadata from the exporter
so peer devices can reuse the steering tag and processing hint
associated with a VFIO-exported buffer. Add a new
VFIO_DEVICE_FEATURE_DMA_BUF_TPH ioctl that takes the fd from
VFIO_DEVICE_FEATURE_DMA_BUF along with the TPH values, validates the fd
is a vfio-exported dma-buf belonging to this device, and stores the TPH
metadata under memory_lock. The existing VFIO_DEVICE_FEATURE_DMA_BUF
uAPI is unchanged.
8-bit ST and 16-bit Extended ST are distinct namespaces in the PCIe TPH
ST table (firmware reports them as separate fields with separate
validity bits in the ACPI _DSM ST table), so the uAPI carries both
values along with a flags field that indicates which value(s) are
valid for this device. The exporter selects the value that matches the
importer's requested width and returns -EOPNOTSUPP if that width is
not present, instead of substituting a value across namespaces.
Publish the TPH fields under memory_lock and gate readers on a
release/acquire on the flags field; this lets get_tph() run lockless
and avoids inverting the memory_lock -> dma_resv_lock ordering set up
by vfio_pci_dma_buf_move(). Convert the @revoked bitfield to a plain bool
so concurrent updates of @revoked (under dma_resv_lock) and the new TPH
fields (under memory_lock) cannot race on a shared bitfield byte.
Signed-off-by: Zhiping Zhang <zhipingz@meta.com>
---
drivers/vfio/pci/vfio_pci_core.c | 3 +
drivers/vfio/pci/vfio_pci_dmabuf.c | 113 ++++++++++++++++++++++++++++-
drivers/vfio/pci/vfio_pci_priv.h | 11 +++
include/linux/dma-buf.h | 21 ++++++
include/uapi/linux/vfio.h | 35 +++++++++
5 files changed, 182 insertions(+), 1 deletion(-)
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 3f8d093aacf8..94aa6dd95701 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1534,6 +1534,9 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
case VFIO_DEVICE_FEATURE_DMA_BUF:
return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
+ case VFIO_DEVICE_FEATURE_DMA_BUF_TPH:
+ return vfio_pci_core_feature_dma_buf_tph(vdev, flags, arg,
+ argsz);
default:
return -ENOTTY;
}
diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
index f87fd32e4a01..28247602e359 100644
--- a/drivers/vfio/pci/vfio_pci_dmabuf.c
+++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
@@ -19,7 +19,23 @@ struct vfio_pci_dma_buf {
u32 nr_ranges;
struct kref kref;
struct completion comp;
- u8 revoked : 1;
+ /*
+ * TPH metadata published by VFIO_DEVICE_FEATURE_DMA_BUF_TPH and
+ * consumed by the @get_tph dma-buf callback.
+ *
+ * @tph_flags is the publish/consume gate: writers populate
+ * @steering_tag, @steering_tag_ext and @ph first, then store
+ * @tph_flags with smp_store_release(); readers do
+ * smp_load_acquire(&tph_flags) before accessing the value fields.
+ * @tph_flags == 0 means "TPH not set". Writers serialize via
+ * vdev->memory_lock; readers are lockless to avoid AB-BA against
+ * the dma_resv_lock held by importers.
+ */
+ u32 tph_flags;
+ u16 steering_tag;
+ u16 steering_tag_ext;
+ u8 ph;
+ bool revoked;
};
static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
@@ -69,6 +85,35 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
return ret;
}
+static int vfio_pci_dma_buf_get_tph(struct dma_buf *dmabuf, u16 *steering_tag,
+ u8 *ph, u8 st_width)
+{
+ struct vfio_pci_dma_buf *priv = dmabuf->priv;
+ u32 flags;
+
+ flags = smp_load_acquire(&priv->tph_flags);
+ if (!flags)
+ return -EOPNOTSUPP;
+
+ switch (st_width) {
+ case 8:
+ if (!(flags & VFIO_DMA_BUF_TPH_ST))
+ return -EOPNOTSUPP;
+ *steering_tag = priv->steering_tag;
+ break;
+ case 16:
+ if (!(flags & VFIO_DMA_BUF_TPH_ST_EXT))
+ return -EOPNOTSUPP;
+ *steering_tag = priv->steering_tag_ext;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ *ph = priv->ph;
+ return 0;
+}
+
static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
struct sg_table *sgt,
enum dma_data_direction dir)
@@ -101,6 +146,7 @@ static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
.attach = vfio_pci_dma_buf_attach,
+ .get_tph = vfio_pci_dma_buf_get_tph,
.map_dma_buf = vfio_pci_dma_buf_map,
.unmap_dma_buf = vfio_pci_dma_buf_unmap,
.release = vfio_pci_dma_buf_release,
@@ -331,6 +377,71 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
return ret;
}
+int vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev,
+ u32 flags,
+ struct vfio_device_feature_dma_buf_tph __user *arg,
+ size_t argsz)
+{
+ struct vfio_device_feature_dma_buf_tph set_tph;
+ struct vfio_pci_dma_buf *priv;
+ struct dma_buf *dmabuf;
+ int ret;
+
+ ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
+ sizeof(set_tph));
+ if (ret != 1)
+ return ret;
+
+ if (copy_from_user(&set_tph, arg, sizeof(set_tph)))
+ return -EFAULT;
+
+ if (set_tph.reserved[0] || set_tph.reserved[1] || set_tph.reserved[2])
+ return -EINVAL;
+
+ if (set_tph.flags & ~(VFIO_DMA_BUF_TPH_ST | VFIO_DMA_BUF_TPH_ST_EXT))
+ return -EINVAL;
+
+ if (!set_tph.flags)
+ return -EINVAL;
+
+ /* PCIe TLP Processing Hint is a 2-bit field. */
+ if (set_tph.ph & ~0x3)
+ return -EINVAL;
+
+ dmabuf = dma_buf_get(set_tph.dmabuf_fd);
+ if (IS_ERR(dmabuf))
+ return PTR_ERR(dmabuf);
+
+ if (dmabuf->ops != &vfio_pci_dmabuf_ops) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ priv = dmabuf->priv;
+ down_write(&vdev->memory_lock);
+ if (priv->vdev != vdev) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ priv->steering_tag = set_tph.steering_tag;
+ priv->steering_tag_ext = set_tph.steering_tag_ext;
+ priv->ph = set_tph.ph;
+ /*
+ * Publish the TPH values before the gate flag, so that lockless
+ * readers in vfio_pci_dma_buf_get_tph() see fully-initialized
+ * fields once they observe a non-zero tph_flags.
+ */
+ smp_store_release(&priv->tph_flags, set_tph.flags);
+ ret = 0;
+
+out_unlock:
+ up_write(&vdev->memory_lock);
+out_put:
+ dma_buf_put(dmabuf);
+ return ret;
+}
+
void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
{
struct vfio_pci_dma_buf *priv;
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index fca9d0dfac90..200dd061ea5d 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -118,6 +118,10 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
struct vfio_device_feature_dma_buf __user *arg,
size_t argsz);
+int vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev,
+ u32 flags,
+ struct vfio_device_feature_dma_buf_tph __user *arg,
+ size_t argsz);
void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev);
void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked);
#else
@@ -128,6 +132,13 @@ vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
{
return -ENOTTY;
}
+static inline int
+vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev, u32 flags,
+ struct vfio_device_feature_dma_buf_tph __user *arg,
+ size_t argsz)
+{
+ return -ENOTTY;
+}
static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
{
}
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index d1203da56fc5..d6a1b44052fc 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -113,6 +113,27 @@ struct dma_buf_ops {
*/
void (*unpin)(struct dma_buf_attachment *attach);
+ /**
+ * @get_tph:
+ * @dmabuf: DMA buffer for which to retrieve TPH metadata
+ * @steering_tag: Returns the raw TPH steering tag for @st_width
+ * @ph: Returns the TPH processing hint (2-bit value)
+ * @st_width: Consumer's supported steering tag width in bits (8 or 16)
+ *
+ * Return the TPH (TLP Processing Hints) metadata associated with this
+ * DMA buffer for the requested steering-tag width. 8-bit ST and 16-bit
+ * Extended ST are distinct namespaces in the PCIe TPH ST table, so the
+ * exporter must select the value that matches @st_width and must not
+ * substitute one for the other.
+ *
+ * Return 0 on success, -EOPNOTSUPP if no metadata is available for the
+ * requested width, or -EINVAL if @st_width is not 8 or 16.
+ *
+ * This callback is optional.
+ */
+ int (*get_tph)(struct dma_buf *dmabuf, u16 *steering_tag, u8 *ph,
+ u8 st_width);
+
/**
* @map_dma_buf:
*
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 5de618a3a5ee..53b2bbd9fc1e 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1534,6 +1534,41 @@ struct vfio_device_feature_dma_buf {
*/
#define VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2 12
+/**
+ * Upon VFIO_DEVICE_FEATURE_SET associate TPH (TLP Processing Hints) metadata
+ * with a vfio-exported dma-buf. The dma-buf must have been created by
+ * VFIO_DEVICE_FEATURE_DMA_BUF on this device.
+ *
+ * dmabuf_fd is the file descriptor returned by VFIO_DEVICE_FEATURE_DMA_BUF.
+ *
+ * 8-bit ST (steering_tag) and 16-bit Extended ST (steering_tag_ext) are
+ * distinct namespaces in the PCIe TPH ST table; userspace should populate
+ * the value(s) it has from the firmware ST table for this device and set
+ * the matching VFIO_DMA_BUF_TPH_ST / VFIO_DMA_BUF_TPH_ST_EXT bit in @flags.
+ * An importer requests a specific width and receives the matching value;
+ * if the requested width is not present, the importer is told TPH is
+ * unavailable for this dma-buf.
+ *
+ * ph is the 2-bit TLP Processing Hint and must be in the range [0, 3].
+ *
+ * The user must set TPH on the dma-buf before the importer consumes it.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+#define VFIO_DEVICE_FEATURE_DMA_BUF_TPH 13
+
+#define VFIO_DMA_BUF_TPH_ST (1 << 0) /* steering_tag valid */
+#define VFIO_DMA_BUF_TPH_ST_EXT (1 << 1) /* steering_tag_ext valid */
+
+struct vfio_device_feature_dma_buf_tph {
+ __s32 dmabuf_fd;
+ __u32 flags;
+ __u16 steering_tag;
+ __u16 steering_tag_ext;
+ __u8 ph;
+ __u8 reserved[3];
+};
+
/* -------- API for Type1 VFIO IOMMU -------- */
/**
--
2.52.0
^ permalink raw reply related [flat|nested] 14+ messages in thread* Claude review: vfio: add dma-buf get_tph callback and DMA_BUF_TPH feature
2026-05-12 18:47 ` [PATCH v3 1/2] vfio: add dma-buf get_tph callback and DMA_BUF_TPH feature Zhiping Zhang
@ 2026-05-16 3:07 ` Claude Code Review Bot
0 siblings, 0 replies; 14+ messages in thread
From: Claude Code Review Bot @ 2026-05-16 3:07 UTC (permalink / raw)
To: dri-devel-reviews
Patch Review
**uAPI structure looks correct.** `struct vfio_device_feature_dma_buf_tph` is 16 bytes with proper alignment and 3 reserved bytes for future use:
```c
struct vfio_device_feature_dma_buf_tph {
__s32 dmabuf_fd; /* 0-3 */
__u32 flags; /* 4-7 */
__u16 steering_tag; /* 8-9 */
__u16 steering_tag_ext; /* 10-11 */
__u8 ph; /* 12 */
__u8 reserved[3]; /* 13-15 */
};
```
**Feature number 13 is next available**, following `VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2 = 12` in the current tree.
**Input validation is thorough** — reserved fields, unknown flag bits, ph range:
```c
if (set_tph.reserved[0] || set_tph.reserved[1] || set_tph.reserved[2])
return -EINVAL;
if (set_tph.flags & ~(VFIO_DMA_BUF_TPH_ST | VFIO_DMA_BUF_TPH_ST_EXT))
return -EINVAL;
if (!set_tph.flags)
return -EINVAL;
if (set_tph.ph & ~0x3)
return -EINVAL;
```
**The memory ordering is correct.** Writers populate the value fields first, then publish with `smp_store_release()` on `tph_flags`; readers do `smp_load_acquire()` before reading the values. This is a well-established producer-consumer pattern.
**Issue: No revocation check when setting TPH.** In `vfio_pci_core_feature_dma_buf_tph`, the code acquires `memory_lock` and validates `priv->vdev == vdev`, but does not check `priv->revoked`. Setting TPH on a revoked dma-buf is arguably harmless (the get_tph callback would still work, but the buffer can't be mapped), but it seems like a bug to accept a SET ioctl that produces a state that can never be consumed. Consider adding:
```c
if (priv->vdev != vdev) {
ret = -EINVAL;
goto out_unlock;
}
+if (priv->revoked) {
+ ret = -ENODEV;
+ goto out_unlock;
+}
```
Note: accessing `priv->revoked` under `memory_lock` while it's written under `dma_resv_lock` is a data race unless `revoked` transitions monotonically and you treat the check as advisory. Since `revoked` is now a `bool` (not a bitfield), a plain load is fine on architectures where bool stores are atomic, but you might want `READ_ONCE(priv->revoked)` for correctness under KCSAN.
**Issue: No way to unset TPH.** The validation rejects `!set_tph.flags`, so once TPH is set, it cannot be cleared. This may be fine for the current use case, but it's worth documenting as a deliberate choice in the uAPI comment. If a future caller needs to re-set with different values, the current code does support that (subsequent SETs overwrite), but clearing back to "no TPH" is impossible.
**Minor: dma-buf core API change.** Adding `get_tph` to `struct dma_buf_ops` in `include/linux/dma-buf.h` changes the dma-buf framework's interface. This is optional (documented as such), but will need acks from the dma-buf maintainers (Christian König / Sumit Semwal). The comment block is well-written.
**The bitfield-to-bool conversion for `revoked` is correct** — it prevents the compiler from merging a read-modify-write on the bitfield byte across two different locks.
---
Generated by Claude Code Patch Reviewer
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH v2 1/2] vfio: add dma-buf get_tph callback and DMA_BUF_TPH feature
2026-04-30 20:06 [PATCH v2 0/2] vfio/dma-buf: add TPH support for peer-to-peer access Zhiping Zhang
@ 2026-04-30 20:06 ` Zhiping Zhang
2026-05-04 23:54 ` Claude review: " Claude Code Review Bot
0 siblings, 1 reply; 14+ messages in thread
From: Zhiping Zhang @ 2026-04-30 20:06 UTC (permalink / raw)
To: Alex Williamson, Jason Gunthorpe, Leon Romanovsky
Cc: Bjorn Helgaas, linux-rdma, linux-pci, netdev, dri-devel,
Keith Busch, Yochai Cohen, Yishai Hadas, Zhiping Zhang
Add a dma-buf callback that returns raw TPH metadata from the exporter
so peer devices can reuse the steering tag and processing hint
associated with a VFIO-exported buffer.
Add a new VFIO_DEVICE_FEATURE_DMA_BUF_TPH ioctl that takes the fd from
VFIO_DEVICE_FEATURE_DMA_BUF along with a steering tag and processing
hint, validates the fd is a vfio-exported dma-buf belonging to this
device, and stores the TPH values under memory_lock. This keeps the
existing VFIO_DEVICE_FEATURE_DMA_BUF uAPI completely unchanged.
The user sequences setting TPH on the dma-buf before the importer
consumes it.
Add an st_width parameter to get_tph() so the exporter can reject
steering tags that exceed the consumer's supported width (8 vs 16 bit).
When no TPH metadata was supplied, get_tph() returns -EOPNOTSUPP.
Signed-off-by: Zhiping Zhang <zhipingz@meta.com>
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1534,6 +1534,9 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
case VFIO_DEVICE_FEATURE_DMA_BUF:
return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
+ case VFIO_DEVICE_FEATURE_DMA_BUF_TPH:
+ return vfio_pci_core_feature_dma_buf_tph(vdev, flags, arg,
+ argsz);
default:
return -ENOTTY;
}
diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
--- a/drivers/vfio/pci/vfio_pci_dmabuf.c
+++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
@@ -19,6 +19,9 @@ struct vfio_pci_dma_buf {
u32 nr_ranges;
struct kref kref;
struct completion comp;
+ u16 steering_tag;
+ u8 ph;
+ u8 tph_present : 1;
u8 revoked : 1;
};
@@ -69,6 +72,22 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
return ret;
}
+static int vfio_pci_dma_buf_get_tph(struct dma_buf *dmabuf, u16 *steering_tag,
+ u8 *ph, u8 st_width)
+{
+ struct vfio_pci_dma_buf *priv = dmabuf->priv;
+
+ if (!priv->tph_present)
+ return -EOPNOTSUPP;
+
+ if (st_width < 16 && priv->steering_tag > ((1U << st_width) - 1))
+ return -EINVAL;
+
+ *steering_tag = priv->steering_tag;
+ *ph = priv->ph;
+ return 0;
+}
+
static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
struct sg_table *sgt,
enum dma_data_direction dir)
@@ -101,6 +120,7 @@ static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
.attach = vfio_pci_dma_buf_attach,
+ .get_tph = vfio_pci_dma_buf_get_tph,
.map_dma_buf = vfio_pci_dma_buf_map,
.unmap_dma_buf = vfio_pci_dma_buf_unmap,
.release = vfio_pci_dma_buf_release,
@@ -331,6 +351,55 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
return ret;
}
+int vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev,
+ u32 flags,
+ struct vfio_device_feature_dma_buf_tph __user *arg,
+ size_t argsz)
+{
+ struct vfio_device_feature_dma_buf_tph set_tph;
+ struct vfio_pci_dma_buf *priv;
+ struct dma_buf *dmabuf;
+ int ret;
+
+ ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
+ sizeof(set_tph));
+ if (ret != 1)
+ return ret;
+
+ if (copy_from_user(&set_tph, arg, sizeof(set_tph)))
+ return -EFAULT;
+
+ if (set_tph.reserved)
+ return -EINVAL;
+
+ dmabuf = dma_buf_get(set_tph.dmabuf_fd);
+ if (IS_ERR(dmabuf))
+ return PTR_ERR(dmabuf);
+
+ if (dmabuf->ops != &vfio_pci_dmabuf_ops) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ priv = dmabuf->priv;
+ down_write(&vdev->memory_lock);
+ if (priv->vdev != vdev) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ priv->steering_tag = set_tph.steering_tag;
+ priv->ph = set_tph.ph;
+ priv->tph_present = 1;
+ ret = 0;
+
+out_unlock:
+ up_write(&vdev->memory_lock);
+out_put:
+ dma_buf_put(dmabuf);
+ return ret;
+}
+
void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
{
struct vfio_pci_dma_buf *priv;
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -118,6 +118,10 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
struct vfio_device_feature_dma_buf __user *arg,
size_t argsz);
+int vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev,
+ u32 flags,
+ struct vfio_device_feature_dma_buf_tph __user *arg,
+ size_t argsz);
void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev);
void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked);
#else
@@ -128,6 +132,13 @@ vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
{
return -ENOTTY;
}
+static inline int
+vfio_pci_core_feature_dma_buf_tph(struct vfio_pci_core_device *vdev, u32 flags,
+ struct vfio_device_feature_dma_buf_tph __user *arg,
+ size_t argsz)
+{
+ return -ENOTTY;
+}
static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
{
}
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -113,6 +113,23 @@ struct dma_buf_ops {
*/
void (*unpin)(struct dma_buf_attachment *attach);
+ /**
+ * @get_tph:
+ * @dmabuf: DMA buffer for which to retrieve TPH metadata
+ * @steering_tag: Returns the raw TPH steering tag
+ * @ph: Returns the TPH processing hint
+ * @st_width: Consumer's supported steering tag width in bits (8 or 16)
+ *
+ * Return the TPH (TLP Processing Hints) metadata associated with this
+ * DMA buffer. Exporters that do not provide TPH metadata should return
+ * -EOPNOTSUPP. If the steering tag exceeds @st_width bits, return
+ * -EINVAL.
+ *
+ * This callback is optional.
+ */
+ int (*get_tph)(struct dma_buf *dmabuf, u16 *steering_tag, u8 *ph,
+ u8 st_width);
+
/**
* @map_dma_buf:
*
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1534,6 +1534,28 @@ struct vfio_device_feature_dma_buf {
*/
#define VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2 12
+/**
+ * Upon VFIO_DEVICE_FEATURE_SET associate TPH (TLP Processing Hints) metadata
+ * with a vfio-exported dma-buf. The dma-buf must have been created by
+ * VFIO_DEVICE_FEATURE_DMA_BUF on this device.
+ *
+ * dmabuf_fd is the file descriptor returned by VFIO_DEVICE_FEATURE_DMA_BUF.
+ * steering_tag and ph are the raw TPH values that importing drivers should use
+ * when accessing the buffer.
+ *
+ * The user must set TPH on the dma-buf before the importer consumes it.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+#define VFIO_DEVICE_FEATURE_DMA_BUF_TPH 13
+
+struct vfio_device_feature_dma_buf_tph {
+ __s32 dmabuf_fd;
+ __u16 steering_tag;
+ __u8 ph;
+ __u8 reserved;
+};
+
/* -------- API for Type1 VFIO IOMMU -------- */
/**
^ permalink raw reply [flat|nested] 14+ messages in thread* Claude review: vfio: add dma-buf get_tph callback and DMA_BUF_TPH feature
2026-04-30 20:06 ` [PATCH v2 1/2] vfio: add dma-buf get_tph callback and DMA_BUF_TPH feature Zhiping Zhang
@ 2026-05-04 23:54 ` Claude Code Review Bot
0 siblings, 0 replies; 14+ messages in thread
From: Claude Code Review Bot @ 2026-05-04 23:54 UTC (permalink / raw)
To: dri-devel-reviews
Patch Review
**Missing `ph` validation in the SET ioctl**
The `ph` (processing hint) field is accepted as a raw `__u8` with no validation. PCIe TLP Processing Hints are a 2-bit field (values 0–3). An out-of-range value would be silently stored and returned to importers:
```c
priv->steering_tag = set_tph.steering_tag;
priv->ph = set_tph.ph;
priv->tph_present = 1;
```
Consider adding a check like `if (set_tph.ph > 3) return -EINVAL;` before storing. Otherwise a malicious/buggy userspace can inject arbitrary PH values that importers would trust.
**get_tph callback reads fields without locking or barriers**
`vfio_pci_dma_buf_get_tph()` reads `tph_present`, `steering_tag`, and `ph` without holding any lock:
```c
if (!priv->tph_present)
return -EOPNOTSUPP;
...
*steering_tag = priv->steering_tag;
*ph = priv->ph;
```
While the SET path holds `memory_lock`, the read path does not. The commit message states "The user sequences setting TPH on the dma-buf before the importer consumes it," so in the expected usage there's no race. However, there's no compiler or CPU barrier ensuring the importer sees the stores in order. Consider using `smp_load_acquire`/`smp_store_release` on `tph_present`, or at minimum adding a comment documenting the caller-enforced ordering requirement. The dma-buf core doc for the callback should probably mention this too.
**Bitfield adjacency is safe under memory_lock**
`tph_present` and `revoked` are adjacent bitfields sharing a byte. Both the SET TPH ioctl and `vfio_pci_dma_buf_move` hold `memory_lock` when writing their respective bits, so there's no bitfield tearing risk between those paths. This is correct.
**st_width boundary: what values are valid?**
The `get_tph` callback accepts arbitrary `st_width`:
```c
if (st_width < 16 && priv->steering_tag > ((1U << st_width) - 1))
return -EINVAL;
```
The documentation says "8 or 16," but nothing in the callback or its callers enforces that. If `st_width == 0` is passed, this evaluates to `(1U << 0) - 1 = 0`, rejecting any nonzero tag. This won't cause a crash but may be surprising. Consider a brief comment or a check that `st_width` is one of the expected values.
**uAPI struct looks good**
```c
struct vfio_device_feature_dma_buf_tph {
__s32 dmabuf_fd;
__u16 steering_tag;
__u8 ph;
__u8 reserved;
};
```
8 bytes, naturally aligned, reserved field validated to zero, SET-only. Feature number 13 follows sequentially after 12 (`MIG_PRECOPY_INFOv2`). Clean.
**Stub for `!CONFIG_VFIO_PCI_DMABUF` looks correct**
The `#else` stub returning `-ENOTTY` mirrors the existing `vfio_pci_core_feature_dma_buf` stub.
---
Generated by Claude Code Patch Reviewer
^ permalink raw reply [flat|nested] 14+ messages in thread