[PATCH v1] misc: fastrpc: fix context leak and hang on signal-interrupted invoke

public inbox for drm-ai-reviews@public-inbox.freedesktop.org
 help / color / mirror / Atom feed

* [PATCH v1] misc: fastrpc: fix context leak and hang on signal-interrupted invoke
@ 2026-05-25 12:42 Anandu Krishnan E
  2026-05-25 13:28 ` Dmitry Baryshkov
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Anandu Krishnan E @ 2026-05-25 12:42 UTC (permalink / raw)
  To: srini, linux-arm-msm
  Cc: gregkh, quic_bkumar, linux-kernel, quic_chennak, dri-devel, arnd,
	ekansh.gupta, stable

fastrpc invokes work by sending an RPC message to the DSP and blocking
in wait_for_completion_interruptible() until the DSP responds. If a
signal arrives during this wait, the syscall returns -ERESTARTSYS and
the invoke context which holds the in-flight DMA buffers and
completion state is left stranded in fl->pending.

On the next syscall attempt (either auto-restarted by the kernel via
SA_RESTART or manually retried by user-space after EINTR), a fresh
context is allocated and the RPC message is re-sent to the DSP. This
has two consequences:

  - The original context leaks in fl->pending until the file is closed.
  - The DSP receives a duplicate invocation. If the DSP was mid-way
    through processing the first request and had issued a reverse RPC
    call back to the host, the retry sends a new forward request
    instead of the expected reverse-RPC response. The DSP thread
    waiting for that response is never woken, causing a hang.

Fix this by saving the interrupted context to a new fl->interrupted
list on -ERESTARTSYS. When the same thread retries the invoke with a
matching sc, restore the context and jump directly to the wait,
skipping context allocation and message re-send.

Also drain fl->interrupted on process exit and complete any sleeping
contexts with -EPIPE when the rpmsg channel is removed.

Fixes: 387f625585d1 ("misc: fastrpc: handle interrupted contexts")
Cc: stable@kernel.org
Signed-off-by: Anandu Krishnan E <anandu.e@oss.qualcomm.com>
---
 drivers/misc/fastrpc.c | 69 ++++++++++++++++++++++++++++++++----------
 1 file changed, 53 insertions(+), 16 deletions(-)

diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c
index 1080f9acf70a..22d0b0592c10 100644
--- a/drivers/misc/fastrpc.c
+++ b/drivers/misc/fastrpc.c
@@ -280,7 +280,6 @@ struct fastrpc_channel_ctx {
 	struct fastrpc_device *secure_fdevice;
 	struct fastrpc_device *fdevice;
 	struct fastrpc_buf *remote_heap;
-	struct list_head invoke_interrupted_mmaps;
 	bool secure;
 	bool unsigned_support;
 	u64 dma_mask;
@@ -297,6 +296,7 @@ struct fastrpc_user {
 	struct list_head user;
 	struct list_head maps;
 	struct list_head pending;
+	struct list_head interrupted;
 	struct list_head mmaps;
 
 	struct fastrpc_channel_ctx *cctx;
@@ -542,6 +542,36 @@ static void fastrpc_context_put_wq(struct work_struct *work)
 	fastrpc_context_put(ctx);
 }
 
+static void fastrpc_context_save_interrupted(struct fastrpc_invoke_ctx *ctx)
+{
+	spin_lock(&ctx->fl->lock);
+	list_del(&ctx->node);
+	list_add_tail(&ctx->node, &ctx->fl->interrupted);
+	spin_unlock(&ctx->fl->lock);
+}
+
+static struct fastrpc_invoke_ctx *fastrpc_context_restore_interrupted(
+			struct fastrpc_user *fl, u32 sc)
+{
+	struct fastrpc_invoke_ctx *ctx = NULL, *ictx, *n;
+
+	spin_lock(&fl->lock);
+	list_for_each_entry_safe(ictx, n, &fl->interrupted, node) {
+		if (ictx->pid != current->pid)
+			continue;
+		if (ictx->sc != sc || ictx->fl != fl) {
+			spin_unlock(&fl->lock);
+			return ERR_PTR(-EINVAL);
+		}
+		ctx = ictx;
+		list_del(&ctx->node);
+		list_add_tail(&ctx->node, &fl->pending);
+		break;
+	}
+	spin_unlock(&fl->lock);
+	return ctx;
+}
+
 #define CMP(aa, bb) ((aa) == (bb) ? 0 : (aa) < (bb) ? -1 : 1)
 static int olaps_cmp(const void *a, const void *b)
 {
@@ -1197,8 +1227,6 @@ static int fastrpc_internal_invoke(struct fastrpc_user *fl,  u32 kernel,
 				   struct fastrpc_invoke_args *args)
 {
 	struct fastrpc_invoke_ctx *ctx = NULL;
-	struct fastrpc_buf *buf, *b;
-
 	int err = 0;
 
 	if (!fl->sctx)
@@ -1212,6 +1240,14 @@ static int fastrpc_internal_invoke(struct fastrpc_user *fl,  u32 kernel,
 		return -EPERM;
 	}
 
+	if (!kernel) {
+		ctx = fastrpc_context_restore_interrupted(fl, sc);
+		if (IS_ERR(ctx))
+			return PTR_ERR(ctx);
+		if (ctx)
+			goto wait;
+	}
+
 	ctx = fastrpc_context_alloc(fl, kernel, sc, args);
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
@@ -1227,6 +1263,7 @@ static int fastrpc_internal_invoke(struct fastrpc_user *fl,  u32 kernel,
 	if (err)
 		goto bail;
 
+wait:
 	if (kernel) {
 		if (!wait_for_completion_timeout(&ctx->work, 10 * HZ))
 			err = -ETIMEDOUT;
@@ -1250,7 +1287,9 @@ static int fastrpc_internal_invoke(struct fastrpc_user *fl,  u32 kernel,
 		goto bail;
 
 bail:
-	if (err != -ERESTARTSYS && err != -ETIMEDOUT) {
+	if (ctx && err == -ERESTARTSYS) {
+		fastrpc_context_save_interrupted(ctx);
+	} else if (ctx && err != -ETIMEDOUT) {
 		/* We are done with this compute context */
 		spin_lock(&fl->lock);
 		list_del(&ctx->node);
@@ -1258,13 +1297,6 @@ static int fastrpc_internal_invoke(struct fastrpc_user *fl,  u32 kernel,
 		fastrpc_context_put(ctx);
 	}
 
-	if (err == -ERESTARTSYS) {
-		list_for_each_entry_safe(buf, b, &fl->mmaps, node) {
-			list_del(&buf->node);
-			list_add_tail(&buf->node, &fl->cctx->invoke_interrupted_mmaps);
-		}
-	}
-
 	if (err)
 		dev_dbg(fl->sctx->dev, "Error: Invoke Failed %d\n", err);
 
@@ -1598,6 +1630,11 @@ static int fastrpc_device_release(struct inode *inode, struct file *file)
 		fastrpc_context_put(ctx);
 	}
 
+	list_for_each_entry_safe(ctx, n, &fl->interrupted, node) {
+		list_del(&ctx->node);
+		fastrpc_context_put(ctx);
+	}
+
 	list_for_each_entry_safe(map, m, &fl->maps, node)
 		fastrpc_map_put(map);
 
@@ -1637,6 +1674,7 @@ static int fastrpc_device_open(struct inode *inode, struct file *filp)
 	spin_lock_init(&fl->lock);
 	mutex_init(&fl->mutex);
 	INIT_LIST_HEAD(&fl->pending);
+	INIT_LIST_HEAD(&fl->interrupted);
 	INIT_LIST_HEAD(&fl->maps);
 	INIT_LIST_HEAD(&fl->mmaps);
 	INIT_LIST_HEAD(&fl->user);
@@ -2435,7 +2473,6 @@ static int fastrpc_rpmsg_probe(struct rpmsg_device *rpdev)
 	rdev->dma_mask = &data->dma_mask;
 	dma_set_mask_and_coherent(rdev, DMA_BIT_MASK(32));
 	INIT_LIST_HEAD(&data->users);
-	INIT_LIST_HEAD(&data->invoke_interrupted_mmaps);
 	spin_lock_init(&data->lock);
 	idr_init(&data->ctx_idr);
 	data->domain_id = domain_id;
@@ -2467,13 +2504,16 @@ static void fastrpc_notify_users(struct fastrpc_user *user)
 		ctx->retval = -EPIPE;
 		complete(&ctx->work);
 	}
+	list_for_each_entry(ctx, &user->interrupted, node) {
+		ctx->retval = -EPIPE;
+		complete(&ctx->work);
+	}
 	spin_unlock(&user->lock);
 }
 
 static void fastrpc_rpmsg_remove(struct rpmsg_device *rpdev)
 {
 	struct fastrpc_channel_ctx *cctx = dev_get_drvdata(&rpdev->dev);
-	struct fastrpc_buf *buf, *b;
 	struct fastrpc_user *user;
 	unsigned long flags;
 
@@ -2490,9 +2530,6 @@ static void fastrpc_rpmsg_remove(struct rpmsg_device *rpdev)
 	if (cctx->secure_fdevice)
 		misc_deregister(&cctx->secure_fdevice->miscdev);
 
-	list_for_each_entry_safe(buf, b, &cctx->invoke_interrupted_mmaps, node)
-		list_del(&buf->node);
-
 	if (cctx->remote_heap)
 		fastrpc_buf_free(cctx->remote_heap);
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH v1] misc: fastrpc: fix context leak and hang on signal-interrupted invoke
  2026-05-25 12:42 [PATCH v1] misc: fastrpc: fix context leak and hang on signal-interrupted invoke Anandu Krishnan E
@ 2026-05-25 13:28 ` Dmitry Baryshkov
  2026-05-25 21:14 ` Claude review: " Claude Code Review Bot
  2026-05-25 21:14 ` Claude Code Review Bot
  2 siblings, 0 replies; 4+ messages in thread
From: Dmitry Baryshkov @ 2026-05-25 13:28 UTC (permalink / raw)
  To: Anandu Krishnan E
  Cc: srini, linux-arm-msm, gregkh, quic_bkumar, linux-kernel,
	quic_chennak, dri-devel, arnd, ekansh.gupta, stable

On Mon, May 25, 2026 at 06:12:22PM +0530, Anandu Krishnan E wrote:
> fastrpc invokes work by sending an RPC message to the DSP and blocking
> in wait_for_completion_interruptible() until the DSP responds. If a
> signal arrives during this wait, the syscall returns -ERESTARTSYS and
> the invoke context which holds the in-flight DMA buffers and
> completion state is left stranded in fl->pending.
> 
> On the next syscall attempt (either auto-restarted by the kernel via
> SA_RESTART or manually retried by user-space after EINTR), a fresh
> context is allocated and the RPC message is re-sent to the DSP. This
> has two consequences:
> 
>   - The original context leaks in fl->pending until the file is closed.
>   - The DSP receives a duplicate invocation. If the DSP was mid-way
>     through processing the first request and had issued a reverse RPC
>     call back to the host, the retry sends a new forward request
>     instead of the expected reverse-RPC response. The DSP thread
>     waiting for that response is never woken, causing a hang.
> 
> Fix this by saving the interrupted context to a new fl->interrupted
> list on -ERESTARTSYS. When the same thread retries the invoke with a
> matching sc, restore the context and jump directly to the wait,
> skipping context allocation and message re-send.

What if the userspace doesn't honour -ERESTARTSYS and submits a new
workload?

> 
> Also drain fl->interrupted on process exit and complete any sleeping
> contexts with -EPIPE when the rpmsg channel is removed.
> 
> Fixes: 387f625585d1 ("misc: fastrpc: handle interrupted contexts")
> Cc: stable@kernel.org
> Signed-off-by: Anandu Krishnan E <anandu.e@oss.qualcomm.com>
> ---
>  drivers/misc/fastrpc.c | 69 ++++++++++++++++++++++++++++++++----------
>  1 file changed, 53 insertions(+), 16 deletions(-)
> 

-- 
With best wishes
Dmitry

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Claude review: misc: fastrpc: fix context leak and hang on signal-interrupted invoke
  2026-05-25 12:42 [PATCH v1] misc: fastrpc: fix context leak and hang on signal-interrupted invoke Anandu Krishnan E
  2026-05-25 13:28 ` Dmitry Baryshkov
  2026-05-25 21:14 ` Claude review: " Claude Code Review Bot
@ 2026-05-25 21:14 ` Claude Code Review Bot
  2 siblings, 0 replies; 4+ messages in thread
From: Claude Code Review Bot @ 2026-05-25 21:14 UTC (permalink / raw)
  To: dri-devel-reviews

Overall Series Review

Subject: misc: fastrpc: fix context leak and hang on signal-interrupted invoke
Author: Anandu Krishnan E <anandu.e@oss.qualcomm.com>
Patches: 2
Reviewed: 2026-05-26T07:14:36.687555

---

This is a single-patch fix for a real and well-described bug: when a signal interrupts `wait_for_completion_interruptible()` during a fastrpc invoke, the context leaks and retrying causes DSP hangs due to duplicate invocations. The approach of saving interrupted contexts on a per-user list and restoring them on retry is reasonable and follows the existing code patterns.

The patch also replaces the old `invoke_interrupted_mmaps` mechanism (which only saved DMA buffers at the channel level) with context-level preservation at the user level — a more correct granularity.

However, there are several issues ranging from a potential correctness bug to missing handling for edge cases.

**Verdict: Needs revision** — the matching logic has a correctness issue with PID vs TID semantics, the `handle` parameter is not checked in the restore path, and the mmaps leak concern is not addressed.

---
Generated by Claude Code Patch Reviewer

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Claude review: misc: fastrpc: fix context leak and hang on signal-interrupted invoke
  2026-05-25 12:42 [PATCH v1] misc: fastrpc: fix context leak and hang on signal-interrupted invoke Anandu Krishnan E
  2026-05-25 13:28 ` Dmitry Baryshkov
@ 2026-05-25 21:14 ` Claude Code Review Bot
  2026-05-25 21:14 ` Claude Code Review Bot
  2 siblings, 0 replies; 4+ messages in thread
From: Claude Code Review Bot @ 2026-05-25 21:14 UTC (permalink / raw)
  To: dri-devel-reviews

Patch Review

**1. PID matching uses thread ID, not process ID — naming confusion and potential multi-thread issue**

```c
+static struct fastrpc_invoke_ctx *fastrpc_context_restore_interrupted(
+			struct fastrpc_user *fl, u32 sc)
+{
+	struct fastrpc_invoke_ctx *ctx = NULL, *ictx, *n;
+
+	spin_lock(&fl->lock);
+	list_for_each_entry_safe(ictx, n, &fl->interrupted, node) {
+		if (ictx->pid != current->pid)
+			continue;
+		if (ictx->sc != sc || ictx->fl != fl) {
+			spin_unlock(&fl->lock);
+			return ERR_PTR(-EINVAL);
+		}
```

`ctx->pid` is set from `current->pid` (which is the kernel thread ID / `gettid()`, not the userspace `getpid()` concept). The `fl` (fastrpc_user) is per-fd, so if multiple threads share the same fd, multiple threads can have entries on `fl->interrupted`. The matching by `current->pid` (really TID) is correct for identifying the right thread's context.

However, the error path is problematic: if a thread finds an interrupted context with matching TID but **different** `sc`, it returns `-EINVAL` and the lock is released. But what about the case where a thread was interrupted on one `sc` and now calls with a different `sc`? That's a legitimate scenario (the userspace decided to abandon the old call and make a new one). Returning `-EINVAL` prevents the thread from making **any** new invoke call until the file is closed. The old interrupted context also remains stranded on the list. A better approach might be to free the stale interrupted context and let the new call proceed.

Also, the `ictx->fl != fl` check is redundant — all contexts on `fl->interrupted` have `fl` as their user by construction.

**2. Missing `handle` parameter validation in the restore path**

```c
+	if (!kernel) {
+		ctx = fastrpc_context_restore_interrupted(fl, sc);
+		if (IS_ERR(ctx))
+			return PTR_ERR(ctx);
+		if (ctx)
+			goto wait;
+	}
```

The restore only matches on `sc` (scalars descriptor), but `fastrpc_internal_invoke` also takes a `handle` parameter. A restored context could have a different handle than the retry. The `handle` is not stored in `fastrpc_invoke_ctx`, so it can't be validated currently, but this should be considered — if userspace retries with the same `sc` but a different `handle`, the wrong context would be restored.

**3. mmaps handling removed without replacement**

The old code moved `fl->mmaps` to `cctx->invoke_interrupted_mmaps` on `-ERESTARTSYS`:

```c
-	if (err == -ERESTARTSYS) {
-		list_for_each_entry_safe(buf, b, &fl->mmaps, node) {
-			list_del(&buf->node);
-			list_add_tail(&buf->node, &fl->cctx->invoke_interrupted_mmaps);
-		}
-	}
```

The patch removes this but doesn't add any equivalent handling for the mmaps. The context is preserved, but what about the DMA buffers on `fl->mmaps` that were associated with the invoke? If a retry happens and goes through `fastrpc_get_args` again (which won't happen due to the `goto wait`, but if the context is eventually freed without completing, the mmaps might be orphaned). It would be good to clarify in the commit message why the mmap migration is no longer needed — presumably because the context itself now survives and holds references to the maps, but this should be explicitly stated.

**4. `fastrpc_notify_users` iterates `interrupted` list and calls `complete()` — but nobody is waiting**

```c
+	list_for_each_entry(ctx, &user->interrupted, node) {
+		ctx->retval = -EPIPE;
+		complete(&ctx->work);
+	}
```

Contexts on the `interrupted` list are by definition **not** being waited on — the thread returned `-ERESTARTSYS` and left the wait. Calling `complete()` here sets the completion so that if the thread retries and enters the wait again, it will immediately see `-EPIPE`. This is functional, but subtle — it relies on the thread re-entering `wait_for_completion_interruptible()` after the context is restored. If the thread never retries (e.g., process is exiting), the context is cleaned up in `fastrpc_device_release`. This seems correct but deserves a comment explaining the intent.

**5. Bail path logic change alters `-ETIMEDOUT` handling**

Original:
```c
	if (err != -ERESTARTSYS && err != -ETIMEDOUT) {
		/* We are done with this compute context */
		spin_lock(&fl->lock);
		list_del(&ctx->node);
		spin_unlock(&fl->lock);
		fastrpc_context_put(ctx);
	}
```

New:
```c
+	if (ctx && err == -ERESTARTSYS) {
+		fastrpc_context_save_interrupted(ctx);
+	} else if (ctx && err != -ETIMEDOUT) {
```

The `ctx &&` guard is added, which is necessary because the restore path could hypothetically have `ctx == NULL` by the time we reach bail... except it can't, since if `ctx` is NULL the function would have proceeded to `fastrpc_context_alloc` which always assigns `ctx`. Actually, reviewing the flow: if `fastrpc_context_restore_interrupted` returns NULL, we fall through to `fastrpc_context_alloc`, so `ctx` is always non-NULL at `bail`. The guard is harmless but unnecessary for non-kernel paths. For the kernel path, `ctx` was always non-NULL in the original code too. So the `ctx &&` check is defensive but not required.

**6. `list_for_each_entry_safe` may not be needed in the restore function**

```c
+	list_for_each_entry_safe(ictx, n, &fl->interrupted, node) {
```

Since the function `break`s after finding the first match (and the early return on mismatch also exits the loop), `list_for_each_entry_safe` isn't strictly needed — `list_for_each_entry` would suffice since at most one `list_del` happens before exiting. This is a minor style point.

**7. Style: unnecessary braces around single statement**

```c
+	if (ctx && err == -ERESTARTSYS) {
+		fastrpc_context_save_interrupted(ctx);
+	} else if (ctx && err != -ETIMEDOUT) {
```

The kernel coding style generally doesn't require braces for single-statement `if` bodies, though this is minor and some subsystems tolerate it in `if/else if` chains for visual consistency.

**Summary of requested changes:**

1. **(Important)** Reconsider the `-EINVAL` return when `sc` doesn't match — either free the stale context and allow the new call, or at minimum document why this restriction exists.
2. **(Important)** Consider validating the `handle` in the restore path, or document why `sc` alone is sufficient for correctness.
3. **(Minor)** Remove the redundant `ictx->fl != fl` check.
4. **(Minor)** Add a brief comment in `fastrpc_notify_users` explaining why `complete()` is called on contexts that aren't currently being waited on.
5. **(Minor)** Explain in the commit message why the mmap migration to `invoke_interrupted_mmaps` is no longer needed.

---
Generated by Claude Code Patch Reviewer

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2026-05-25 21:14 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-05-25 12:42 [PATCH v1] misc: fastrpc: fix context leak and hang on signal-interrupted invoke Anandu Krishnan E
2026-05-25 13:28 ` Dmitry Baryshkov
2026-05-25 21:14 ` Claude review: " Claude Code Review Bot
2026-05-25 21:14 ` Claude Code Review Bot

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox