public inbox for drm-ai-reviews@public-inbox.freedesktop.org
 help / color / mirror / Atom feed
From: Marek Czernohous <mczernohous@gmail.com>
To: nouveau@lists.freedesktop.org
Cc: Lyude Paul <lyude@redhat.com>, Danilo Krummrich <dakr@kernel.org>,
	dri-devel@lists.freedesktop.org, linux-kernel@vger.kernel.org,
	Marek Czernohous <marek@czernohous.de>
Subject: [PATCH 2/2] drm/nouveau/fifo: add recovery path for Tesla cache_error/dma_pusher
Date: Wed, 13 May 2026 19:50:13 +0200	[thread overview]
Message-ID: <20260513175014.96599-3-marek@czernohous.de> (raw)
In-Reply-To: <20260513175014.96599-1-marek@czernohous.de>

On Tesla / NV50 family chipsets (nv50, g84, g94, g98, mcp77, mcp79),
FIFO fault handling in nv04_fifo_intr_cache_error and
nv04_fifo_intr_dma_pusher logs the fault and resets hardware registers
but leaves the offending channel running. Compared to Fermi+ which
calls nvkm_chan_error from nvkm_runl_rc, Tesla has no escalation:
silent state corruption is possible, no telemetry beyond dmesg, and
repeated faults on the same channel keep firing forever.

Add a shared recovery helper nv04_fifo_recover() that both intr
handlers call after the existing logging+reset sequence. It implements
two tiers:

  Tier-1 (per-fault): nvkm_chan_get_chid + nvkm_chan_error(chan, true).
  The atomic chan->errored short-circuit means re-faults on the same
  channel are no-op; other channels are unaffected.

  Tier-2 (sliding-window): per-fifo lock-protected ring of fault
  timestamps. When the count within fifo_wedge_window_ms reaches
  fifo_wedge_count, schedule a workqueue job that emits a
  drm_dev_wedged_event with DRM_WEDGE_RECOVERY_REBIND. The
  drm_dev_wedged_event call cannot run from IRQ context because
  kobject_uevent_env may sleep; the workqueue indirection handles this.

Tracepoints (nouveau:fifo_chan_killed, nouveau:fifo_dev_wedged) provide
zero-overhead telemetry consumable via perf or bpftrace.

Module parameters fifo_wedge_count (default 10, range 0..32, 0=Tier-2
disabled) and fifo_wedge_window_ms (default 60000, range 100..600000)
allow tuning without rebuild.

Validated on Apple Mac mini Late 2009 (NVAC, MCP79).

Signed-off-by: Marek Czernohous <marek@czernohous.de>
---
 .../drm/nouveau/include/nvkm/engine/fifo.h    |  12 ++
 .../include/trace/events/nouveau_fifo.h       |  58 +++++++++
 drivers/gpu/drm/nouveau/nouveau_drm.c         |  29 +++++
 .../gpu/drm/nouveau/nvkm/engine/fifo/Kbuild   |   1 +
 .../gpu/drm/nouveau/nvkm/engine/fifo/base.c   |   3 +
 .../gpu/drm/nouveau/nvkm/engine/fifo/nv04.c   |   4 +
 .../gpu/drm/nouveau/nvkm/engine/fifo/priv.h   |  10 ++
 .../drm/nouveau/nvkm/engine/fifo/recover.c    | 121 ++++++++++++++++++
 8 files changed, 238 insertions(+)
 create mode 100644 drivers/gpu/drm/nouveau/include/trace/events/nouveau_fifo.h
 create mode 100644 drivers/gpu/drm/nouveau/nvkm/engine/fifo/recover.c

diff --git a/drivers/gpu/drm/nouveau/include/nvkm/engine/fifo.h b/drivers/gpu/drm/nouveau/include/nvkm/engine/fifo.h
index 96c16cfccf16..7c27b4c8a212 100644
--- a/drivers/gpu/drm/nouveau/include/nvkm/engine/fifo.h
+++ b/drivers/gpu/drm/nouveau/include/nvkm/engine/fifo.h
@@ -55,6 +55,17 @@ void nvkm_chan_put(struct nvkm_chan **, unsigned long irqflags);
 
 struct nvkm_chan *nvkm_uchan_chan(struct nvkm_object *);
 
+#define NVKM_FIFO_WEDGE_RING_MAX 32
+
+struct nvkm_fifo_wedge {
+	spinlock_t       lock;
+	u32              count;                                  /* aktuelle Fenster-Tiefe */
+	ktime_t          ts[NVKM_FIFO_WEDGE_RING_MAX];          /* Ring von Timestamps */
+	u32              head;                                   /* Ring-Head */
+	struct work_struct work;                                /* schedules drm_dev_wedged_event */
+	atomic_t         wedged;                                /* Tier-2 already fired? */
+};
+
 struct nvkm_fifo {
 	const struct nvkm_fifo_func *func;
 	struct nvkm_engine engine;
@@ -86,6 +97,7 @@ struct nvkm_fifo {
 
 	spinlock_t lock;
 	struct mutex mutex;
+	struct nvkm_fifo_wedge wedge;
 };
 
 void nvkm_fifo_fault(struct nvkm_fifo *, struct nvkm_fault_data *);
diff --git a/drivers/gpu/drm/nouveau/include/trace/events/nouveau_fifo.h b/drivers/gpu/drm/nouveau/include/trace/events/nouveau_fifo.h
new file mode 100644
index 000000000000..46d043a82850
--- /dev/null
+++ b/drivers/gpu/drm/nouveau/include/trace/events/nouveau_fifo.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: MIT */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM nouveau
+
+#if !defined(_TRACE_NOUVEAU_FIFO_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NOUVEAU_FIFO_H
+
+#include <linux/tracepoint.h>
+#include <drm/drm_device.h>
+
+TRACE_EVENT(nouveau_fifo_chan_killed,
+	TP_PROTO(struct drm_device *dev, u32 chid, u32 fault_type, u64 info),
+	TP_ARGS(dev, chid, fault_type, info),
+	TP_STRUCT__entry(
+		__string(devname, dev_name(dev->dev))
+		__field(u32, chid)
+		__field(u32, fault_type)
+		__field(u64, info)
+	),
+	TP_fast_assign(
+		__assign_str(devname);
+		__entry->chid = chid;
+		__entry->fault_type = fault_type;
+		__entry->info = info;
+	),
+	TP_printk("dev=%s chid=%u fault=%s info=0x%llx",
+		__get_str(devname),
+		__entry->chid,
+		__entry->fault_type == 0 ? "CACHE_ERROR" : "DMA_PUSHER",
+		__entry->info)
+);
+
+TRACE_EVENT(nouveau_fifo_dev_wedged,
+	TP_PROTO(struct drm_device *dev, u32 fault_count, u32 window_ms),
+	TP_ARGS(dev, fault_count, window_ms),
+	TP_STRUCT__entry(
+		__string(devname, dev_name(dev->dev))
+		__field(u32, fault_count)
+		__field(u32, window_ms)
+	),
+	TP_fast_assign(
+		__assign_str(devname);
+		__entry->fault_count = fault_count;
+		__entry->window_ms = window_ms;
+	),
+	TP_printk("dev=%s wedged after %u faults in %u ms",
+		__get_str(devname),
+		__entry->fault_count,
+		__entry->window_ms)
+);
+
+#endif /* _TRACE_NOUVEAU_FIFO_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../drivers/gpu/drm/nouveau/include/trace/events
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE nouveau_fifo
+#include <trace/define_trace.h>
diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c
index 517ff2c31dce..c62b7fc3a1d3 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drm.c
@@ -22,6 +22,8 @@
  * Authors: Ben Skeggs
  */
 
+#define CREATE_TRACE_POINTS
+
 #include <linux/aperture.h>
 #include <linux/delay.h>
 #include <linux/module.h>
@@ -74,6 +76,9 @@
 #include "nouveau_uvmm.h"
 #include "nouveau_sched.h"
 
+#include <engine/fifo.h>
+#include <trace/events/nouveau_fifo.h>
+
 DECLARE_DYNDBG_CLASSMAP(drm_debug_classes, DD_CLASS_TYPE_DISJOINT_BITS, 0,
 			"DRM_UT_CORE",
 			"DRM_UT_DRIVER",
@@ -111,6 +116,18 @@ MODULE_PARM_DESC(runpm, "disable (0), force enable (1), optimus only default (-1
 static int nouveau_runtime_pm = -1;
 module_param_named(runpm, nouveau_runtime_pm, int, 0400);
 
+MODULE_PARM_DESC(fifo_wedge_count,
+	"FIFO faults within window before drm_dev_wedged_event "
+	"(0=disable Tier-2, max 32, default 10)");
+unsigned int nouveau_fifo_wedge_count = 10;
+module_param_named(fifo_wedge_count, nouveau_fifo_wedge_count, uint, 0400);
+
+MODULE_PARM_DESC(fifo_wedge_window_ms,
+	"Sliding-window width in milliseconds for fifo_wedge_count "
+	"(default 60000)");
+unsigned int nouveau_fifo_wedge_window_ms = 60000;
+module_param_named(fifo_wedge_window_ms, nouveau_fifo_wedge_window_ms, uint, 0400);
+
 static struct drm_driver driver_stub;
 static struct drm_driver driver_pci;
 static struct drm_driver driver_platform;
@@ -1495,6 +1512,18 @@ nouveau_drm_init(void)
 	if (!nouveau_modeset)
 		return 0;
 
+	if (nouveau_fifo_wedge_count > NVKM_FIFO_WEDGE_RING_MAX) {
+		pr_warn("nouveau: fifo_wedge_count=%u exceeds max %u; clamping\n",
+			nouveau_fifo_wedge_count, NVKM_FIFO_WEDGE_RING_MAX);
+		nouveau_fifo_wedge_count = NVKM_FIFO_WEDGE_RING_MAX;
+	}
+	if (nouveau_fifo_wedge_window_ms < 100 ||
+	    nouveau_fifo_wedge_window_ms > 600000) {
+		pr_warn("nouveau: fifo_wedge_window_ms=%u out of range; resetting to 60000\n",
+			nouveau_fifo_wedge_window_ms);
+		nouveau_fifo_wedge_window_ms = 60000;
+	}
+
 	nouveau_module_debugfs_init();
 
 #ifdef CONFIG_NOUVEAU_PLATFORM_DRIVER
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/Kbuild b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/Kbuild
index 376e9c3bcb1a..1ff29753731d 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/Kbuild
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/Kbuild
@@ -5,6 +5,7 @@ nvkm-y += nvkm/engine/fifo/chan.o
 nvkm-y += nvkm/engine/fifo/chid.o
 nvkm-y += nvkm/engine/fifo/runl.o
 nvkm-y += nvkm/engine/fifo/runq.o
+nvkm-y += nvkm/engine/fifo/recover.o
 
 nvkm-y += nvkm/engine/fifo/nv04.o
 nvkm-y += nvkm/engine/fifo/nv10.o
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/base.c
index 9dd924694306..a61183fa38af 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/base.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/base.c
@@ -337,6 +337,8 @@ nvkm_fifo_dtor(struct nvkm_engine *engine)
 	struct nvkm_runl *runl, *runt;
 	struct nvkm_runq *runq, *rtmp;
 
+	nv04_fifo_wedge_fini(fifo);
+
 	if (fifo->userd.bar1)
 		nvkm_vmm_put(nvkm_bar_bar1_vmm(engine->subdev.device), &fifo->userd.bar1);
 	nvkm_memory_unref(&fifo->userd.mem);
@@ -390,6 +392,7 @@ nvkm_fifo_new_(const struct nvkm_fifo_func *func, struct nvkm_device *device,
 	fifo->timeout.chan_msec = 10000;
 	spin_lock_init(&fifo->lock);
 	mutex_init(&fifo->mutex);
+	nv04_fifo_wedge_init(fifo);
 
 	return nvkm_engine_ctor(&nvkm_fifo, device, type, inst, true, &fifo->engine);
 }
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/nv04.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/nv04.c
index fa13cd55b593..cb81941ecccd 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/nv04.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/nv04.c
@@ -345,6 +345,8 @@ nv04_fifo_intr_cache_error(struct nvkm_fifo *fifo, u32 chid, u32 get)
 				   chid, chan ? chan->name : "unknown",
 				   (mthd >> 13) & 7, mthd & 0x1ffc, data);
 			nvkm_chan_put(&chan, flags);
+			nv04_fifo_recover(fifo, chid, NV04_FAULT_CACHE_ERROR,
+					  ((u64)mthd << 32) | data);
 		}
 	}
 
@@ -410,6 +412,8 @@ nv04_fifo_intr_dma_pusher(struct nvkm_fifo *fifo, u32 chid)
 	}
 	nvkm_chan_put(&chan, flags);
 
+	nv04_fifo_recover(fifo, chid, NV04_FAULT_DMA_PUSHER, state);
+
 	nvkm_wr32(device, 0x003228, 0x00000000);
 	nvkm_wr32(device, 0x003220, 0x00000001);
 	nvkm_wr32(device, 0x002100, NV_PFIFO_INTR_DMA_PUSHER);
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/priv.h b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/priv.h
index fff1428ef267..bf551906dcd4 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/priv.h
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/priv.h
@@ -83,6 +83,16 @@ void nv04_chan_start(struct nvkm_chan *);
 void nv04_chan_stop(struct nvkm_chan *);
 void nv04_eobj_ramht_del(struct nvkm_chan *, int);
 
+/* Recovery helper for Tesla cache_error/dma_pusher (recover.c). */
+#define NV04_FAULT_CACHE_ERROR	0
+#define NV04_FAULT_DMA_PUSHER	1
+
+void nv04_fifo_recover(struct nvkm_fifo *fifo, u32 chid, u32 fault_type, u64 info);
+void nv04_fifo_wedge_init(struct nvkm_fifo *fifo);
+void nv04_fifo_wedge_fini(struct nvkm_fifo *fifo);
+extern unsigned int nouveau_fifo_wedge_count;
+extern unsigned int nouveau_fifo_wedge_window_ms;
+
 int nv10_fifo_chid_nr(struct nvkm_fifo *);
 
 int nv50_fifo_chid_nr(struct nvkm_fifo *);
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/recover.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/recover.c
new file mode 100644
index 000000000000..14c0eebf7040
--- /dev/null
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/recover.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: MIT
+/*
+ * nv04_fifo_recover - shared recovery helper for Tesla cache_error and
+ * dma_pusher fault paths.
+ *
+ * Tier-1: kill the offending channel via nvkm_chan_error.
+ * Tier-2: after a configurable burst of faults within a sliding time
+ *         window, request a device-wide drm_dev_wedged_event so userspace
+ *         can rebind the driver.
+ */
+
+#include "priv.h"
+#include "chan.h"
+
+#include <core/device.h>
+#include <subdev/timer.h>
+
+#include <linux/workqueue.h>
+#include <linux/jiffies.h>
+#include <linux/ktime.h>
+#include <drm/drm_drv.h>
+#include <drm/drm_device.h>
+
+#include "nouveau_drv.h"
+#include <trace/events/nouveau_fifo.h>
+
+static struct drm_device *
+nv04_fifo_drm_device(struct nvkm_fifo *fifo)
+{
+	struct nvkm_device *device = fifo->engine.subdev.device;
+	struct nouveau_drm *drm = dev_get_drvdata(device->dev);
+
+	return (drm && drm->dev) ? drm->dev : NULL;
+}
+
+void
+nv04_fifo_recover(struct nvkm_fifo *fifo, u32 chid, u32 fault_type, u64 info)
+{
+	struct drm_device *drm_dev = nv04_fifo_drm_device(fifo);
+	struct nvkm_chan *chan;
+	unsigned long flags;
+	ktime_t now, cutoff;
+	u32 i, count;
+
+	if (drm_dev)
+		trace_nouveau_fifo_chan_killed(drm_dev, chid, fault_type, info);
+
+	chan = nvkm_chan_get_chid(&fifo->engine, chid, &flags);
+	if (chan) {
+		nvkm_chan_error(chan, true);
+		nvkm_chan_put(&chan, flags);
+	}
+
+	if (nouveau_fifo_wedge_count == 0)
+		return;
+
+	now = ktime_get();
+	cutoff = ktime_sub_ms(now, nouveau_fifo_wedge_window_ms);
+
+	spin_lock_irqsave(&fifo->wedge.lock, flags);
+
+	/* Insert current first, then purge expired and count survivors. */
+	fifo->wedge.ts[fifo->wedge.head] = now;
+	fifo->wedge.head = (fifo->wedge.head + 1) % NVKM_FIFO_WEDGE_RING_MAX;
+
+	count = 0;
+	for (i = 0; i < NVKM_FIFO_WEDGE_RING_MAX; i++) {
+		if (!ktime_to_ns(fifo->wedge.ts[i]))
+			continue;
+		if (ktime_before(fifo->wedge.ts[i], cutoff))
+			fifo->wedge.ts[i] = 0;
+		else
+			count++;
+	}
+	fifo->wedge.count = count;
+
+	if (count >= nouveau_fifo_wedge_count)
+		schedule_work(&fifo->wedge.work);
+
+	spin_unlock_irqrestore(&fifo->wedge.lock, flags);
+}
+
+static void
+nv04_fifo_wedge_work(struct work_struct *work)
+{
+	struct nvkm_fifo_wedge *w = container_of(work, struct nvkm_fifo_wedge, work);
+	struct nvkm_fifo *fifo = container_of(w, struct nvkm_fifo, wedge);
+	struct drm_device *drm_dev = nv04_fifo_drm_device(fifo);
+	u32 fault_count;
+
+	if (atomic_xchg(&w->wedged, 1) != 0)
+		return; /* already wedged this cycle */
+
+	if (!drm_dev)
+		return;
+
+	fault_count = w->count;
+
+	dev_info(drm_dev->dev,
+		 "nouveau: fifo wedged after %u faults in %u ms\n",
+		 fault_count, nouveau_fifo_wedge_window_ms);
+
+	trace_nouveau_fifo_dev_wedged(drm_dev, fault_count,
+				      nouveau_fifo_wedge_window_ms);
+
+	drm_dev_wedged_event(drm_dev, DRM_WEDGE_RECOVERY_REBIND, NULL);
+}
+
+void
+nv04_fifo_wedge_init(struct nvkm_fifo *fifo)
+{
+	spin_lock_init(&fifo->wedge.lock);
+	INIT_WORK(&fifo->wedge.work, nv04_fifo_wedge_work);
+	atomic_set(&fifo->wedge.wedged, 0);
+}
+
+void
+nv04_fifo_wedge_fini(struct nvkm_fifo *fifo)
+{
+	cancel_work_sync(&fifo->wedge.work);
+}
-- 
2.53.0


  parent reply	other threads:[~2026-05-13 17:50 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-13 17:50 [PATCH 0/2] drm/nouveau: nv04 FIFO cleanup + recovery for Tesla Marek Czernohous
2026-05-13 17:50 ` [PATCH 1/2] drm/nouveau/fifo/nv04: filter benign CACHE_ERROR from Mesa NV50 bind probe Marek Czernohous
2026-05-16  1:34   ` Claude review: " Claude Code Review Bot
2026-05-13 17:50 ` Marek Czernohous [this message]
2026-05-16  1:34   ` Claude review: drm/nouveau/fifo: add recovery path for Tesla cache_error/dma_pusher Claude Code Review Bot
2026-05-16  1:34 ` Claude review: drm/nouveau: nv04 FIFO cleanup + recovery for Tesla Claude Code Review Bot

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260513175014.96599-3-marek@czernohous.de \
    --to=mczernohous@gmail.com \
    --cc=dakr@kernel.org \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=lyude@redhat.com \
    --cc=marek@czernohous.de \
    --cc=nouveau@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox