From 9ff6a8a1e1ae94f1cc0f4dc56f0dd11ac60d7f7b Mon Sep 17 00:00:00 2001 From: Anna Maniscalco Date: Tue, 10 Feb 2026 17:29:42 +0100 Subject: [PATCH 1/6] FROMLIST: drm/msm: always recover the gpu Previously, in case there was no more work to do, recover worker wouldn't trigger recovery and would instead rely on the gpu going to sleep and then resuming when more work is submitted. Recover_worker will first increment the fence of the hung ring so, if there's only one job submitted to a ring and that causes a hang, it will early out. There's no guarantee that the gpu will suspend and resume before more work is submitted and if the gpu is in a hung state it will stay in that state and probably trigger a timeout again. Just stop checking and always recover the gpu. Signed-off-by: Anna Maniscalco Link: https://lore.kernel.org/linux-arm-msm/20260210-recovery_suspend_fix-v1-1-00ed9013da04@gmail.com/ Message-ID: <20260210-recovery_suspend_fix-v1-1-00ed9013da04@gmail.com> Signed-off-by: Rob Clark Signed-off-by: Veeresh Bagale (cherry picked from commit 01a0d6cd7032e9993feea19fadb03ef9d5b488f2) --- drivers/gpu/drm/msm/msm_gpu.c | 42 +++++++++++++++++------------------ 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index 995549d0bbbc5..ea3e79670f75e 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -547,32 +547,30 @@ static void recover_worker(struct kthread_work *work) msm_update_fence(ring->fctx, fence); } - if (msm_gpu_active(gpu)) { - /* retire completed submits, plus the one that hung: */ - retire_submits(gpu); + /* retire completed submits, plus the one that hung: */ + retire_submits(gpu); - gpu->funcs->recover(gpu); + gpu->funcs->recover(gpu); - /* - * Replay all remaining
submits starting with highest priority + * ring + */ + for (i = 0; i < gpu->nr_rings; i++) { + struct msm_ringbuffer *ring = gpu->rb[i]; + unsigned long flags; - spin_lock_irqsave(&ring->submit_lock, flags); - list_for_each_entry(submit, &ring->submits, node) { - /* - * If the submit uses an unusable vm make sure - * we don't actually run it - */ - if (to_msm_vm(submit->vm)->unusable) - submit->nr_cmds = 0; - gpu->funcs->submit(gpu, submit); - } - spin_unlock_irqrestore(&ring->submit_lock, flags); + spin_lock_irqsave(&ring->submit_lock, flags); + list_for_each_entry(submit, &ring->submits, node) { + /* + * If the submit uses an unusable vm make sure + * we don't actually run it + */ + if (to_msm_vm(submit->vm)->unusable) + submit->nr_cmds = 0; + gpu->funcs->submit(gpu, submit); } + spin_unlock_irqrestore(&ring->submit_lock, flags); } pm_runtime_put(&gpu->pdev->dev); From bc5f4d8252ea449eabecd083b719ae2599fbe68f Mon Sep 17 00:00:00 2001 From: Jie Zhang Date: Fri, 5 Jun 2026 01:38:18 +0530 Subject: [PATCH 2/6] FROMLIST: drm/msm: Recover HW before retire hung submit During recovery, it is not safe to retire the hung submit before we recover the GPU. Retiring the submit triggers BO free and that can result in GPU pagefaults since the GPU may be actively accessing those BOs. To fix this, retire the submits after gpu recovery is complete in recover_worker(). 
Fixes: 1a370be9ac51 ("drm/msm: restart queued submits after hang") Signed-off-by: Veeresh Bagale Signed-off-by: Jie Zhang Signed-off-by: Akhil P Oommen Link: https://lore.kernel.org/linux-arm-msm/20260605-assorted-fixes-june-v1-2-2caa04f7287c@oss.qualcomm.com --- drivers/gpu/drm/msm/msm_gpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index ea3e79670f75e..66e2820ae3583 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -547,11 +547,11 @@ static void recover_worker(struct kthread_work *work) msm_update_fence(ring->fctx, fence); } + gpu->funcs->recover(gpu); + /* retire completed submits, plus the one that hung: */ retire_submits(gpu); - gpu->funcs->recover(gpu); - /* * Replay all remaining submits starting with highest priority * ring From 771f93711c6e447dde47377ad46a2c81c47cf8ae Mon Sep 17 00:00:00 2001 From: Jie Zhang Date: Fri, 5 Jun 2026 01:38:19 +0530 Subject: [PATCH 3/6] FROMLIST: drm/msm/a6xx: Fix A663 GPUCC register list for state capture The GPUCC register list for A663 is incorrect, which can cause out-of-bounds register access during GPU state capture. Update it to use the correct register ranges. 
Fixes: 5773cce8615c ("drm/msm/a6xx: Add support for A663") Signed-off-by: Veeresh Bagale Signed-off-by: Jie Zhang Signed-off-by: Akhil P Oommen Link: https://lore.kernel.org/linux-arm-msm/20260605-assorted-fixes-june-v1-3-2caa04f7287c@oss.qualcomm.com --- drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c index d2d6b2fd3cba3..1daedb9ebfb9b 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c @@ -1244,7 +1244,9 @@ static void a6xx_get_gmu_registers(struct msm_gpu *gpu, _a6xx_get_gmu_registers(gpu, a6xx_state, &a6xx_gmu_reglist[1], &a6xx_state->gmu_registers[1], true); - if (adreno_is_a621(adreno_gpu) || adreno_is_a623(adreno_gpu)) + if (adreno_is_a621(adreno_gpu) || + adreno_is_a623(adreno_gpu) || + adreno_is_a663(adreno_gpu)) _a6xx_get_gmu_registers(gpu, a6xx_state, &a621_gpucc_reg, &a6xx_state->gmu_registers[2], false); else From 97dd5d9188577904bcab2f30add4077ea5c95992 Mon Sep 17 00:00:00 2001 From: Jie Zhang Date: Fri, 5 Jun 2026 01:38:20 +0530 Subject: [PATCH 4/6] FROMLIST: drm/msm/a6xx: Fix A621 GPUCC register list for state capture A621 uses an incorrect GPUCC register list during state capture. The existing list matches A623/A663. Rename it accordingly and add a dedicated A621 GPUCC register list. 
Fixes: 11cdb81b3c1b ("drm/msm/a6xx: Fix gpucc register block for A621") Signed-off-by: Veeresh Bagale Signed-off-by: Jie Zhang Signed-off-by: Akhil P Oommen Link: https://lore.kernel.org/linux-arm-msm/20260605-assorted-fixes-june-v1-4-2caa04f7287c@oss.qualcomm.com --- drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 7 ++++--- drivers/gpu/drm/msm/adreno/a6xx_gpu_state.h | 12 ++++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c index 1daedb9ebfb9b..a3e5b02a800b7 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c @@ -1244,11 +1244,12 @@ static void a6xx_get_gmu_registers(struct msm_gpu *gpu, _a6xx_get_gmu_registers(gpu, a6xx_state, &a6xx_gmu_reglist[1], &a6xx_state->gmu_registers[1], true); - if (adreno_is_a621(adreno_gpu) || - adreno_is_a623(adreno_gpu) || - adreno_is_a663(adreno_gpu)) + if (adreno_is_a621(adreno_gpu)) _a6xx_get_gmu_registers(gpu, a6xx_state, &a621_gpucc_reg, &a6xx_state->gmu_registers[2], false); + else if (adreno_is_a623(adreno_gpu) || adreno_is_a663(adreno_gpu)) + _a6xx_get_gmu_registers(gpu, a6xx_state, &a623_gpucc_reg, + &a6xx_state->gmu_registers[2], false); else _a6xx_get_gmu_registers(gpu, a6xx_state, &a6xx_gpucc_reg, &a6xx_state->gmu_registers[2], false); diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.h index 4753b71837f33..e6fcae8d4bd34 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.h @@ -377,6 +377,17 @@ static const u32 a6xx_gmu_gpucc_registers[] = { }; static const u32 a621_gmu_gpucc_registers[] = { + /* GPU CC */ + 0x24000, 0x2400e, 0x24400, 0x2440e, 0x24800, 0x24805, 0x24c00, 0x24cff, + 0x25800, 0x25804, 0x25c00, 0x25c04, 0x26000, 0x26004, 0x26400, 0x26405, + 0x26414, 0x2641d, 0x2642a, 0x26430, 0x26432, 0x26432, 0x26441, 0x26455, + 0x26466, 0x26468, 0x26478, 
0x2647a, 0x26489, 0x2648a, 0x2649c, 0x2649e, + 0x264a0, 0x264a3, 0x264b3, 0x264b5, 0x264c5, 0x264c7, 0x264d6, 0x264d8, + 0x264e8, 0x264e9, 0x264f9, 0x264fc, 0x2650b, 0x2650c, 0x2651c, 0x2651e, + 0x26540, 0x26570, 0x26600, 0x26616, 0x26620, 0x2662d, +}; + +static const u32 a623_gmu_gpucc_registers[] = { /* GPU CC */ 0x24000, 0x2400e, 0x24400, 0x2440e, 0x25800, 0x25804, 0x25c00, 0x25c04, 0x26000, 0x26004, 0x26400, 0x26405, 0x26414, 0x2641d, 0x2642a, 0x26430, @@ -402,6 +413,7 @@ static const struct a6xx_registers a6xx_gmu_reglist[] = { static const struct a6xx_registers a6xx_gpucc_reg = REGS(a6xx_gmu_gpucc_registers, 0, 0); static const struct a6xx_registers a621_gpucc_reg = REGS(a621_gmu_gpucc_registers, 0, 0); +static const struct a6xx_registers a623_gpucc_reg = REGS(a623_gmu_gpucc_registers, 0, 0); static u32 a6xx_get_cp_roq_size(struct msm_gpu *gpu); static u32 a7xx_get_cp_roq_size(struct msm_gpu *gpu); From 158d54108f92b12b86b8102c8ab79c519a9606d3 Mon Sep 17 00:00:00 2001 From: Jie Zhang Date: Fri, 5 Jun 2026 01:38:21 +0530 Subject: [PATCH 5/6] FROMLIST: drm/msm/a6xx: Fix IRQ storm during msm_recovery test Once a hang is triggered by the msm_recovery test, the gpu error irq remains asserted and triggers an interrupt storm. In the worst case, this IRQ storm lands on the CPU core where the hangcheck timer is scheduled, blocking it from running. This eventually leads to CPU watchdog timeouts. To fix this, mask the gpu error irqs during the msm_recovery test and re-enable them during recovery.
Fixes: 5edf2750d998 ("drm/msm: Add debugfs to disable hw err handling") Signed-off-by: Veeresh Bagale Signed-off-by: Jie Zhang Signed-off-by: Akhil P Oommen Link: https://lore.kernel.org/linux-arm-msm/20260605-assorted-fixes-june-v1-5-2caa04f7287c@oss.qualcomm.com --- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 5 +++++ drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 5 ++++- drivers/gpu/drm/msm/msm_gpu.c | 2 ++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index 4a04dc43a8e67..0c9700cd33516 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -1281,6 +1281,11 @@ static irqreturn_t a5xx_irq(struct msm_gpu *gpu) status & ~A5XX_RBBM_INT_0_MASK_RBBM_AHB_ERROR); if (priv->disable_err_irq) { + /* Turn off interrupts to avoid interrupt storm */ + gpu_write(gpu, REG_A5XX_RBBM_INT_0_MASK, + A5XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS | + A5XX_RBBM_INT_0_MASK_CP_SW); + status &= A5XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS | A5XX_RBBM_INT_0_MASK_CP_SW; } diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 881c5fe64ea01..5822c4d2c6eac 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -1915,8 +1915,11 @@ static irqreturn_t a6xx_irq(struct msm_gpu *gpu) gpu_write(gpu, REG_A6XX_RBBM_INT_CLEAR_CMD, status); - if (priv->disable_err_irq) + if (priv->disable_err_irq) { + /* Turn off interrupts to avoid interrupt storm */ + gpu_write(gpu, REG_A6XX_RBBM_INT_0_MASK, A6XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS); status &= A6XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS; + } if (status & A6XX_RBBM_INT_0_MASK_RBBM_HANG_DETECT) a6xx_fault_detect_irq(gpu); diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index 66e2820ae3583..fbb1dd9830953 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -547,6 +547,8 @@ static void recover_worker(struct 
kthread_work *work) msm_update_fence(ring->fctx, fence); } + priv->disable_err_irq = false; + gpu->funcs->recover(gpu); /* retire completed submits, plus the one that hung: */ From 9c2fc0c2069bd918e890987560314bd39efffb8c Mon Sep 17 00:00:00 2001 From: Jie Zhang Date: Fri, 5 Jun 2026 01:38:22 +0530 Subject: [PATCH 6/6] FROMLIST: drm/msm: Fix task_struct reference leak in recover_worker get_pid_task() increments the task reference count, but the corresponding put_task_struct() was missing in the else branch, leaking a reference on every GPU hang recovery. Fixes: 25654a1756a4 ("drm/msm: Update global fault counter when faulty process has already ended") Signed-off-by: Veeresh Bagale Signed-off-by: Jie Zhang Signed-off-by: Akhil P Oommen Link: https://lore.kernel.org/linux-arm-msm/20260605-assorted-fixes-june-v1-6-2caa04f7287c@oss.qualcomm.com Signed-off-by: Veeresh Bagale --- drivers/gpu/drm/msm/msm_gpu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index fbb1dd9830953..86abff96bc111 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -504,6 +504,8 @@ static void recover_worker(struct kthread_work *work) */ if (!vm->managed) msm_gem_vm_unusable(submit->vm); + + put_task_struct(task); } get_comm_cmdline(submit, &comm, &cmd);