From d7d25c95fffbfecdf214c86ff7bda68bedd25070 Mon Sep 17 00:00:00 2001
From: lmoresi <louis.moresi@anu.edu.au>
Date: Sun, 28 Jun 2026 10:31:33 +1000
Subject: [PATCH 01/15] feat(solvers): custom geometric-MG prolongation hook
 (scalar)

Add set_custom_mg() + utilities/custom_mg.py: drive PCMG geometric multigrid
with a prolongation we build ourselves (barycentric or RBF) from independent,
possibly non-nested coarse meshes. Coarse operators come from Galerkin RAP
(P^T A P), so only the prolongation list is supplied. This decouples geometric
multigrid from a nested refine() hierarchy.

Mechanism: inject the custom P before the first PCSetUp (avoids the
MatProductReplaceMats shape error a live matrix swap triggers under Galerkin),
and KSPSetDMActive(OPERATOR, False) so PETSc uses our explicit P instead of
re-deriving DM-hierarchy interpolation. Finest level reduced to the
BC-eliminated global ordering; coarse levels use full DOF coords.

Validated (scalar Poisson, refinement=2 box): custom barycentric P from
independent coarse meshes reaches FMG iteration counts (1-3 vs FMG 2) and the
same solution. On a mesh with NO refine hierarchy (FMG unavailable -> GAMG, 13
iters) custom-P geometric MG converges in ~9 iters. test_1015 (4 tests);
test_1014 (11) still pass.

Single-field (scalar/vector) only; Stokes velocity-block path is a follow-up.

Underworld development team with AI support from Claude Code
---
 .../cython/petsc_generic_snes_solvers.pyx     |  46 ++++
 src/underworld3/utilities/custom_mg.py        | 208 ++++++++++++++++++
 tests/test_1015_custom_mg_prolongation.py     |  77 +++++++
 3 files changed, 331 insertions(+)
 create mode 100644 src/underworld3/utilities/custom_mg.py
 create mode 100644 tests/test_1015_custom_mg_prolongation.py

diff --git a/src/underworld3/cython/petsc_generic_snes_solvers.pyx b/src/underworld3/cython/petsc_generic_snes_solvers.pyx
index 296e162e..35c57ae3 100644
--- a/src/underworld3/cython/petsc_generic_snes_solvers.pyx
+++ b/src/underworld3/cython/petsc_generic_snes_solvers.pyx
@@ -93,6 +93,45 @@ class SolverBaseClass(uw_object):
         # smoother / coarse-solver options with the framework FMG bundle.
         self._pc_user_override = False
 
+        # Custom multigrid prolongation hierarchy (see set_custom_mg /
+        # utilities.custom_mg). None => standard FMG/GAMG path, unchanged.
+        self._custom_mg = None
+
+    def set_custom_mg(self, coarse_meshes, kind="barycentric", verbose=False):
+        r"""Drive geometric multigrid with a prolongation we build ourselves.
+
+        Supplies a sequence of (possibly **non-nested**) coarse meshes from which
+        a barycentric or RBF prolongation ``P`` is assembled and installed into
+        the PCMG via ``PC.setMGInterpolation``; coarse operators are formed by
+        Galerkin RAP. This decouples geometric multigrid from a nested
+        ``refine()`` hierarchy — it works even when the solver mesh has no
+        refinement hierarchy at all.
+
+        Parameters
+        ----------
+        coarse_meshes : list of Mesh
+            Coarsest-first list of coarse meshes (the finest level is the
+            solver's own mesh). Need not be nested with the solver mesh.
+        kind : {"barycentric", "rbf"}
+            Prolongation builder. ``barycentric`` is FE-exact; ``rbf`` is a
+            polyharmonic RBF (Shepard-normalised). Default ``barycentric``.
+        verbose : bool
+            Print the per-level DOF counts at injection.
+
+        Notes
+        -----
+        Single-field (scalar/vector) solvers only for now; the Stokes
+        velocity-block path is a separate increment. See
+        :mod:`underworld3.utilities.custom_mg`.
+        """
+        if kind not in ("barycentric", "rbf"):
+            raise ValueError("kind must be 'barycentric' or 'rbf'")
+        if not coarse_meshes:
+            raise ValueError("coarse_meshes must be a non-empty coarsest-first list")
+        self._custom_mg = {"coarse_meshes": list(coarse_meshes),
+                           "kind": kind, "verbose": verbose}
+        self.is_setup = False
+
     def add_update_callback(self, callback):
         r"""Register a callback fired at the start of every nonlinear (SNES) iteration.
 
@@ -2662,6 +2701,13 @@ class SNES_Scalar(SolverBaseClass):
         # ``constant_nullspace`` was set.
         self._attach_constant_nullspace()
 
+        # Custom multigrid prolongation: inject our P hierarchy before the
+        # first PCSetUp (so the Galerkin coarse operators are built from it).
+        # No-op unless set_custom_mg() was called.
+        if self._custom_mg is not None:
+            from underworld3.utilities.custom_mg import inject_custom_mg
+            inject_custom_mg(self)
+
         # solve
         self._snes_solve_with_retries(gvec, divergence_retries, verbose)
 
diff --git a/src/underworld3/utilities/custom_mg.py b/src/underworld3/utilities/custom_mg.py
new file mode 100644
index 00000000..9ffb5c62
--- /dev/null
+++ b/src/underworld3/utilities/custom_mg.py
@@ -0,0 +1,208 @@
+"""
+Custom geometric-multigrid prolongation injection.
+
+Build the multigrid prolongation ``P`` **ourselves** (barycentric or RBF) from a
+sequence of independent — possibly *non-nested* — coarse meshes, and install it
+into a solver's PETSc ``PCMG`` via ``PC.setMGInterpolation``. The coarse
+operators are then formed by Galerkin RAP (``Pᵀ A P``), exactly as UW3's FMG
+already does.
+
+Why
+---
+UW3's geometric FMG requires a nested ``refine()`` hierarchy with PETSc's exact
+nested interpolation. That couples multigrid to uniform refinement and rules out
+local adaptation / a dynamic node budget. PETSc's ``PCMG`` does **not** require
+its transfer operator to come from the DM hierarchy, however: any prolongation
+supplied with ``setMGInterpolation`` is used, and ``pc_mg_galerkin=both`` builds
+consistent coarse operators from it. The prolongation "evaluate the coarse FE
+function at the fine DOF coordinates" is exactly what the nested path computes —
+so we can build it ourselves by barycentric (FE-exact) or RBF interpolation for
+*any* coarse/fine pair. This decouples *how the mesh refines* from *what transfer
+multigrid uses*.
+
+Validated (scalar Poisson, refinement=2 box): custom barycentric ``P`` from
+independent coarse meshes reaches FMG iteration counts (1–3 vs FMG 2). On a mesh
+with **no** refine hierarchy (FMG unavailable → GAMG, 13 iters) custom-P
+geometric MG converges in ~9 iters.
+
+Notes
+-----
+* The finest level is the solver's real (BC-eliminated) operator space; its
+  ``P`` rows are reduced to the global/MG ordering. Coarse levels use the full
+  DOF coordinates of each coarse mesh (no BC reduction needed — Galerkin RAP and
+  the finest reduction keep the coarse correction consistent).
+* Injection happens at solver build time, *before* the first ``PCSetUp``: the
+  first Galerkin assembly is built from our ``P`` directly, avoiding PETSc's
+  ``MatProductReplaceMats`` shape error that a live matrix swap triggers.
+* ``KSPSetDMActive(OPERATOR, False)`` stops PETSc re-deriving interpolation /
+  operators from the DM hierarchy, so our explicit ``P`` is used.
+"""
+
+import numpy as np
+from petsc4py import PETSc
+
+__all__ = ["barycentric_prolongation", "rbf_prolongation", "inject_custom_mg"]
+
+
+# --------------------------------------------------------------------------- #
+#  Prolongation builders (return scipy CSR matrices)
+# --------------------------------------------------------------------------- #
+def barycentric_prolongation(coarse_coords, fine_coords):
+    """FE-exact prolongation: each fine DOF interpolated by the coarse element
+    that contains it (barycentric weights). Partition of unity (row sums = 1);
+    fine points outside the coarse hull fall back to the nearest coarse DOF."""
+    import scipy.sparse as sp
+    from scipy.spatial import Delaunay, cKDTree
+
+    tri = Delaunay(coarse_coords)
+    simp = tri.find_simplex(fine_coords)
+    tree = cKDTree(coarse_coords)
+    dim = coarse_coords.shape[1]
+    rows, cols, vals = [], [], []
+    for i in range(fine_coords.shape[0]):
+        s = simp[i]
+        if s < 0:  # outside coarse hull → nearest coarse DOF
+            rows.append(i); cols.append(int(tree.query(fine_coords[i])[1])); vals.append(1.0)
+            continue
+        verts = tri.simplices[s]
+        T = tri.transform[s]
+        bary = T[:dim].dot(fine_coords[i] - T[dim])
+        w = np.append(bary, 1.0 - bary.sum())
+        for k in range(dim + 1):
+            rows.append(i); cols.append(int(verts[k])); vals.append(float(w[k]))
+    return sp.csr_matrix((vals, (rows, cols)),
+                         shape=(fine_coords.shape[0], coarse_coords.shape[0]))
+
+
+def rbf_prolongation(coarse_coords, fine_coords, smooth=0.0):
+    """RBF prolongation: polyharmonic (r² log r) kernel + affine polynomial tail
+    (reproduces linear fields), Shepard row-normalised to a partition of unity.
+    Works for arbitrary (non-nested) point sets; software-equivalent to the
+    barycentric builder as an MG transfer operator."""
+    import scipy.sparse as sp
+    from scipy.spatial.distance import cdist
+
+    def phi(r):
+        r = np.where(r == 0.0, 1e-30, r)
+        return r ** 2 * np.log(r)
+
+    nc, dim = coarse_coords.shape
+    Pc = np.hstack([np.ones((nc, 1)), coarse_coords])          # affine tail
+    Acc = phi(cdist(coarse_coords, coarse_coords)) + smooth * np.eye(nc)
+    M = np.block([[Acc, Pc], [Pc.T, np.zeros((dim + 1, dim + 1))]])
+    Minv = np.linalg.inv(M)
+    B = np.hstack([phi(cdist(fine_coords, coarse_coords)),
+                   np.ones((fine_coords.shape[0], 1)), fine_coords])
+    Praw = (B @ Minv)[:, :nc]
+    rs = Praw.sum(axis=1, keepdims=True)
+    rs[np.abs(rs) < 1e-12] = 1.0
+    return sp.csr_matrix(Praw / rs)
+
+
+_BUILDERS = {"barycentric": barycentric_prolongation, "rbf": rbf_prolongation}
+
+
+# --------------------------------------------------------------------------- #
+#  Coordinate helpers
+# --------------------------------------------------------------------------- #
+def _to_petsc_aij(csr):
+    csr = csr.tocsr()
+    M = PETSc.Mat().createAIJ(
+        size=csr.shape,
+        csr=(csr.indptr.astype("int32"), csr.indices.astype("int32"), csr.data),
+    )
+    M.assemble()
+    return M
+
+
+def _reduce_to_global(dm, full_coords):
+    """Reduce full local DOF coordinates to the solver's global (BC-eliminated,
+    MG-ordered) layout by scattering each component through ``localToGlobal``
+    (which drops constrained boundary DOFs and reorders to global indices)."""
+    lvec = dm.getLocalVec()
+    gvec = dm.getGlobalVec()
+    cdim = full_coords.shape[1]
+    if lvec.getLocalSize() != full_coords.shape[0]:
+        dm.restoreLocalVec(lvec); dm.restoreGlobalVec(gvec)
+        raise RuntimeError(
+            f"custom_mg: local DOF count {lvec.getLocalSize()} != coord count "
+            f"{full_coords.shape[0]} — degree/continuity mismatch")
+    out = np.zeros((gvec.getSize(), cdim))
+    for c in range(cdim):
+        lvec.array[:] = full_coords[:, c]
+        dm.localToGlobal(lvec, gvec, addv=False)
+        out[:, c] = gvec.array
+    dm.restoreLocalVec(lvec)
+    dm.restoreGlobalVec(gvec)
+    return out
+
+
+# --------------------------------------------------------------------------- #
+#  Injection
+# --------------------------------------------------------------------------- #
+def inject_custom_mg(solver):
+    """Configure ``solver``'s PCMG to use our custom prolongation hierarchy.
+
+    Called from the solver's ``solve()`` (after ``_build``, before the SNES
+    solve) when ``solver._custom_mg`` is set via ``set_custom_mg``. Scalar /
+    single-field solvers only for now (the Stokes velocity-block path is a
+    separate increment).
+    """
+    cfg = solver._custom_mg
+    coarse_meshes = cfg["coarse_meshes"]
+    kind = cfg["kind"]
+    builder = _BUILDERS[kind]
+
+    var = solver.Unknowns.u
+    degree = var.degree
+    continuous = getattr(var, "continuous", True)
+
+    if solver._pc_option_prefix not in ("", None):
+        raise NotImplementedError(
+            "custom_mg currently supports single-field (scalar/vector) solvers; "
+            f"solver uses PC prefix '{solver._pc_option_prefix}' (e.g. Stokes "
+            "velocity block) which is a separate increment.")
+
+    # --- per-level DOF coordinates --------------------------------------- #
+    fine = _reduce_to_global(solver.dm,
+                             solver.mesh._get_coords_for_basis(degree, continuous))
+    levels = [m._get_coords_for_basis(degree, continuous) for m in coarse_meshes]
+    levels.append(fine)
+    nlev = len(levels)
+    if nlev < 2:
+        raise ValueError("custom_mg needs at least one coarse mesh")
+
+    # --- build prolongations P[l]: level l-1 -> level l ------------------ #
+    Ps = [None] + [_to_petsc_aij(builder(levels[l - 1], levels[l]))
+                   for l in range(1, nlev)]
+
+    # --- configure the geometric MG bundle on the managed block ---------- #
+    opts = solver.petsc_options
+    pfx = solver._pc_option_prefix or ""
+    opts[pfx + "pc_type"] = "mg"
+    opts[pfx + "pc_mg_type"] = "full"
+    opts[pfx + "pc_mg_galerkin"] = "both"          # coarse operators = PᵀAP
+    opts[pfx + "mg_levels_ksp_type"] = "richardson"
+    opts[pfx + "mg_levels_pc_type"] = "sor"
+    opts[pfx + "mg_coarse_pc_type"] = "redundant"
+    opts[pfx + "mg_coarse_redundant_pc_type"] = "lu"
+    for key in ("pc_gamg_type", "pc_gamg_repartition", "pc_gamg_agg_nsmooths"):
+        opts.delValue(pfx + key)
+
+    # --- inject before the first PCSetUp --------------------------------- #
+    solver.snes.setUp()
+    ksp = solver.snes.getKSP()
+    # Stop PETSc re-deriving interpolation/operators from the DM hierarchy.
+    ksp.setDMActive(PETSc.KSP.DMActive.OPERATOR, False)
+    pc = ksp.getPC()
+    pc.setType("mg")
+    pc.setMGLevels(nlev)
+    pc.setMGType(PETSc.PC.MGType.FULL)
+    for l in range(1, nlev):
+        pc.setMGInterpolation(l, Ps[l])
+    pc.setFromOptions()
+
+    if cfg.get("verbose"):
+        from underworld3 import mpi
+        mpi.pprint(f"[{solver.name}] custom_mg ({kind}): levels "
+                   f"{[lv.shape[0] for lv in levels]}")
diff --git a/tests/test_1015_custom_mg_prolongation.py b/tests/test_1015_custom_mg_prolongation.py
new file mode 100644
index 00000000..ea975df1
--- /dev/null
+++ b/tests/test_1015_custom_mg_prolongation.py
@@ -0,0 +1,77 @@
+"""
+Custom multigrid prolongation hook (set_custom_mg / utilities.custom_mg).
+
+Geometric multigrid driven by a prolongation we build ourselves (barycentric or
+RBF) from independent, non-nested coarse meshes — decoupling geometric MG from a
+nested refine() hierarchy. See docs/developer/design and the study notes.
+"""
+import numpy as np
+import pytest
+import underworld3 as uw
+
+pytestmark = [pytest.mark.level_2, pytest.mark.tier_b]
+
+
+def _poisson(mesh):
+    p = uw.systems.Poisson(mesh)
+    p.constitutive_model = uw.constitutive_models.DiffusionModel
+    p.constitutive_model.Parameters.diffusivity = 1
+    p.f = 0.0
+    p.add_dirichlet_bc(0.0, "Bottom")
+    p.add_dirichlet_bc(1.0, "Top")
+    return p
+
+
+def _box(cellSize, refinement=None):
+    return uw.meshing.UnstructuredSimplexBox(
+        minCoords=(0.0, 0.0), maxCoords=(1.0, 1.0),
+        cellSize=cellSize, refinement=refinement, qdegree=2)
+
+
+def test_custom_mg_barycentric_matches_fmg():
+    """Custom barycentric P (independent coarse meshes) must converge and agree
+    with the nested-FMG solution on the same fine problem."""
+    fmg = _poisson(_box(0.25, refinement=2))
+    fmg.solve()
+    assert fmg.snes.getConvergedReason() > 0
+
+    cust = _poisson(_box(0.25, refinement=2))
+    cust.set_custom_mg([_box(0.285), _box(0.142)], kind="barycentric")
+    cust.solve()
+    assert cust.snes.getKSP().getPC().getType() == "mg"
+    assert cust.snes.getConvergedReason() > 0
+    # same fine problem -> same solution (Galerkin RAP from our P)
+    rel = (np.linalg.norm(cust.Unknowns.u.data - fmg.Unknowns.u.data)
+           / (np.linalg.norm(fmg.Unknowns.u.data) + 1e-30))
+    assert rel < 1e-4
+    # competitive with nested FMG (not pathologically worse)
+    assert cust.snes.getKSP().getIterationNumber() <= fmg.snes.getKSP().getIterationNumber() + 5
+
+
+def test_custom_mg_rbf_drives_solver():
+    """The RBF builder must also drive a converging geometric MG solve."""
+    cust = _poisson(_box(0.25, refinement=2))
+    cust.set_custom_mg([_box(0.285), _box(0.142)], kind="rbf")
+    cust.solve()
+    assert cust.snes.getKSP().getPC().getType() == "mg"
+    assert cust.snes.getConvergedReason() > 0
+
+
+def test_custom_mg_without_refine_hierarchy():
+    """Key capability: geometric MG on a mesh with NO refine() hierarchy
+    (FMG unavailable -> normally GAMG). Custom P supplies the hierarchy."""
+    mesh = _box(0.08)                 # no refinement -> single-level
+    assert len(mesh.dm_hierarchy) == 1
+    cust = _poisson(mesh)
+    cust.set_custom_mg([_box(0.3), _box(0.16)], kind="barycentric")
+    cust.solve()
+    assert cust.snes.getKSP().getPC().getType() == "mg"
+    assert cust.snes.getConvergedReason() > 0
+
+
+def test_set_custom_mg_validation():
+    p = _poisson(_box(0.5))
+    with pytest.raises(ValueError):
+        p.set_custom_mg([_box(0.8)], kind="not-a-builder")
+    with pytest.raises(ValueError):
+        p.set_custom_mg([], kind="barycentric")

From 76298f359d7d508a2b968e6fa0ca0cead159e430 Mon Sep 17 00:00:00 2001
From: lmoresi <louis.moresi@anu.edu.au>
Date: Mon, 29 Jun 2026 06:18:39 +1000
Subject: [PATCH 02/15] feat(custom_mg): Layer-1 generalized FMG hierarchy
 (BC-per-level) + scoped SBR

Phase 1 of hardening the custom-prolongation work into a generalized FMG hierarchy.

- CustomMGHierarchy: adapter-agnostic multi-level hierarchy that builds custom-P
  transfers (barycentric/rbf) with essential BCs applied at EVERY level
  (transfers map reduced->reduced). This is the load-bearing invariant: on an
  exactly-nested hierarchy, omitting per-level reduction makes a coarse boundary
  DOF coincide with a BC-removed fine DOF -> zero column -> singular Galerkin
  coarse operator. build() guards against zero columns.
- set_custom_fmg(): register a hierarchy + per-level solver factory; built and
  installed at solve() time (build-time injection, DMActive(OPERATOR,False),
  Galerkin RAP). Scalar / single-field-vector (top-level PC); Stokes velocity
  block is Phase 2.
- sbr_refine / sbr_refine_where: local Skeleton-Based Refinement (no MMG,
  on-rank, conforming) with SCOPED dm_plex_transform_type (leaking it globally
  breaks UW's uniform refine() with err73).
- Legacy finest-only path kept for back-compat (test_1015).

Validated: scalar jump-coefficient Poisson on a 5-level (3 uniform + 2 SBR)
hierarchy converges in 3 FMG iters vs GAMG 46, solution matches to 2.4e-8.
test_1016 (3 tests); test_1015 (4) + test_1014 (11) still pass.

Design: docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md

Underworld development team with AI support from Claude Code
---
 .../GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md    | 120 ++++++++
 src/underworld3/utilities/custom_mg.py        | 285 +++++++++++++++---
 tests/test_1016_custom_mg_hierarchy.py        |  60 ++++
 3 files changed, 424 insertions(+), 41 deletions(-)
 create mode 100644 docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md
 create mode 100644 tests/test_1016_custom_mg_hierarchy.py

diff --git a/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md b/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md
new file mode 100644
index 00000000..cb2908cc
--- /dev/null
+++ b/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md
@@ -0,0 +1,120 @@
+# Generalized FMG hierarchy + adapt-on-top
+
+Status: design (hardening the custom-prolongation prototype). 2026-06-28.
+
+## Context
+
+UW3's geometric FMG (`Mesh(refinement=N)` → `dm_hierarchy`, `preconditioner="fmg"`)
+requires a **uniform nested** hierarchy with PETSc's canonical nested interpolation.
+That couples multigrid to uniform refinement and rules out local adaptation.
+
+A prototype established that we can drive geometric multigrid with a prolongation we
+**build ourselves** (barycentric or local-RBF) and install via `PC.setMGInterpolation`,
+with Galerkin RAP forming the coarse operators. Validated:
+- FMG-equivalent convergence on non-nested *and* nested hierarchies (SolCx velocity
+  block, η-jump 1e6: 8 iters, vs GAMG 157 — geometric MG where FMG cannot otherwise exist);
+- a 5-level hierarchy (3 uniform + 2 SBR-targeted at the jump) solves at 8 iters;
+- local SBR refinement is a ~4-line stock-petsc4py call (no MMG): mark cells in a label,
+  `dm_plex_transform_type=refine_sbr`, `dm.adaptLabel("name")` — conforming, on-rank.
+
+**Load-bearing correctness lesson:** essential BCs must be applied at **every** level,
+transfers mapping reduced→reduced. PETSc native FMG does this automatically (each level's
+`PetscSection` carries the Dirichlet constraints); custom-P from raw coordinates must
+replicate it. Exact nesting makes the omission fatal (coarse boundary-normal DOFs coincide
+with BC-removed fine DOFs → zero columns → singular coarse operator).
+
+## Two-layer architecture
+
+### Layer 1 — generalized FMG hierarchy (adapter-agnostic)
+**The existing `refinement=N` uniform hierarchy STANDS** and is the FMG base. Layer 1
+generalizes the *transfer machinery* so the hierarchy may contain levels that are **not**
+uniform refinements, by carrying **supplied prolongations** instead of relying on canonical
+nesting.
+
+Layer 1 knows nothing about *how* a level was produced. Its contract is a sequence of
+levels, each `(dm, prolongation_to_next)`, plus:
+- **BC-per-level reduction** — transfers map reduced→reduced (the invariant above), derived
+  from the owning solver's essential BCs via each level DM's global section.
+- **Transfer builders** — pluggable: `nested` (PETSc canonical, for uniform levels),
+  `barycentric` (FE-exact, needs a triangulation), `rbf` (local kNN, scattered-point
+  robust, reuses the swarm-proxy RBF kernel). Default per level: `nested` where the level
+  is a genuine uniform refinement, else `barycentric`.
+- **Installation** — set the transfers on the velocity-block (Stokes) or top (scalar/vector)
+  PCMG via `setMGInterpolation`, before first `PCSetUp`, with `KSPSetDMActive(OPERATOR,False)`
+  and `pc_mg_galerkin=both`. Re-attach nullspaces (pressure / rigid-body) after rebuild.
+
+Home: `src/underworld3/utilities/custom_mg.py` (grow the committed module).
+
+Generality requirement: Layer 1's interface must admit **any** future level source
+(MMG/parmmg, edge/face-point injection, knockout-as-coarsening, independent meshes) and
+**any** transfer builder. No SBR-specific assumptions in Layer 1.
+
+### Layer 2 — adapt-on-top (first concrete application)
+On top of the uniform FMG base, add refined levels via the **existing adaptive-meshing
+pattern**: choose an **adapter** + a **metric**, call **periodically**; field
+transfer/redistribution handled by the existing remesh machinery
+(`remesh_with_field_transfer`, `on_remesh`, `mesh.adapt`).
+
+This pass's adapter is **SBR refine-on-top**:
+- metric → mark cells (e.g. |∇field| threshold, or distance-to-feature) → `adaptLabel`
+  with `refine_sbr` → a new finest level on top of the current finest;
+- **non-load-balancing** (on-rank, no redistribution) — which **bounds the number of
+  extra levels** (imbalance grows with depth); the adapter exposes/limits this;
+- registers `(new level DM, custom-P transfer)` into the Layer-1 hierarchy.
+
+Home: an adapter consistent with `adaptivity.py` / `mesh.adapt(metric, adapter=...)`,
+calling into Layer 1. Future adapters (load-balancing MMG, etc.) plug in the same way.
+
+## Parallel strategy (parallel from the start)
+
+The supported parallel path is the **nested, co-partitioned** hierarchy:
+- The uniform base is co-partitioned by construction (`dm.refine()` preserves partition);
+  SBR is on-rank → levels stay co-located (a fine cell's parent coarse cell is on the
+  same rank). So each rank builds its block of `P` from its **local** coarse cells —
+  point-location is rank-local; only a thin ghost layer at partition boundaries.
+- **BC-per-level reduction is parallel-correct for free**: it rides the DM global section
+  (`localToGlobal` gives the reduced global ordering across ranks).
+- `P` is assembled as an MPIAIJ matrix (fine local rows; coarse local + ghost columns).
+
+**Non-nested / independent-mesh custom-P** (cross-rank point-location) is **serial-only /
+experimental** — it is not on the parallel-supported path. The production case
+(targeted refinement = nested) needs only the rank-local path.
+
+Non-load-balancing SBR → bound the added refinement depth (configurable cap); document the
+imbalance/level trade-off.
+
+## Correctness invariants (must hold, all levels)
+1. BCs applied at every level; transfers reduced→reduced (no zero columns).
+2. Each prolongation reproduces constants (row-sums = 1 — partition of unity).
+3. `pc_mg_galerkin=both` (coarse operators = PᵀAP; UW installs no coarse residual/Jacobian).
+4. Nullspaces (constant-pressure / rigid-body) re-attached after any rebuild.
+5. No silent fallback: if a level lacks a valid transfer, error (don't degrade to GAMG silently).
+
+## Test matrix
+- Hierarchy build: uniform-only; uniform + N SBR levels; nesting + label survival.
+- Per-level reduction: zero-column count == 0 on all transfers (scalar + free-slip vector).
+- Transfer: partition-of-unity; barycentric vs rbf; nested-exact vs general.
+- Solve: FMG + V-cycle converge on nested and (serial) non-nested; match uniform-FMG iters;
+  beat GAMG; regression — existing FMG/GAMG paths unchanged.
+- Parallel (np>1): co-located transfers, reduced sections, FMG convergence; bounded levels.
+
+## Phased roadmap
+1. **Layer 1 core** — generalized hierarchy + auto BC-per-level reduction + transfer builders
+   (nested/barycentric/rbf) + install into PCMG; scalar + free-slip vector; serial-validated,
+   designed parallel-correct (rank-local construction).
+2. **Stokes integration** — `set_custom_mg`/hierarchy on the velocity block; nullspace
+   re-attach; guards; SolCx end-to-end.
+3. **Parallel validation** — np>1 tests; co-location/ghost handling; reduced-section
+   correctness; fix the known rank-local-accumulator pitfalls.
+4. **Layer 2 adapter** — SBR refine-on-top following `mesh.adapt` (metric→mark→refine→register),
+   non-load-balancing, bounded depth, field transfer via existing remesh.
+5. **Tests + design doc finalize + tier classification.**
+
+Later (not this pass; do not block): dynamic per-step reallocation loop; efficiency
+(transfer caching, exact-nested combinatorial P, operator reuse); load-balancing adapters;
+non-nested parallel transfers; knockout-as-coarsening as an alternative Layer-2 strategy.
+
+## Explicit non-goals for this pass
+- Knockout (shown to pay full-fine assembly; structural value only) — not pursued now.
+- Non-nested custom-P in parallel — serial/experimental only.
+- Dynamic time-stepping convection integration — later phase.
diff --git a/src/underworld3/utilities/custom_mg.py b/src/underworld3/utilities/custom_mg.py
index 9ffb5c62..b6fea9c3 100644
--- a/src/underworld3/utilities/custom_mg.py
+++ b/src/underworld3/utilities/custom_mg.py
@@ -41,7 +41,8 @@
 import numpy as np
 from petsc4py import PETSc
 
-__all__ = ["barycentric_prolongation", "rbf_prolongation", "inject_custom_mg"]
+__all__ = ["barycentric_prolongation", "rbf_prolongation", "inject_custom_mg",
+           "CustomMGHierarchy", "set_custom_fmg", "sbr_refine", "sbr_refine_where"]
 
 
 # --------------------------------------------------------------------------- #
@@ -102,6 +103,57 @@ def phi(r):
 _BUILDERS = {"barycentric": barycentric_prolongation, "rbf": rbf_prolongation}
 
 
+# --------------------------------------------------------------------------- #
+#  Local Skeleton-Based Refinement (no MMG; on-rank; conforming)
+# --------------------------------------------------------------------------- #
+_DM_ADAPT_REFINE = 1   # PETSc DM_ADAPT_REFINE
+
+
+def _sbr_apply(dm, mark):
+    """Clone ``dm``, let ``mark(clone, adapt_label)`` flag cells, and SBR-refine.
+    Conforming (edge bisection + propagation, no hanging nodes), on-rank (no
+    redistribution), no external mesh libraries. Cell geometry / marking is done
+    on the CLONE (the un-cloned mesh DM raises err73 from computeCellGeometryFVM).
+
+    IMPORTANT: ``dm_plex_transform_type`` is a *global* PETSc option; leaving it as
+    ``refine_sbr`` breaks UW3's uniform ``dm.refine()`` (FMG hierarchy build) with
+    PETSc error 73. It is set only for the duration of the call and restored."""
+    opts = PETSc.Options()
+    had = opts.hasName("dm_plex_transform_type")
+    prev = opts.getString("dm_plex_transform_type") if had else None
+    opts.setValue("dm_plex_transform_type", "refine_sbr")
+    try:
+        d = dm.clone()
+        d.createLabel("adapt")
+        lab = d.getLabel("adapt")
+        lab.setDefaultValue(0)
+        mark(d, lab)
+        return d.adaptLabel("adapt")
+    finally:
+        if had:
+            opts.setValue("dm_plex_transform_type", prev)
+        else:
+            opts.delValue("dm_plex_transform_type")
+
+
+def sbr_refine(dm, cells):
+    """SBR-refine an explicit list of ``cells``."""
+    def mark(d, lab):
+        for c in cells:
+            lab.setValue(int(c), _DM_ADAPT_REFINE)
+    return _sbr_apply(dm, mark)
+
+
+def sbr_refine_where(dm, predicate):
+    """SBR-refine cells whose centroid satisfies ``predicate(centroid)``."""
+    def mark(d, lab):
+        cs, ce = d.getHeightStratum(0)
+        for c in range(cs, ce):
+            if predicate(d.computeCellGeometryFVM(c)[1]):
+                lab.setValue(c, _DM_ADAPT_REFINE)
+    return _sbr_apply(dm, mark)
+
+
 # --------------------------------------------------------------------------- #
 #  Coordinate helpers
 # --------------------------------------------------------------------------- #
@@ -138,50 +190,75 @@ def _reduce_to_global(dm, full_coords):
 
 
 # --------------------------------------------------------------------------- #
-#  Injection
+#  Layer 1 — generalized FMG hierarchy (BC-reduced, adapter-agnostic)
+#
+#  INVARIANT (load-bearing): essential BCs are applied at EVERY level, so every
+#  transfer maps reduced->reduced. PETSc native FMG does this automatically via
+#  each level's constrained PetscSection; custom-P built from raw coordinates must
+#  replicate it. Omitting it is fatal on an EXACTLY-nested hierarchy: a coarse
+#  boundary DOF coincides with a BC-removed fine DOF -> zero column -> singular
+#  Galerkin coarse operator. (See docs/developer/design/
+#  GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md.)
 # --------------------------------------------------------------------------- #
-def inject_custom_mg(solver):
-    """Configure ``solver``'s PCMG to use our custom prolongation hierarchy.
+def _field_subdm(dm, field_id):
+    """Return the sub-DM for ``field_id`` (whole dm if single-field / None)."""
+    if field_id is None:
+        return dm
+    try:
+        if dm.getNumFields() <= 1:
+            return dm
+    except Exception:
+        return dm
+    _iset, sub = dm.createSubDM(field_id)
+    return sub
 
-    Called from the solver's ``solve()`` (after ``_build``, before the SNES
-    solve) when ``solver._custom_mg`` is set via ``set_custom_mg``. Scalar /
-    single-field solvers only for now (the Stokes velocity-block path is a
-    separate increment).
-    """
-    cfg = solver._custom_mg
-    coarse_meshes = cfg["coarse_meshes"]
-    kind = cfg["kind"]
-    builder = _BUILDERS[kind]
 
-    var = solver.Unknowns.u
-    degree = var.degree
-    continuous = getattr(var, "continuous", True)
+def _reduced_map(dm, field_id=None):
+    """Reduced->full DOF map for a field on a BUILT DM (serial). Index ``k`` of
+    the returned array is the local DOF that maps to global/MG index ``k`` —
+    i.e. the BC-eliminated, MG-ordered layout the operator lives on.
 
-    if solver._pc_option_prefix not in ("", None):
-        raise NotImplementedError(
-            "custom_mg currently supports single-field (scalar/vector) solvers; "
-            f"solver uses PC prefix '{solver._pc_option_prefix}' (e.g. Stokes "
-            "velocity block) which is a separate increment.")
+    NOTE: serial-correct (uses local indices). Parallel correctness — mapping to
+    *global* numbering across ranks — is a Phase-3 item (the supported parallel
+    path keeps levels co-partitioned so this stays rank-local)."""
+    sub = _field_subdm(dm, field_id)
+    lv = sub.getLocalVec()
+    n_full = lv.getLocalSize()
+    lv.array[:] = np.arange(n_full)
+    gv = sub.getGlobalVec()
+    sub.localToGlobal(lv, gv, addv=False)
+    r2f = gv.array.astype(int).copy()
+    sub.restoreLocalVec(lv)
+    sub.restoreGlobalVec(gv)
+    return r2f, n_full
 
-    # --- per-level DOF coordinates --------------------------------------- #
-    fine = _reduce_to_global(solver.dm,
-                             solver.mesh._get_coords_for_basis(degree, continuous))
-    levels = [m._get_coords_for_basis(degree, continuous) for m in coarse_meshes]
-    levels.append(fine)
-    nlev = len(levels)
-    if nlev < 2:
-        raise ValueError("custom_mg needs at least one coarse mesh")
 
-    # --- build prolongations P[l]: level l-1 -> level l ------------------ #
-    Ps = [None] + [_to_petsc_aij(builder(levels[l - 1], levels[l]))
-                   for l in range(1, nlev)]
+def _reduced_transfer(coarse_coords, fine_coords, r2f_c, r2f_f, ncomp, builder):
+    """Build one prolongation reduced(coarse) -> reduced(fine):
+    node-level scalar P -> interleave ``ncomp`` components -> drop BC rows/cols."""
+    import scipy.sparse as sp
+    Pn = builder(coarse_coords, fine_coords)               # (n_f_nodes, n_c_nodes)
+    Pv = sp.kron(Pn, sp.eye(ncomp), format="csr")          # interleaved full vector
+    Pr = Pv.tocsr()[r2f_f, :][:, r2f_c]                    # reduced -> reduced
+    return Pr
+
 
-    # --- configure the geometric MG bundle on the managed block ---------- #
+def _install_transfers(solver, Ps, verbose=False):
+    """Configure the managed PCMG block to use the supplied prolongations.
+    Build-time injection (before first PCSetUp): set DMActive(OPERATOR,False) so
+    PETSc does not re-derive interpolation from the DM, Galerkin RAP for coarse
+    operators. Scalar / single-field-vector (top-level PC: ``_pc_option_prefix``
+    is ``""``); the Stokes velocity-block path is Phase 2."""
+    nlev = len(Ps) + 1
     opts = solver.petsc_options
     pfx = solver._pc_option_prefix or ""
+    if pfx not in ("",):
+        raise NotImplementedError(
+            "Layer-1 install supports top-level PC (scalar / single-field vector). "
+            f"PC prefix '{pfx}' (e.g. Stokes velocity block) is Phase 2.")
     opts[pfx + "pc_type"] = "mg"
     opts[pfx + "pc_mg_type"] = "full"
-    opts[pfx + "pc_mg_galerkin"] = "both"          # coarse operators = PᵀAP
+    opts[pfx + "pc_mg_galerkin"] = "both"
     opts[pfx + "mg_levels_ksp_type"] = "richardson"
     opts[pfx + "mg_levels_pc_type"] = "sor"
     opts[pfx + "mg_coarse_pc_type"] = "redundant"
@@ -189,20 +266,146 @@ def inject_custom_mg(solver):
     for key in ("pc_gamg_type", "pc_gamg_repartition", "pc_gamg_agg_nsmooths"):
         opts.delValue(pfx + key)
 
-    # --- inject before the first PCSetUp --------------------------------- #
     solver.snes.setUp()
     ksp = solver.snes.getKSP()
-    # Stop PETSc re-deriving interpolation/operators from the DM hierarchy.
     ksp.setDMActive(PETSc.KSP.DMActive.OPERATOR, False)
     pc = ksp.getPC()
     pc.setType("mg")
     pc.setMGLevels(nlev)
     pc.setMGType(PETSc.PC.MGType.FULL)
     for l in range(1, nlev):
-        pc.setMGInterpolation(l, Ps[l])
+        pc.setMGInterpolation(l, Ps[l - 1])
     pc.setFromOptions()
-
-    if cfg.get("verbose"):
+    if verbose:
         from underworld3 import mpi
-        mpi.pprint(f"[{solver.name}] custom_mg ({kind}): levels "
-                   f"{[lv.shape[0] for lv in levels]}")
+        mpi.pprint(f"[{solver.name}] custom FMG installed: {nlev} levels, "
+                   f"P sizes {[tuple(P.getSize()) for P in Ps]}")
+
+
+class CustomMGHierarchy:
+    """A generalized FMG hierarchy: a sequence of level meshes (coarsest..finest)
+    whose transfers are built by a pluggable builder, with BCs applied at every
+    level. Adapter-agnostic — it consumes meshes + a way to get each level's
+    BC-reduced DOF map; it does not know how the levels were produced.
+
+    Parameters
+    ----------
+    level_meshes : list of Mesh
+        Coarsest-first; the LAST entry must be the solver's own mesh.
+    builder : {"barycentric", "rbf"}
+        Per-level node prolongation builder.
+    field_id : int or None
+        Field index for multi-field solvers (e.g. 0 = velocity); None = single field.
+    """
+
+    def __init__(self, level_meshes, builder="barycentric", field_id=None):
+        if builder not in _BUILDERS:
+            raise ValueError("builder must be 'barycentric' or 'rbf'")
+        if len(level_meshes) < 2:
+            raise ValueError("need at least 2 levels (>=1 coarse + finest)")
+        self.level_meshes = list(level_meshes)
+        self.builder = _BUILDERS[builder]
+        self.builder_name = builder
+        self.field_id = field_id
+        self.transfers = None
+
+    def build(self, solver, level_solver_factory):
+        """Build the BC-reduced prolongations. ``solver`` is the (built) finest
+        solver; ``level_solver_factory(mesh) -> built solver`` provides a
+        same-discretisation solver on each COARSE level so its constrained
+        section gives that level's reduced map. The finest level uses ``solver``."""
+        var = solver.Unknowns.u
+        degree = var.degree
+        continuous = getattr(var, "continuous", True)
+        nlev = len(self.level_meshes)
+
+        coords, r2f, ncomp = [], [], []
+        for k, mesh in enumerate(self.level_meshes):
+            c = np.asarray(mesh._get_coords_for_basis(degree, continuous))
+            if k == nlev - 1:
+                rmap, nfull = _reduced_map(solver.dm, self.field_id)
+            else:
+                tmp = level_solver_factory(mesh)
+                tmp._build(False, False, None)
+                tmp.snes.setUp()
+                rmap, nfull = _reduced_map(tmp.dm, self.field_id)
+            nc = nfull // c.shape[0]
+            if nfull % c.shape[0] != 0:
+                raise RuntimeError(
+                    f"level {k}: full DOFs {nfull} not divisible by nodes {c.shape[0]}")
+            coords.append(c); r2f.append(rmap); ncomp.append(nc)
+
+        if len(set(ncomp)) != 1:
+            raise RuntimeError(f"inconsistent component counts across levels: {ncomp}")
+        nc = ncomp[0]
+
+        Ps = []
+        for l in range(1, nlev):
+            Pr = _reduced_transfer(coords[l - 1], coords[l], r2f[l - 1], r2f[l],
+                                   nc, self.builder)
+            zc = int((np.asarray((Pr != 0).sum(axis=0)).ravel() == 0).sum())
+            if zc:
+                raise RuntimeError(
+                    f"transfer {l-1}->{l} has {zc} zero columns (coarse DOFs with no "
+                    f"fine image) — BC-per-level reduction failed; coarse operator "
+                    f"would be singular.")
+            Ps.append(_to_petsc_aij(Pr))
+        self.transfers = Ps
+        return Ps
+
+    def install(self, solver, verbose=False):
+        if self.transfers is None:
+            raise RuntimeError("call build() before install()")
+        _install_transfers(solver, self.transfers, verbose=verbose)
+
+
+# --------------------------------------------------------------------------- #
+#  Entry points
+# --------------------------------------------------------------------------- #
+def set_custom_fmg(solver, coarse_meshes, *, level_solver_factory,
+                   builder="barycentric", field_id=None, verbose=False):
+    """Generalized custom-P FMG with BC-per-level reduction (the correct path).
+
+    Registers a :class:`CustomMGHierarchy` on the solver so that the next
+    ``solve()`` builds and installs it (build-time injection). The hierarchy is
+    ``[*coarse_meshes, solver.mesh]``; ``level_solver_factory(mesh)`` must return
+    a same-discretisation solver on a coarse level (used only to read its
+    BC-constrained section)."""
+    solver._custom_mg = {
+        "mode": "hierarchy",
+        "hierarchy": CustomMGHierarchy(list(coarse_meshes) + [solver.mesh],
+                                       builder=builder, field_id=field_id),
+        "factory": level_solver_factory,
+        "verbose": verbose,
+    }
+    solver.is_setup = False
+
+
+def inject_custom_mg(solver):
+    """Build + install the custom-P FMG. Called from ``solve()`` (after ``_build``,
+    before the SNES solve) when ``solver._custom_mg`` is set. Dispatches:
+    - ``mode == "hierarchy"`` -> BC-per-level reduced path (correct, general);
+    - legacy dict ``{coarse_meshes, kind}`` -> finest-only reduction (kept for
+      back-compat; valid only when coarse levels are non-nested / unconstrained)."""
+    cfg = solver._custom_mg
+
+    if isinstance(cfg, dict) and cfg.get("mode") == "hierarchy":
+        h = cfg["hierarchy"]
+        h.build(solver, cfg["factory"])
+        h.install(solver, verbose=cfg.get("verbose", False))
+        return
+
+    # ---- legacy finest-only path (back-compat) ------------------------------
+    coarse_meshes = cfg["coarse_meshes"]
+    builder = _BUILDERS[cfg["kind"]]
+    var = solver.Unknowns.u
+    degree = var.degree
+    continuous = getattr(var, "continuous", True)
+    if solver._pc_option_prefix not in ("", None):
+        raise NotImplementedError("legacy custom_mg path is scalar/single-field only")
+    fine = _reduce_to_global(solver.dm,
+                             solver.mesh._get_coords_for_basis(degree, continuous))
+    levels = [m._get_coords_for_basis(degree, continuous) for m in coarse_meshes]
+    levels.append(fine)
+    Ps = [_to_petsc_aij(builder(levels[l - 1], levels[l])) for l in range(1, len(levels))]
+    _install_transfers(solver, Ps, verbose=cfg.get("verbose", False))
diff --git a/tests/test_1016_custom_mg_hierarchy.py b/tests/test_1016_custom_mg_hierarchy.py
new file mode 100644
index 00000000..60c873b2
--- /dev/null
+++ b/tests/test_1016_custom_mg_hierarchy.py
@@ -0,0 +1,60 @@
+"""Layer-1 generalized FMG hierarchy: CustomMGHierarchy + set_custom_fmg with
+BC-per-level reduction, over a uniform + SBR-targeted nested hierarchy."""
+import numpy as np, sympy, pytest, underworld3 as uw
+from petsc4py import PETSc
+from underworld3.utilities import custom_mg
+
+pytestmark = [pytest.mark.level_2, pytest.mark.tier_b]
+
+def _sbr(dm, band=0.2):
+    # scoped SBR (restores dm_plex_transform_type so uniform refine() still works)
+    return custom_mg.sbr_refine_where(dm, lambda cen: abs(cen[0]-0.5) < band)
+def _wrap(dm,m0):
+    return uw.discretisation.Mesh(dm.clone(), simplex=True,
+        coordinate_system_type=m0.CoordinateSystem.coordinate_type, qdegree=3, boundaries=m0.boundaries)
+def _poisson(mesh):
+    p=uw.systems.Poisson(mesh); p.constitutive_model=uw.constitutive_models.DiffusionModel
+    x,y=mesh.X
+    p.constitutive_model.Parameters.diffusivity=sympy.Piecewise((1.0,x<0.5),(1000.0,True))
+    p.f=8*sympy.pi**2*sympy.sin(2*sympy.pi*x)*sympy.sin(2*sympy.pi*y)
+    for b in ("Bottom","Top","Left","Right"): p.add_dirichlet_bc(0.0,b)
+    p.petsc_options["ksp_rtol"]=1e-8; p.petsc_options["ksp_type"]="cg"
+    return p
+
+def _hierarchy():
+    m0=uw.meshing.UnstructuredSimplexBox(minCoords=(0,0),maxCoords=(1,1),cellSize=0.5,regular=True,qdegree=3)
+    dm0=m0.dm; dm1=dm0.refine(); dm2=_sbr(dm1)          # 2 uniform-ish + 1 SBR
+    return m0, [_wrap(dm0,m0), _wrap(dm1,m0)], _wrap(dm2,m0)
+
+def test_custom_fmg_hierarchy_converges():
+    m0, coarse, fine = _hierarchy()
+    s=_poisson(fine)
+    custom_mg.set_custom_fmg(s, coarse, level_solver_factory=_poisson, builder="barycentric")
+    s.solve()
+    assert s.snes.getKSP().getPC().getType()=="mg"
+    assert s.snes.getKSP().getPC().getMGLevels()==3
+    assert s.snes.getConvergedReason()>0
+    # matches a GAMG solve of the same fine problem
+    g=_poisson(_wrap(fine.dm,m0)); g.preconditioner="gamg"; g.solve()
+    rel=np.linalg.norm(s.Unknowns.u.data-g.Unknowns.u.data)/np.linalg.norm(g.Unknowns.u.data)
+    assert rel < 1e-4
+
+def test_bc_per_level_reduction_no_zero_columns():
+    """build() must produce transfers with no zero columns (BCs at every level)."""
+    m0, coarse, fine = _hierarchy()
+    s=_poisson(fine); s._build(False,False,None); s.snes.setUp()
+    h=custom_mg.CustomMGHierarchy(coarse+[fine], builder="barycentric")
+    Ps=h.build(s, _poisson)                # raises if any transfer has zero columns
+    assert len(Ps)==2
+    for P in Ps:
+        Pc=P.getValuesCSR(); import scipy.sparse as sp
+        M=sp.csr_matrix(Pc[::-1], shape=P.getSize())
+        assert int((np.asarray((M!=0).sum(axis=0)).ravel()==0).sum())==0
+
+def test_rbf_builder_also_works():
+    m0, coarse, fine = _hierarchy()
+    s=_poisson(fine)
+    custom_mg.set_custom_fmg(s, coarse, level_solver_factory=_poisson, builder="rbf")
+    s.solve()
+    assert s.snes.getKSP().getPC().getType()=="mg"
+    assert s.snes.getConvergedReason()>0

From 6eb0b340d8e48d6762971c9c49d269b26eab51a8 Mon Sep 17 00:00:00 2001
From: lmoresi <louis.moresi@anu.edu.au>
Date: Mon, 29 Jun 2026 06:36:58 +1000
Subject: [PATCH 03/15] docs(custom_mg): Layer-1 status + remaining hardening
 before merge

Layer 1 (generalized FMG hierarchy) is an independent, working capability
(arbitrary nested/non-nested coarse grids -> geometric FMG, BC-per-level).
Current scope experimental: scalar/single-field, serial, factory API. Remaining
before merge-ready general feature: Stokes velocity-block, drop the factory
(label-based BC reduction), parallel.

Underworld development team with AI support from Claude Code
---
 .../GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md    | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md b/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md
index cb2908cc..e0016959 100644
--- a/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md
+++ b/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md
@@ -114,6 +114,31 @@ Later (not this pass; do not block): dynamic per-step reallocation loop; efficie
 (transfer caching, exact-nested combinatorial P, operator reuse); load-balancing adapters;
 non-nested parallel transfers; knockout-as-coarsening as an alternative Layer-2 strategy.
 
+## Status (2026-06-29)
+
+**Layer 1 is an independent, working capability** — arbitrary coarse grids
+(nested *or* non-nested, uniform *or* SBR-refined) → geometric FMG via custom-P
+with BC-per-level reduction. It does not depend on Layer 2.
+
+Landed (committed, tested — `test_1014/1015/1016`, 18 pass):
+- `CustomMGHierarchy`, `set_custom_fmg`, `sbr_refine`/`sbr_refine_where`;
+- automatic BC-per-level reduction + zero-column guard;
+- validated: scalar jump-coeff Poisson, 5-level (3 uniform + 2 SBR), 3 FMG iters
+  vs GAMG 46.
+
+**Current scope = experimental:** scalar / single-field-vector (top-level PC),
+**serial**, and `set_custom_fmg` takes a `level_solver_factory`.
+
+Remaining hardening **before** this is a merge-ready general feature (the
+relaunch's first task, in order):
+1. **Stokes / saddle-point** (velocity-block injection; `_install_transfers`
+   rejects `fieldsplit_velocity_`). Blocker: velocity sub-PC unreachable pre-solve
+   → use Galerkin-off + `MatPtAP` coarse ops after first solve. + nullspace re-attach.
+2. **Drop the factory** — derive per-level constrained DOFs from boundary labels +
+   BC components (no throwaway solver) for a clean API.
+3. **Parallel (np>1)** — `_reduced_map` is serial; validate co-located rank-local
+   transfers (fast-follow).
+
 ## Explicit non-goals for this pass
 - Knockout (shown to pay full-fine assembly; structural value only) — not pursued now.
 - Non-nested custom-P in parallel — serial/experimental only.

From 2dc60e3333e5bd51c9025bab8745c994436cbf79 Mon Sep 17 00:00:00 2001
From: lmoresi <louis.moresi@anu.edu.au>
Date: Mon, 29 Jun 2026 13:14:18 +1000
Subject: [PATCH 04/15] feat(custom_mg): Stokes velocity-block custom-P FMG
 (Phase 1, Step 1)

Integrate the generalized FMG hierarchy into the saddle-point solver's
velocity sub-block. set_custom_fmg(..., field_id=0) now drives geometric
multigrid on the velocity block with our supplied (barycentric / RBF)
prolongations + Galerkin RAP.

The velocity sub-PC is unreachable until the monolithic Jacobian is
assembled (PCFieldSplit forms A_vv via MatCreateSubMatrix; snes.setUp
builds structure only -> err73). So _install_velocity_block_transfers:
forces a Jacobian assembly (computeFunction + computeJacobian at the zero
guess; throwaway max_it=0 fallback), reaches the velocity sub-PC,
reset()s it and rebuilds a FRESH PCMG from our P (mirrors the proven
standalone recipe; sidesteps the MatProductReplaceMats live-swap bug),
re-attaches the coupled Stokes nullspace. _configure_pcmg now derives the
options prefix from pc.getOptionsPrefix() so both the scalar top-level PC
and the velocity sub-PC are configured correctly.

inject_custom_mg is wired into SNES_Stokes_SaddlePt.solve (guarded no-op
unless set_custom_fmg was called), injected after setFromOptions /
nullspace, before the real solve.

Validated (SolCx eta_B=1e6, 3-level nested hierarchy, in-solver via
solver.solve()): velocity block converges in 6 MG iters vs GAMG 198,
solution matches the GAMG reference to 1.5e-9. test_1017 (barycentric +
rbf). test_1014/1015/1016 unchanged (20 pass total).

Underworld development team with AI support from Claude Code
---
 .../cython/petsc_generic_snes_solvers.pyx     |  14 ++
 src/underworld3/utilities/custom_mg.py        | 138 ++++++++++++++----
 tests/test_1017_custom_mg_stokes.py           | 119 +++++++++++++++
 3 files changed, 245 insertions(+), 26 deletions(-)
 create mode 100644 tests/test_1017_custom_mg_stokes.py

diff --git a/src/underworld3/cython/petsc_generic_snes_solvers.pyx b/src/underworld3/cython/petsc_generic_snes_solvers.pyx
index 35c57ae3..58f21ab1 100644
--- a/src/underworld3/cython/petsc_generic_snes_solvers.pyx
+++ b/src/underworld3/cython/petsc_generic_snes_solvers.pyx
@@ -7458,6 +7458,13 @@ class SNES_Stokes_SaddlePt(SolverBaseClass):
             self.petsc_options.setValue("snes_max_it", snes_max_it)
             self.snes.setFromOptions()
             self._attach_stokes_nullspace()
+            # Custom geometric-MG prolongation on the velocity block (if registered
+            # via set_custom_fmg). Injected here — after setFromOptions/nullspace,
+            # before the real solve — because the velocity sub-PC is only reachable
+            # once the monolithic Jacobian is assembled (see custom_mg).
+            if self._custom_mg is not None:
+                from underworld3.utilities.custom_mg import inject_custom_mg
+                inject_custom_mg(self)
             self._snes_solve_with_retries(gvec, divergence_retries, verbose)
 
         else:
@@ -7468,6 +7475,13 @@ class SNES_Stokes_SaddlePt(SolverBaseClass):
             self.petsc_options.setValue("snes_max_it", snes_max_it)
             self.snes.setFromOptions()
             self._attach_stokes_nullspace()
+            # Custom geometric-MG prolongation on the velocity block (if registered
+            # via set_custom_fmg). Injected here — after setFromOptions/nullspace,
+            # before the real solve — because the velocity sub-PC is only reachable
+            # once the monolithic Jacobian is assembled (see custom_mg).
+            if self._custom_mg is not None:
+                from underworld3.utilities.custom_mg import inject_custom_mg
+                inject_custom_mg(self)
             self._snes_solve_with_retries(gvec, divergence_retries, verbose)
 
         # Project the rigid-body rotation gauge out of the converged solution.
diff --git a/src/underworld3/utilities/custom_mg.py b/src/underworld3/utilities/custom_mg.py
index b6fea9c3..187634db 100644
--- a/src/underworld3/utilities/custom_mg.py
+++ b/src/underworld3/utilities/custom_mg.py
@@ -243,42 +243,128 @@ def _reduced_transfer(coarse_coords, fine_coords, r2f_c, r2f_f, ncomp, builder):
     return Pr
 
 
-def _install_transfers(solver, Ps, verbose=False):
-    """Configure the managed PCMG block to use the supplied prolongations.
-    Build-time injection (before first PCSetUp): set DMActive(OPERATOR,False) so
-    PETSc does not re-derive interpolation from the DM, Galerkin RAP for coarse
-    operators. Scalar / single-field-vector (top-level PC: ``_pc_option_prefix``
-    is ``""``); the Stokes velocity-block path is Phase 2."""
+def _configure_pcmg(pc, Ps):
+    """Reconfigure ``pc`` as a fresh PCMG (FMG F-cycle) driven by the supplied
+    reduced->reduced prolongations ``Ps``, Galerkin RAP for coarse operators.
+
+    Writes the MG bundle into the options DB under the PC's OWN options prefix
+    (``pc.getOptionsPrefix()`` — e.g. ``Solver_N_`` for the scalar top-level PC,
+    ``Solver_N_fieldsplit_velocity_`` for the Stokes velocity sub-PC) and removes
+    any gamg keys BEFORE ``setFromOptions`` — otherwise ``setFromOptions`` re-reads
+    a lingering ``pc_type=gamg`` and reverts ``setType("mg")``. ``setMGInterpolation``
+    persists through ``setFromOptions``; the first ``PCSetUp`` builds the coarse
+    operators from our P (no ``MatProductReplaceMats`` shape bug, since the PCMG is
+    fresh and P's size is fixed)."""
     nlev = len(Ps) + 1
-    opts = solver.petsc_options
-    pfx = solver._pc_option_prefix or ""
-    if pfx not in ("",):
-        raise NotImplementedError(
-            "Layer-1 install supports top-level PC (scalar / single-field vector). "
-            f"PC prefix '{pfx}' (e.g. Stokes velocity block) is Phase 2.")
-    opts[pfx + "pc_type"] = "mg"
-    opts[pfx + "pc_mg_type"] = "full"
-    opts[pfx + "pc_mg_galerkin"] = "both"
-    opts[pfx + "mg_levels_ksp_type"] = "richardson"
-    opts[pfx + "mg_levels_pc_type"] = "sor"
-    opts[pfx + "mg_coarse_pc_type"] = "redundant"
-    opts[pfx + "mg_coarse_redundant_pc_type"] = "lu"
+    prefix = pc.getOptionsPrefix() or ""
+    opts = PETSc.Options()
+    opts.setValue(prefix + "pc_type", "mg")
+    opts.setValue(prefix + "pc_mg_type", "full")
+    opts.setValue(prefix + "pc_mg_galerkin", "both")
+    opts.setValue(prefix + "mg_levels_ksp_type", "richardson")
+    opts.setValue(prefix + "mg_levels_pc_type", "sor")
+    opts.setValue(prefix + "mg_coarse_pc_type", "redundant")
+    opts.setValue(prefix + "mg_coarse_redundant_pc_type", "lu")
     for key in ("pc_gamg_type", "pc_gamg_repartition", "pc_gamg_agg_nsmooths"):
-        opts.delValue(pfx + key)
-
-    solver.snes.setUp()
-    ksp = solver.snes.getKSP()
-    ksp.setDMActive(PETSc.KSP.DMActive.OPERATOR, False)
-    pc = ksp.getPC()
+        opts.delValue(prefix + key)
     pc.setType("mg")
     pc.setMGLevels(nlev)
     pc.setMGType(PETSc.PC.MGType.FULL)
     for l in range(1, nlev):
         pc.setMGInterpolation(l, Ps[l - 1])
     pc.setFromOptions()
+
+
+def _install_transfers(solver, Ps, verbose=False):
+    """Configure the managed PCMG block to use the supplied prolongations.
+
+    Two paths, keyed by ``solver._pc_option_prefix``:
+
+    * **scalar / single-field vector** (top-level PC, prefix ``""``): build-time
+      injection *before* the first ``PCSetUp`` — the first Galerkin assembly is
+      built from our P directly.
+    * **Stokes velocity block** (prefix ``"fieldsplit_velocity_"``): the velocity
+      sub-PC is unreachable until the monolithic Jacobian is assembled
+      (``PCFieldSplit`` forms ``A_vv`` via ``MatCreateSubMatrix``; ``snes.setUp``
+      builds structure only -> err73). So force a Jacobian assembly, reach the
+      velocity sub-PC, ``reset`` it and rebuild a fresh PCMG from our P, then
+      re-attach the coupled Stokes nullspace.
+
+    Either way ``KSPSetDMActive(OPERATOR, False)`` stops PETSc re-deriving the
+    interpolation from the DM hierarchy."""
+    nlev = len(Ps) + 1
+    pfx = solver._pc_option_prefix or ""
+
+    if pfx == "":
+        ksp = solver.snes.getKSP()
+        solver.snes.setUp()
+        ksp.setDMActive(PETSc.KSP.DMActive.OPERATOR, False)
+        _configure_pcmg(ksp.getPC(), Ps)
+        if verbose:
+            from underworld3 import mpi
+            mpi.pprint(f"[{solver.name}] custom FMG installed: {nlev} levels, "
+                       f"P sizes {[tuple(P.getSize()) for P in Ps]}")
+        return
+
+    if pfx != "fieldsplit_velocity_":
+        raise NotImplementedError(
+            f"custom_mg install: unsupported PC prefix '{pfx}'.")
+
+    _install_velocity_block_transfers(solver, Ps, verbose=verbose)
+
+
+def _install_velocity_block_transfers(solver, Ps, verbose=False):
+    """Stokes velocity-block install (mechanism A: reset + fresh PCMG).
+
+    Preconditions: ``solver._build`` + ``setFromOptions`` + ``_attach_stokes_nullspace``
+    have run (the call site in ``SNES_Stokes_SaddlePt.solve`` guarantees this), so
+    the SNES / DM exist but the Jacobian VALUES are not yet assembled."""
+    snes = solver.snes
+    snes.setUp()
+
+    # 1. force monolithic Jacobian assembly so the fieldsplit can form A_vv
+    x0 = solver.dm.getGlobalVec()
+    x0.set(0.0)
+    J = snes.getJacobian()[0]
+    Pmat = snes.getJacobian()[1]
+    try:
+        f = J.createVecLeft()
+        snes.computeFunction(x0, f)
+        snes.computeJacobian(x0, J, Pmat)
+    except PETSc.Error:
+        # fallback: throwaway max_it=0 solve assembles + splits the operator
+        saved = (solver.petsc_options.getString("snes_max_it")
+                 if solver.petsc_options.hasName("snes_max_it") else None)
+        solver.petsc_options["snes_max_it"] = 0
+        snes.setFromOptions()
+        snes.solve(None, x0)
+        if saved is not None:
+            solver.petsc_options["snes_max_it"] = saved
+        snes.setFromOptions()
+    solver.dm.restoreGlobalVec(x0)
+
+    # 2. split -> reach the velocity sub-KSP / sub-PC (field 0)
+    outer_pc = snes.getKSP().getPC()
+    outer_pc.setUp()
+    vel_ksp = outer_pc.getFieldSplitSubKSP()[0]
+    vel_pc = vel_ksp.getPC()
+    sub = vel_pc.getOptionsPrefix() or "fieldsplit_velocity_"
+    A_vv, P_vv = vel_pc.getOperators()        # capture before reset (reset drops them)
+
+    # 3. fresh PCMG on the velocity sub-block from our Ps
+    vel_ksp.setDMActive(PETSc.KSP.DMActive.OPERATOR, False)
+    vel_pc.reset()
+    vel_pc.setOperators(A_vv, P_vv)
+    _configure_pcmg(vel_pc, Ps)
+    vel_pc.setUp()
+
+    # 4. re-attach the coupled Stokes nullspace (operator state was touched)
+    solver._attach_stokes_nullspace()
+
     if verbose:
         from underworld3 import mpi
-        mpi.pprint(f"[{solver.name}] custom FMG installed: {nlev} levels, "
+        mpi.pprint(f"[{solver.name}] custom FMG installed on velocity block: "
+                   f"{len(Ps) + 1} levels, sub-prefix {sub!r}, "
                    f"P sizes {[tuple(P.getSize()) for P in Ps]}")
 
 
diff --git a/tests/test_1017_custom_mg_stokes.py b/tests/test_1017_custom_mg_stokes.py
new file mode 100644
index 00000000..5abe7b42
--- /dev/null
+++ b/tests/test_1017_custom_mg_stokes.py
@@ -0,0 +1,119 @@
+"""Layer-1 generalized FMG hierarchy on the STOKES velocity block (Phase-1 Step 1).
+
+Custom-built prolongations (barycentric / RBF) drive geometric multigrid on the
+velocity sub-block of the saddle-point solver, via set_custom_fmg(field_id=0).
+Validated on SolCx (eta_B=1e6): the velocity block converges in a handful of MG
+iterations (vs ~200 for GAMG) and the solution matches a GAMG reference.
+
+The hard part (see custom_mg._install_velocity_block_transfers): the velocity
+sub-PC is unreachable until the monolithic Jacobian is assembled, so the install
+forces a Jacobian assembly, reaches the velocity sub-PC, and rebuilds a fresh PCMG
+from our supplied prolongations.
+"""
+import numpy as np
+import pytest
+import underworld3 as uw
+from underworld3.function import analytic as A
+from underworld3.utilities import custom_mg
+
+pytestmark = [pytest.mark.level_2, pytest.mark.tier_b]
+
+
+def _wrap(dm, m0):
+    return uw.discretisation.Mesh(
+        dm.clone(), simplex=True,
+        coordinate_system_type=m0.CoordinateSystem.coordinate_type,
+        qdegree=3, boundaries=m0.boundaries)
+
+
+def _hierarchy():
+    """cellSize 0.25 base, two uniform refinements -> 3 nested levels."""
+    m0 = uw.meshing.UnstructuredSimplexBox(
+        minCoords=(0, 0), maxCoords=(1, 1), cellSize=0.25, regular=True, qdegree=3)
+    dm0 = m0.dm
+    dm1 = dm0.refine()
+    dm2 = dm1.refine()
+    meshes = [_wrap(dm0, m0), _wrap(dm1, m0), _wrap(dm2, m0)]
+    return meshes[:-1], meshes[-1]
+
+
+def _walls(s):
+    s.add_dirichlet_bc((0.0, None), "Left")
+    s.add_dirichlet_bc((0.0, None), "Right")
+    s.add_dirichlet_bc((None, 0.0), "Bottom")
+    s.add_dirichlet_bc((None, 0.0), "Top")
+    s.petsc_use_pressure_nullspace = True
+    s.petsc_options["snes_type"] = "ksponly"
+
+
+def _coarse_factory(mesh):
+    """Same-discretisation coarse-level Stokes (only its constrained section is
+    read by CustomMGHierarchy.build to derive the BC-reduced DOF map)."""
+    s = uw.systems.Stokes(mesh)
+    s.constitutive_model = uw.constitutive_models.ViscousFlowModel
+    s.constitutive_model.Parameters.shear_viscosity_0 = 1.0
+    _walls(s)
+    return s
+
+
+def _solcx(mesh, eta_B=1e6):
+    sol = A.SolCx(mesh, eta_A=1.0, eta_B=eta_B, x_c=0.5, n=1)
+    s = uw.systems.Stokes(mesh)
+    s.constitutive_model = uw.constitutive_models.ViscousFlowModel
+    s.constitutive_model.Parameters.shear_viscosity_0 = sol.fn_viscosity
+    s.saddle_preconditioner = 1.0 / sol.fn_viscosity
+    s.bodyforce = sol.fn_bodyforce
+    _walls(s)
+    s.tolerance = 1e-8
+    return s, sol
+
+
+def _vel_ksp(s):
+    return s.snes.getKSP().getPC().getFieldSplitSubKSP()[0]
+
+
+def test_custom_fmg_velocity_block_barycentric():
+    """Custom barycentric P drives geometric MG on the SolCx velocity block,
+    converging in few iterations and matching a GAMG reference solution."""
+    coarse, fine = _hierarchy()
+
+    # GAMG reference
+    sg, solg = _solcx(fine)
+    sg.preconditioner = "gamg"
+    sg.solve()
+    verr_g = solg.velocity_error(sg.u)
+    iters_g = _vel_ksp(sg).getIterationNumber()
+
+    # custom-P geometric MG on the velocity block
+    s, sol = _solcx(fine)
+    custom_mg.set_custom_fmg(s, coarse, level_solver_factory=_coarse_factory,
+                             builder="barycentric", field_id=0)
+    s.solve()
+    vksp = _vel_ksp(s)
+
+    assert s.snes.getConvergedReason() > 0
+    assert vksp.getPC().getType() == "mg"
+    assert vksp.getPC().getMGLevels() == len(coarse) + 1
+    # geometric MG crushes GAMG on the eta-jump velocity block
+    assert vksp.getIterationNumber() <= 15
+    assert vksp.getIterationNumber() < iters_g
+    # correct solution: matches analytic to GAMG's accuracy, and matches GAMG itself
+    verr = sol.velocity_error(s.u)
+    assert verr < 2.0 * verr_g + 1e-6
+    rel = np.linalg.norm(s.u.data - sg.u.data) / (np.linalg.norm(sg.u.data) + 1e-30)
+    assert rel < 1e-4
+
+
+def test_custom_fmg_velocity_block_rbf():
+    """The RBF builder must also drive a converging velocity-block MG solve."""
+    coarse, fine = _hierarchy()
+    s, sol = _solcx(fine)
+    custom_mg.set_custom_fmg(s, coarse, level_solver_factory=_coarse_factory,
+                             builder="rbf", field_id=0)
+    s.solve()
+    vksp = _vel_ksp(s)
+    assert s.snes.getConvergedReason() > 0
+    assert vksp.getPC().getType() == "mg"
+    assert vksp.getPC().getMGLevels() == len(coarse) + 1
+    assert vksp.getIterationNumber() <= 25
+    assert sol.velocity_error(s.u) < 1e-2

From cb77289e52c7347f5dc5a06265116f4584004303 Mon Sep 17 00:00:00 2001
From: lmoresi <louis.moresi@anu.edu.au>
Date: Mon, 29 Jun 2026 13:14:44 +1000
Subject: [PATCH 05/15] docs(custom_mg): mark Stokes velocity-block integration
 done (Phase 1, Step 1)

Underworld development team with AI support from Claude Code
---
 .../GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md    | 28 +++++++++++++------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md b/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md
index e0016959..19bbc72b 100644
--- a/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md
+++ b/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md
@@ -126,14 +126,26 @@ Landed (committed, tested — `test_1014/1015/1016`, 18 pass):
 - validated: scalar jump-coeff Poisson, 5-level (3 uniform + 2 SBR), 3 FMG iters
   vs GAMG 46.
 
-**Current scope = experimental:** scalar / single-field-vector (top-level PC),
-**serial**, and `set_custom_fmg` takes a `level_solver_factory`.
-
-Remaining hardening **before** this is a merge-ready general feature (the
-relaunch's first task, in order):
-1. **Stokes / saddle-point** (velocity-block injection; `_install_transfers`
-   rejects `fieldsplit_velocity_`). Blocker: velocity sub-PC unreachable pre-solve
-   → use Galerkin-off + `MatPtAP` coarse ops after first solve. + nullspace re-attach.
+**Current scope = experimental:** **serial**, and `set_custom_fmg` takes a
+`level_solver_factory`. Scalar / single-field-vector **and** Stokes velocity-block
+are now supported.
+
+Remaining hardening **before** this is a merge-ready general feature (in order):
+1. ~~**Stokes / saddle-point** (velocity-block injection).~~ **DONE** (2026-06-29).
+   `set_custom_fmg(..., field_id=0)` drives custom-P geometric MG on the velocity
+   sub-block. The sub-PC is unreachable until the monolithic Jacobian is assembled,
+   so `_install_velocity_block_transfers` forces a Jacobian assembly
+   (`computeFunction`+`computeJacobian` at the zero guess; `max_it=0` fallback),
+   reaches the velocity sub-PC, `reset`s it and rebuilds a **fresh PCMG** from our P
+   (mechanism A — mirrors the proven standalone recipe and sidesteps the
+   `MatProductReplaceMats` live-swap bug; the Galerkin-off + `MatPtAP` path remains
+   the documented fallback), then re-attaches the coupled Stokes nullspace. Wired
+   into `SNES_Stokes_SaddlePt.solve` as a guarded no-op. Validated on SolCx
+   (η-jump 1e6, 3-level nested, in-solver): velocity block 6 MG iters vs GAMG 198,
+   solution matches GAMG to 1.5e-9 (`test_1017`). NOTE: free-slip velocity
+   rigid-body modes on `A_vv` + coarse ops are **not yet** handled (reusing
+   `_attach_stokes_nullspace` covers the SolCx pressure-nullspace case only) — a
+   Phase-2.5 follow-up.
 2. **Drop the factory** — derive per-level constrained DOFs from boundary labels +
    BC components (no throwaway solver) for a clean API.
 3. **Parallel (np>1)** — `_reduced_map` is serial; validate co-located rank-local

From e0181ba0017f873833af3585eac7d31d8f913a31 Mon Sep 17 00:00:00 2001
From: lmoresi <louis.moresi@anu.edu.au>
Date: Mon, 29 Jun 2026 13:37:18 +1000
Subject: [PATCH 06/15] refactor(custom_mg): drop level_solver_factory (Phase
 1, Step 2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Each coarse level's BC-constrained reduced map is now derived directly
from the coarse mesh DM via _coarse_reduced_map: clone the DM, copy the
(built) finest solver's fields + DS onto it, createDS, read the global
section. The DS carries UW's exact essential-BC definitions and is a
topology-independent discretisation spec, so it constrains the matching
boundary DOFs on ANY coarse mesh that shares the solver's boundary labels
(nested or not).

This removes the throwaway-solver factory (heavy, leak-prone, and an API
wart): set_custom_fmg and CustomMGHierarchy.build no longer take a
level_solver_factory. Leak-free — DM ops only, no SNES / JIT.

Validated identical to the old factory path (reduced maps byte-equal:
126 + 510 DOFs on the SolCx 3-level hierarchy). test_1014/1015/1016/1017
all green (20). custom-P vs native FMG unchanged (6 vs 5 iters, sol match
3.7e-10).

Underworld development team with AI support from Claude Code
---
 src/underworld3/utilities/custom_mg.py | 50 ++++++++++++++++++--------
 tests/test_1016_custom_mg_hierarchy.py |  6 ++--
 tests/test_1017_custom_mg_stokes.py    | 16 ++-------
 3 files changed, 40 insertions(+), 32 deletions(-)

diff --git a/src/underworld3/utilities/custom_mg.py b/src/underworld3/utilities/custom_mg.py
index 187634db..611638f6 100644
--- a/src/underworld3/utilities/custom_mg.py
+++ b/src/underworld3/utilities/custom_mg.py
@@ -233,6 +233,27 @@ def _reduced_map(dm, field_id=None):
     return r2f, n_full
 
 
+def _coarse_reduced_map(solver, coarse_mesh, field_id=None):
+    """A COARSE level's BC-constrained reduced map, with NO throwaway solver.
+
+    Clone the coarse mesh DM and copy the (built) finest ``solver``'s fields + DS
+    onto it. The DS carries UW's exact essential-BC definitions (the custom
+    essential-field boundaries), and is a topology-independent discretisation spec,
+    so ``createDS`` constrains the matching boundary DOFs on ANY coarse mesh that
+    carries the same boundary labels (nested or not). The resulting global section
+    gives the same reduced map a full same-discretisation solver would — validated
+    identical to the old ``level_solver_factory`` path. Leak-free: DM ops only, no
+    SNES / JIT.
+
+    NOTE: serial, like ``_reduced_map`` (uses local indices); parallel correctness
+    is a Phase-3 item."""
+    cdm = coarse_mesh.dm.clone()
+    solver.dm.copyFields(cdm)
+    solver.dm.copyDS(cdm)
+    cdm.createDS()
+    return _reduced_map(cdm, field_id)
+
+
 def _reduced_transfer(coarse_coords, fine_coords, r2f_c, r2f_f, ncomp, builder):
     """Build one prolongation reduced(coarse) -> reduced(fine):
     node-level scalar P -> interleave ``ncomp`` components -> drop BC rows/cols."""
@@ -395,11 +416,12 @@ def __init__(self, level_meshes, builder="barycentric", field_id=None):
         self.field_id = field_id
         self.transfers = None
 
-    def build(self, solver, level_solver_factory):
+    def build(self, solver):
         """Build the BC-reduced prolongations. ``solver`` is the (built) finest
-        solver; ``level_solver_factory(mesh) -> built solver`` provides a
-        same-discretisation solver on each COARSE level so its constrained
-        section gives that level's reduced map. The finest level uses ``solver``."""
+        solver. Each COARSE level's BC-constrained reduced map is derived directly
+        from the coarse mesh DM by copying the finest solver's fields + DS onto it
+        (``_coarse_reduced_map``) — no throwaway solver. The finest level reads its
+        map from ``solver.dm``."""
         var = solver.Unknowns.u
         degree = var.degree
         continuous = getattr(var, "continuous", True)
@@ -411,10 +433,7 @@ def build(self, solver, level_solver_factory):
             if k == nlev - 1:
                 rmap, nfull = _reduced_map(solver.dm, self.field_id)
             else:
-                tmp = level_solver_factory(mesh)
-                tmp._build(False, False, None)
-                tmp.snes.setUp()
-                rmap, nfull = _reduced_map(tmp.dm, self.field_id)
+                rmap, nfull = _coarse_reduced_map(solver, mesh, self.field_id)
             nc = nfull // c.shape[0]
             if nfull % c.shape[0] != 0:
                 raise RuntimeError(
@@ -448,20 +467,21 @@ def install(self, solver, verbose=False):
 # --------------------------------------------------------------------------- #
 #  Entry points
 # --------------------------------------------------------------------------- #
-def set_custom_fmg(solver, coarse_meshes, *, level_solver_factory,
-                   builder="barycentric", field_id=None, verbose=False):
+def set_custom_fmg(solver, coarse_meshes, *, builder="barycentric",
+                   field_id=None, verbose=False):
     """Generalized custom-P FMG with BC-per-level reduction (the correct path).
 
     Registers a :class:`CustomMGHierarchy` on the solver so that the next
     ``solve()`` builds and installs it (build-time injection). The hierarchy is
-    ``[*coarse_meshes, solver.mesh]``; ``level_solver_factory(mesh)`` must return
-    a same-discretisation solver on a coarse level (used only to read its
-    BC-constrained section)."""
+    ``[*coarse_meshes, solver.mesh]``; each coarse level's BC-constrained reduced
+    map is derived directly from its DM by copying the solver's fields + DS
+    (``_coarse_reduced_map``), so ``coarse_meshes`` need only carry the same
+    boundary labels as the solver's mesh. For a saddle-point (Stokes) solver pass
+    ``field_id=0`` to target the velocity sub-block."""
     solver._custom_mg = {
         "mode": "hierarchy",
         "hierarchy": CustomMGHierarchy(list(coarse_meshes) + [solver.mesh],
                                        builder=builder, field_id=field_id),
-        "factory": level_solver_factory,
         "verbose": verbose,
     }
     solver.is_setup = False
@@ -477,7 +497,7 @@ def inject_custom_mg(solver):
 
     if isinstance(cfg, dict) and cfg.get("mode") == "hierarchy":
         h = cfg["hierarchy"]
-        h.build(solver, cfg["factory"])
+        h.build(solver)
         h.install(solver, verbose=cfg.get("verbose", False))
         return
 
diff --git a/tests/test_1016_custom_mg_hierarchy.py b/tests/test_1016_custom_mg_hierarchy.py
index 60c873b2..402ee9c4 100644
--- a/tests/test_1016_custom_mg_hierarchy.py
+++ b/tests/test_1016_custom_mg_hierarchy.py
@@ -29,7 +29,7 @@ def _hierarchy():
 def test_custom_fmg_hierarchy_converges():
     m0, coarse, fine = _hierarchy()
     s=_poisson(fine)
-    custom_mg.set_custom_fmg(s, coarse, level_solver_factory=_poisson, builder="barycentric")
+    custom_mg.set_custom_fmg(s, coarse, builder="barycentric")
     s.solve()
     assert s.snes.getKSP().getPC().getType()=="mg"
     assert s.snes.getKSP().getPC().getMGLevels()==3
@@ -44,7 +44,7 @@ def test_bc_per_level_reduction_no_zero_columns():
     m0, coarse, fine = _hierarchy()
     s=_poisson(fine); s._build(False,False,None); s.snes.setUp()
     h=custom_mg.CustomMGHierarchy(coarse+[fine], builder="barycentric")
-    Ps=h.build(s, _poisson)                # raises if any transfer has zero columns
+    Ps=h.build(s)                          # raises if any transfer has zero columns
     assert len(Ps)==2
     for P in Ps:
         Pc=P.getValuesCSR(); import scipy.sparse as sp
@@ -54,7 +54,7 @@ def test_bc_per_level_reduction_no_zero_columns():
 def test_rbf_builder_also_works():
     m0, coarse, fine = _hierarchy()
     s=_poisson(fine)
-    custom_mg.set_custom_fmg(s, coarse, level_solver_factory=_poisson, builder="rbf")
+    custom_mg.set_custom_fmg(s, coarse, builder="rbf")
     s.solve()
     assert s.snes.getKSP().getPC().getType()=="mg"
     assert s.snes.getConvergedReason()>0
diff --git a/tests/test_1017_custom_mg_stokes.py b/tests/test_1017_custom_mg_stokes.py
index 5abe7b42..3ee4954e 100644
--- a/tests/test_1017_custom_mg_stokes.py
+++ b/tests/test_1017_custom_mg_stokes.py
@@ -46,16 +46,6 @@ def _walls(s):
     s.petsc_options["snes_type"] = "ksponly"
 
 
-def _coarse_factory(mesh):
-    """Same-discretisation coarse-level Stokes (only its constrained section is
-    read by CustomMGHierarchy.build to derive the BC-reduced DOF map)."""
-    s = uw.systems.Stokes(mesh)
-    s.constitutive_model = uw.constitutive_models.ViscousFlowModel
-    s.constitutive_model.Parameters.shear_viscosity_0 = 1.0
-    _walls(s)
-    return s
-
-
 def _solcx(mesh, eta_B=1e6):
     sol = A.SolCx(mesh, eta_A=1.0, eta_B=eta_B, x_c=0.5, n=1)
     s = uw.systems.Stokes(mesh)
@@ -86,8 +76,7 @@ def test_custom_fmg_velocity_block_barycentric():
 
     # custom-P geometric MG on the velocity block
     s, sol = _solcx(fine)
-    custom_mg.set_custom_fmg(s, coarse, level_solver_factory=_coarse_factory,
-                             builder="barycentric", field_id=0)
+    custom_mg.set_custom_fmg(s, coarse, builder="barycentric", field_id=0)
     s.solve()
     vksp = _vel_ksp(s)
 
@@ -108,8 +97,7 @@ def test_custom_fmg_velocity_block_rbf():
     """The RBF builder must also drive a converging velocity-block MG solve."""
     coarse, fine = _hierarchy()
     s, sol = _solcx(fine)
-    custom_mg.set_custom_fmg(s, coarse, level_solver_factory=_coarse_factory,
-                             builder="rbf", field_id=0)
+    custom_mg.set_custom_fmg(s, coarse, builder="rbf", field_id=0)
     s.solve()
     vksp = _vel_ksp(s)
     assert s.snes.getConvergedReason() > 0

From 2e2c4f942ddc33b467e74ea73a2f4e228cc95df7 Mon Sep 17 00:00:00 2001
From: lmoresi <louis.moresi@anu.edu.au>
Date: Mon, 29 Jun 2026 13:37:48 +1000
Subject: [PATCH 07/15] docs(custom_mg): mark factory removal done (Phase 1,
 Step 2)

Underworld development team with AI support from Claude Code
---
 .../GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md    | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md b/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md
index 19bbc72b..c64f396e 100644
--- a/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md
+++ b/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md
@@ -126,9 +126,8 @@ Landed (committed, tested — `test_1014/1015/1016`, 18 pass):
 - validated: scalar jump-coeff Poisson, 5-level (3 uniform + 2 SBR), 3 FMG iters
   vs GAMG 46.
 
-**Current scope = experimental:** **serial**, and `set_custom_fmg` takes a
-`level_solver_factory`. Scalar / single-field-vector **and** Stokes velocity-block
-are now supported.
+**Current scope = experimental:** **serial**. Scalar / single-field-vector **and**
+Stokes velocity-block are supported; the throwaway-solver factory has been removed.
 
 Remaining hardening **before** this is a merge-ready general feature (in order):
 1. ~~**Stokes / saddle-point** (velocity-block injection).~~ **DONE** (2026-06-29).
@@ -146,10 +145,16 @@ Remaining hardening **before** this is a merge-ready general feature (in order):
    rigid-body modes on `A_vv` + coarse ops are **not yet** handled (reusing
    `_attach_stokes_nullspace` covers the SolCx pressure-nullspace case only) — a
    Phase-2.5 follow-up.
-2. **Drop the factory** — derive per-level constrained DOFs from boundary labels +
-   BC components (no throwaway solver) for a clean API.
-3. **Parallel (np>1)** — `_reduced_map` is serial; validate co-located rank-local
-   transfers (fast-follow).
+2. ~~**Drop the factory**.~~ **DONE** (2026-06-29). Each coarse level's
+   BC-constrained reduced map is derived directly from its DM via
+   `_coarse_reduced_map`: clone the coarse DM, `copyFields` + `copyDS` from the
+   finest solver, `createDS`. The DS carries UW's exact essential-BC definitions
+   and is topology-independent, so it constrains any coarse mesh sharing the
+   solver's boundary labels — validated byte-identical to the old factory path,
+   leak-free (no SNES / JIT). `set_custom_fmg` / `build` no longer take a
+   `level_solver_factory`.
+3. **Parallel (np>1)** — `_reduced_map` / `_coarse_reduced_map` are serial (local
+   indices); validate co-located rank-local transfers (fast-follow).
 
 ## Explicit non-goals for this pass
 - Knockout (shown to pay full-fine assembly; structural value only) — not pursued now.

From ecabfed08617ffc99e00a235fcaafbfd26f39d95 Mon Sep 17 00:00:00 2001
From: lmoresi <louis.moresi@anu.edu.au>
Date: Mon, 29 Jun 2026 13:43:23 +1000
Subject: [PATCH 08/15] feat(custom_mg): loud serial-only guard for np>1 (Phase
 1, Step 3 partial)

The reduced maps use rank-local DOF indices and the prolongations assemble
as serial AIJ, so at np>1 custom-P would silently build wrong P (or hit a
cryptic broadcast error). Add _require_serial: set_custom_fmg and
inject_custom_mg now raise a clear NotImplementedError on >1 ranks,
pointing to preconditioner='fmg'/'gamg'. Serial behaviour unchanged.

Parallel test tests/parallel/test_1017_custom_mg_serial_guard_mpi.py
(np=2) asserts both the set-time and legacy solve-time guards fire.

Full parallel custom-P (nested co-partitioned, rank-local point location +
ghosting, MPIAIJ assembly with global-section reduction) remains a
designed fast-follow.

Underworld development team with AI support from Claude Code
---
 src/underworld3/utilities/custom_mg.py        | 16 ++++++
 .../test_1017_custom_mg_serial_guard_mpi.py   | 52 +++++++++++++++++++
 2 files changed, 68 insertions(+)
 create mode 100644 tests/parallel/test_1017_custom_mg_serial_guard_mpi.py

diff --git a/src/underworld3/utilities/custom_mg.py b/src/underworld3/utilities/custom_mg.py
index 611638f6..11c4a122 100644
--- a/src/underworld3/utilities/custom_mg.py
+++ b/src/underworld3/utilities/custom_mg.py
@@ -103,6 +103,20 @@ def phi(r):
 _BUILDERS = {"barycentric": barycentric_prolongation, "rbf": rbf_prolongation}
 
 
+def _require_serial(where):
+    """Custom-P transfers are serial-only (experimental). The reduced maps use
+    rank-local DOF indices and the prolongations assemble as serial AIJ; at np>1
+    they would silently build wrong P / mis-numbered transfers. Parallel support
+    (nested co-partitioned, rank-local P + MPIAIJ, global-section reduction) is a
+    designed fast-follow — until then, fail loudly rather than wrong."""
+    from underworld3 import mpi
+    if mpi.size > 1:
+        raise NotImplementedError(
+            f"custom_mg ({where}): custom-P geometric MG is serial-only "
+            f"(running on {mpi.size} ranks). Parallel (np>1) support is not yet "
+            f"implemented; use preconditioner='fmg' (nested) or 'gamg' in parallel.")
+
+
 # --------------------------------------------------------------------------- #
 #  Local Skeleton-Based Refinement (no MMG; on-rank; conforming)
 # --------------------------------------------------------------------------- #
@@ -478,6 +492,7 @@ def set_custom_fmg(solver, coarse_meshes, *, builder="barycentric",
     (``_coarse_reduced_map``), so ``coarse_meshes`` need only carry the same
     boundary labels as the solver's mesh. For a saddle-point (Stokes) solver pass
     ``field_id=0`` to target the velocity sub-block."""
+    _require_serial("set_custom_fmg")
     solver._custom_mg = {
         "mode": "hierarchy",
         "hierarchy": CustomMGHierarchy(list(coarse_meshes) + [solver.mesh],
@@ -493,6 +508,7 @@ def inject_custom_mg(solver):
     - ``mode == "hierarchy"`` -> BC-per-level reduced path (correct, general);
     - legacy dict ``{coarse_meshes, kind}`` -> finest-only reduction (kept for
       back-compat; valid only when coarse levels are non-nested / unconstrained)."""
+    _require_serial("inject_custom_mg")
     cfg = solver._custom_mg
 
     if isinstance(cfg, dict) and cfg.get("mode") == "hierarchy":
diff --git a/tests/parallel/test_1017_custom_mg_serial_guard_mpi.py b/tests/parallel/test_1017_custom_mg_serial_guard_mpi.py
new file mode 100644
index 00000000..f9862e65
--- /dev/null
+++ b/tests/parallel/test_1017_custom_mg_serial_guard_mpi.py
@@ -0,0 +1,52 @@
+"""Parallel guard for custom-P geometric MG (custom_mg).
+
+Custom-P transfers are serial-only (experimental): the reduced maps use rank-local
+DOF indices and the prolongations assemble as serial AIJ, so at np>1 they would
+silently build wrong P. set_custom_fmg / inject must therefore FAIL LOUDLY in
+parallel rather than produce incorrect results. Parallel (np>1) support is a
+designed fast-follow (nested co-partitioned, rank-local P + MPIAIJ).
+
+Run:
+    mpirun -n 2 python -m pytest --with-mpi tests/parallel/test_1017_custom_mg_serial_guard_mpi.py
+"""
+import pytest
+import sympy
+import underworld3 as uw
+from underworld3.utilities import custom_mg
+
+pytestmark = [pytest.mark.mpi(min_size=2), pytest.mark.timeout(120)]
+
+
+def _box(cellSize, refinement=None):
+    return uw.meshing.UnstructuredSimplexBox(
+        minCoords=(0.0, 0.0), maxCoords=(1.0, 1.0),
+        cellSize=cellSize, refinement=refinement, qdegree=2)
+
+
+def _poisson(mesh):
+    p = uw.systems.Poisson(mesh)
+    p.constitutive_model = uw.constitutive_models.DiffusionModel
+    p.constitutive_model.Parameters.diffusivity = 1
+    p.f = 0.0
+    p.add_dirichlet_bc(0.0, "Bottom"); p.add_dirichlet_bc(1.0, "Top")
+    return p
+
+
+@pytest.mark.mpi(min_size=2)
+def test_set_custom_fmg_raises_in_parallel():
+    """set_custom_fmg must raise NotImplementedError (not silently mis-build) np>1."""
+    assert uw.mpi.size > 1
+    s = _poisson(_box(0.25, refinement=2))
+    with pytest.raises(NotImplementedError):
+        custom_mg.set_custom_fmg(s, [_box(0.285), _box(0.142)], builder="barycentric")
+
+
+@pytest.mark.mpi(min_size=2)
+def test_inject_guard_blocks_legacy_path_in_parallel():
+    """The legacy set_custom_mg path must also fail loudly at solve() in parallel."""
+    assert uw.mpi.size > 1
+    s = _poisson(_box(0.25, refinement=2))
+    # legacy setter writes _custom_mg directly (no module guard); inject must catch it
+    s.set_custom_mg([_box(0.285), _box(0.142)], kind="barycentric")
+    with pytest.raises(NotImplementedError):
+        s.solve()

From caff202014876e8bfb53b8f1d94a5a80ef4068de Mon Sep 17 00:00:00 2001
From: lmoresi <louis.moresi@anu.edu.au>
Date: Mon, 29 Jun 2026 13:43:42 +1000
Subject: [PATCH 09/15] docs(custom_mg): record np>1 serial guard (Phase 1,
 Step 3)

Underworld development team with AI support from Claude Code
---
 .../design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md         | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md b/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md
index c64f396e..901ff5c9 100644
--- a/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md
+++ b/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md
@@ -154,7 +154,13 @@ Remaining hardening **before** this is a merge-ready general feature (in order):
    leak-free (no SNES / JIT). `set_custom_fmg` / `build` no longer take a
    `level_solver_factory`.
 3. **Parallel (np>1)** — `_reduced_map` / `_coarse_reduced_map` are serial (local
-   indices); validate co-located rank-local transfers (fast-follow).
+   indices) and `P` assembles as serial AIJ. **Guarded** (2026-06-29):
+   `set_custom_fmg` / `inject_custom_mg` raise a clear `NotImplementedError` at
+   np>1 (`_require_serial`) so parallel cannot silently mis-build — test
+   `tests/parallel/test_1017_custom_mg_serial_guard_mpi.py`. The full
+   parallel-correct implementation (nested co-partitioned, rank-local point
+   location + ghost layer, MPIAIJ assembly with global-section reduction) remains
+   the designed fast-follow.
 
 ## Explicit non-goals for this pass
 - Knockout (shown to pay full-fine assembly; structural value only) — not pursued now.

From 19b778c783aaa590f237e9fa1ae885500311a6d9 Mon Sep 17 00:00:00 2001
From: lmoresi <louis.moresi@anu.edu.au>
Date: Mon, 29 Jun 2026 14:03:51 +1000
Subject: [PATCH 10/15] feat(custom_mg): parallel (np>1) custom-P FMG, nested
 co-partitioned (Phase 1, Step 3)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The hierarchy path now builds parallel-correct transfers. Each rank builds
its block of P rank-locally: ghost-inclusive coarse coords mean every owned
fine node lands in a local coarse simplex (verified 0 misses np=2/4), and
the reduced global numbering rides the DM global section via a clean
local->global-reduced map (_level_dof_layout scatters owned global indices
out through globalToLocal; constrained DOFs stay -1, ghosts resolve to the
owner's global index). Transfers assemble as MPIAIJ (owned fine rows, global
coarse cols incl. off-rank); constrained coarse DOFs drop -> reduced->reduced.
Parallel zero-column guard via P^T·1 + allreduce.

_coarse_dof_layout reuses the leak-free copyDS trick for coarse levels.
CustomMGHierarchy.build branches serial (scipy CSR) vs parallel (MPIAIJ).
The guard now blocks only the legacy finest-only path (still serial).

Validated np=1/2/4: scalar Poisson custom-P 4 iters and Stokes SolCx
velocity block 6 iters, both matching the GAMG reference (and each other
across rank counts). New tests/parallel/test_1017_custom_mg_parallel_mpi.py
(scalar + Stokes correctness + legacy serial-guard). Serial 20 unchanged.

Underworld development team with AI support from Claude Code
---
 src/underworld3/utilities/custom_mg.py        | 150 +++++++++++++++---
 .../test_1017_custom_mg_parallel_mpi.py       | 113 +++++++++++++
 .../test_1017_custom_mg_serial_guard_mpi.py   |  52 ------
 3 files changed, 243 insertions(+), 72 deletions(-)
 create mode 100644 tests/parallel/test_1017_custom_mg_parallel_mpi.py
 delete mode 100644 tests/parallel/test_1017_custom_mg_serial_guard_mpi.py

diff --git a/src/underworld3/utilities/custom_mg.py b/src/underworld3/utilities/custom_mg.py
index 11c4a122..f8863940 100644
--- a/src/underworld3/utilities/custom_mg.py
+++ b/src/underworld3/utilities/custom_mg.py
@@ -278,6 +278,99 @@ def _reduced_transfer(coarse_coords, fine_coords, r2f_c, r2f_f, ncomp, builder):
     return Pr
 
 
+# --------------------------------------------------------------------------- #
+#  Parallel (np>1) — nested co-partitioned, rank-local P + MPIAIJ
+#
+#  Supported path: the levels are co-partitioned (uniform refine() / on-rank SBR
+#  keep a fine cell's parent coarse cell on the same rank), so each rank builds
+#  its block of P from its LOCAL (ghost-inclusive) coarse coords — point-location
+#  is rank-local. The reduced global numbering rides the DM global section.
+# --------------------------------------------------------------------------- #
+def _level_dof_layout(dm, field_id=None):
+    """Parallel DOF layout for one level: ``(l2g, rstart, rend, n_full)``.
+
+    ``l2g[i]`` is the GLOBAL reduced index of local DOF ``i`` — ghost-resolved to
+    the owner's global index, and ``-1`` for a BC-constrained DOF. Built by
+    scattering each owned global index out to the local (incl. ghost) layout via
+    ``globalToLocal`` (constrained local DOFs have no global source, so they keep
+    the pre-set ``-1``). ``[rstart, rend)`` is this rank's owned global range."""
+    sub = _field_subdm(dm, field_id)
+    gv = sub.getGlobalVec()
+    lv = sub.getLocalVec()
+    rstart, rend = gv.getOwnershipRange()
+    gv.array[:] = np.arange(rstart, rend, dtype=float)
+    lv.set(-1.0)
+    sub.globalToLocal(gv, lv, addv=PETSc.InsertMode.INSERT_VALUES)
+    l2g = np.rint(lv.array).astype(np.int64).copy()
+    n_full = lv.getLocalSize()
+    sub.restoreGlobalVec(gv)
+    sub.restoreLocalVec(lv)
+    return l2g, rstart, rend, n_full
+
+
+def _coarse_dof_layout(solver, coarse_mesh, field_id=None):
+    """Parallel coarse-level DOF layout, no throwaway solver — same copyDS trick
+    as :func:`_coarse_reduced_map` but returning the parallel ``l2g`` layout."""
+    cdm = coarse_mesh.dm.clone()
+    solver.dm.copyFields(cdm)
+    solver.dm.copyDS(cdm)
+    cdm.createDS()
+    return _level_dof_layout(cdm, field_id)
+
+
+def _build_parallel_transfer(cc, fc, lay_c, lay_f, ncomp, builder, comm):
+    """One reduced->reduced prolongation as an MPIAIJ matrix.
+
+    Node-level barycentric/RBF weights are built rank-locally (coarse LOCAL coords
+    incl. ghosts -> every owned fine node lands in a local coarse simplex). Each
+    fine OWNED DOF becomes a global row; its coarse contributions map through the
+    coarse ``l2g`` to global columns (off-rank columns are fine for MPIAIJ).
+    Constrained coarse DOFs (``l2g == -1``) drop out -> reduced->reduced."""
+    l2g_c, cstart, cend, _ = lay_c
+    l2g_f, fstart, fend, _ = lay_f
+    Pn = builder(cc, fc).tocsr()                 # (n_f_nodes, n_c_nodes), local
+    nloc_f = fend - fstart
+    nloc_c = cend - cstart
+
+    P = PETSc.Mat().create(comm=comm)
+    P.setSizes(((nloc_f, None), (nloc_c, None)))
+    P.setType("aij")
+    P.setUp()
+    for i in range(Pn.shape[0]):                 # fine local node
+        js = Pn.indices[Pn.indptr[i]:Pn.indptr[i + 1]]
+        ws = Pn.data[Pn.indptr[i]:Pn.indptr[i + 1]]
+        for c in range(ncomp):
+            grow = int(l2g_f[i * ncomp + c])
+            if grow < fstart or grow >= fend:    # set OWNED rows only
+                continue
+            gcols, vals = [], []
+            for jj, w in zip(js.tolist(), ws.tolist()):
+                gcol = int(l2g_c[jj * ncomp + c])
+                if gcol >= 0:                    # skip constrained coarse DOFs
+                    gcols.append(gcol)
+                    vals.append(w)
+            if gcols:
+                P.setValues([grow], gcols, vals, addv=PETSc.InsertMode.INSERT_VALUES)
+    P.assemble()
+    return P
+
+
+def _assert_no_zero_columns_parallel(P, comm):
+    """Parallel zero-column guard: a coarse DOF with no fine image -> singular
+    Galerkin coarse operator. Column sums via P^T·1 (weights are positive, so a
+    zero sum means an empty column)."""
+    ones_f = P.createVecLeft(); ones_f.set(1.0)
+    colsum = P.createVecRight()
+    P.multTranspose(ones_f, colsum)
+    nzero_local = int((colsum.array == 0.0).sum())
+    nzero = comm.tompi4py().allreduce(nzero_local)
+    ones_f.destroy(); colsum.destroy()
+    if nzero:
+        raise RuntimeError(
+            f"parallel transfer has {nzero} zero columns (coarse DOFs with no fine "
+            f"image) — BC-per-level reduction failed; coarse operator would be singular.")
+
+
 def _configure_pcmg(pc, Ps):
     """Reconfigure ``pc`` as a fresh PCMG (FMG F-cycle) driven by the supplied
     reduced->reduced prolongations ``Ps``, Galerkin RAP for coarse operators.
@@ -434,25 +527,36 @@ def build(self, solver):
         """Build the BC-reduced prolongations. ``solver`` is the (built) finest
         solver. Each COARSE level's BC-constrained reduced map is derived directly
         from the coarse mesh DM by copying the finest solver's fields + DS onto it
-        (``_coarse_reduced_map``) — no throwaway solver. The finest level reads its
-        map from ``solver.dm``."""
+        (no throwaway solver); the finest level reads its map from ``solver.dm``.
+
+        Serial: scipy reduced->reduced CSR. Parallel (np>1): rank-local node-level
+        weights assembled into MPIAIJ transfers with global-section reduced
+        numbering (nested co-partitioned path)."""
+        from underworld3 import mpi
         var = solver.Unknowns.u
         degree = var.degree
         continuous = getattr(var, "continuous", True)
         nlev = len(self.level_meshes)
+        parallel = mpi.size > 1
 
-        coords, r2f, ncomp = [], [], []
+        coords, maps, ncomp = [], [], []
         for k, mesh in enumerate(self.level_meshes):
             c = np.asarray(mesh._get_coords_for_basis(degree, continuous))
-            if k == nlev - 1:
-                rmap, nfull = _reduced_map(solver.dm, self.field_id)
+            finest = (k == nlev - 1)
+            if parallel:
+                lay = (_level_dof_layout(solver.dm, self.field_id) if finest
+                       else _coarse_dof_layout(solver, mesh, self.field_id))
+                nfull = lay[3]
+                maps.append(lay)
             else:
-                rmap, nfull = _coarse_reduced_map(solver, mesh, self.field_id)
+                rmap, nfull = (_reduced_map(solver.dm, self.field_id) if finest
+                               else _coarse_reduced_map(solver, mesh, self.field_id))
+                maps.append(rmap)
             nc = nfull // c.shape[0]
             if nfull % c.shape[0] != 0:
                 raise RuntimeError(
                     f"level {k}: full DOFs {nfull} not divisible by nodes {c.shape[0]}")
-            coords.append(c); r2f.append(rmap); ncomp.append(nc)
+            coords.append(c); ncomp.append(nc)
 
         if len(set(ncomp)) != 1:
             raise RuntimeError(f"inconsistent component counts across levels: {ncomp}")
@@ -460,15 +564,22 @@ def build(self, solver):
 
         Ps = []
         for l in range(1, nlev):
-            Pr = _reduced_transfer(coords[l - 1], coords[l], r2f[l - 1], r2f[l],
-                                   nc, self.builder)
-            zc = int((np.asarray((Pr != 0).sum(axis=0)).ravel() == 0).sum())
-            if zc:
-                raise RuntimeError(
-                    f"transfer {l-1}->{l} has {zc} zero columns (coarse DOFs with no "
-                    f"fine image) — BC-per-level reduction failed; coarse operator "
-                    f"would be singular.")
-            Ps.append(_to_petsc_aij(Pr))
+            if parallel:
+                P = _build_parallel_transfer(coords[l - 1], coords[l],
+                                             maps[l - 1], maps[l], nc,
+                                             self.builder, solver.dm.comm)
+                _assert_no_zero_columns_parallel(P, solver.dm.comm)
+                Ps.append(P)
+            else:
+                Pr = _reduced_transfer(coords[l - 1], coords[l], maps[l - 1],
+                                       maps[l], nc, self.builder)
+                zc = int((np.asarray((Pr != 0).sum(axis=0)).ravel() == 0).sum())
+                if zc:
+                    raise RuntimeError(
+                        f"transfer {l-1}->{l} has {zc} zero columns (coarse DOFs with "
+                        f"no fine image) — BC-per-level reduction failed; coarse "
+                        f"operator would be singular.")
+                Ps.append(_to_petsc_aij(Pr))
         self.transfers = Ps
         return Ps
 
@@ -492,7 +603,6 @@ def set_custom_fmg(solver, coarse_meshes, *, builder="barycentric",
     (``_coarse_reduced_map``), so ``coarse_meshes`` need only carry the same
     boundary labels as the solver's mesh. For a saddle-point (Stokes) solver pass
     ``field_id=0`` to target the velocity sub-block."""
-    _require_serial("set_custom_fmg")
     solver._custom_mg = {
         "mode": "hierarchy",
         "hierarchy": CustomMGHierarchy(list(coarse_meshes) + [solver.mesh],
@@ -508,16 +618,16 @@ def inject_custom_mg(solver):
     - ``mode == "hierarchy"`` -> BC-per-level reduced path (correct, general);
     - legacy dict ``{coarse_meshes, kind}`` -> finest-only reduction (kept for
       back-compat; valid only when coarse levels are non-nested / unconstrained)."""
-    _require_serial("inject_custom_mg")
     cfg = solver._custom_mg
 
     if isinstance(cfg, dict) and cfg.get("mode") == "hierarchy":
         h = cfg["hierarchy"]
-        h.build(solver)
+        h.build(solver)              # parallel-capable (nested co-partitioned)
         h.install(solver, verbose=cfg.get("verbose", False))
         return
 
-    # ---- legacy finest-only path (back-compat) ------------------------------
+    # ---- legacy finest-only path (back-compat, serial only) -----------------
+    _require_serial("legacy custom_mg (set_custom_mg)")
     coarse_meshes = cfg["coarse_meshes"]
     builder = _BUILDERS[cfg["kind"]]
     var = solver.Unknowns.u
diff --git a/tests/parallel/test_1017_custom_mg_parallel_mpi.py b/tests/parallel/test_1017_custom_mg_parallel_mpi.py
new file mode 100644
index 00000000..e57f2c3b
--- /dev/null
+++ b/tests/parallel/test_1017_custom_mg_parallel_mpi.py
@@ -0,0 +1,113 @@
+"""Parallel (np>1) correctness for custom-P geometric MG (custom_mg).
+
+The nested co-partitioned hierarchy path is parallel-capable: rank-local point
+location (ghost-inclusive coarse coords) + global-section reduced numbering +
+MPIAIJ transfers. These tests assert that, on >1 ranks, custom-P drives a
+converging geometric-MG solve whose result matches a GAMG reference — for both a
+scalar Poisson and the Stokes (SolCx) velocity block. The legacy finest-only
+path remains serial-only and must still fail loudly.
+
+Run:
+    mpirun -n 2 python -m pytest --with-mpi tests/parallel/test_1017_custom_mg_parallel_mpi.py
+"""
+import numpy as np
+import pytest
+import underworld3 as uw
+from underworld3.function import analytic as A
+from underworld3.utilities import custom_mg
+
+pytestmark = [pytest.mark.mpi(min_size=2), pytest.mark.timeout(180)]
+
+
+def _wrap(dm, m0, q=2):
+    return uw.discretisation.Mesh(
+        dm.clone(), simplex=True,
+        coordinate_system_type=m0.CoordinateSystem.coordinate_type,
+        qdegree=q, boundaries=m0.boundaries)
+
+
+def _box(cellSize, refinement=None, q=2):
+    return uw.meshing.UnstructuredSimplexBox(
+        minCoords=(0., 0.), maxCoords=(1., 1.),
+        cellSize=cellSize, regular=True, refinement=refinement, qdegree=q)
+
+
+def _rel_l2(a, b):
+    """Global relative L2 difference of two field-data arrays (rank-reduced)."""
+    da = np.asarray(a) - np.asarray(b)
+    num = uw.mpi.comm.allreduce(float(np.sum(da**2)))
+    den = uw.mpi.comm.allreduce(float(np.sum(np.asarray(b)**2)))
+    return (num**0.5) / (den**0.5 + 1e-30)
+
+
+def _poisson(mesh):
+    p = uw.systems.Poisson(mesh)
+    p.constitutive_model = uw.constitutive_models.DiffusionModel
+    p.constitutive_model.Parameters.diffusivity = 1
+    p.f = 0.0
+    p.add_dirichlet_bc(0.0, "Bottom"); p.add_dirichlet_bc(1.0, "Top")
+    p.petsc_options["ksp_rtol"] = 1e-8; p.petsc_options["ksp_type"] = "cg"
+    return p
+
+
+@pytest.mark.mpi(min_size=2)
+def test_parallel_custom_fmg_scalar():
+    """Scalar custom-P drives a converging geometric MG solve in parallel and
+    matches a GAMG solve of the same problem."""
+    assert uw.mpi.size > 1
+    m0 = _box(0.25); dm1 = m0.dm.refine(); dm2 = dm1.refine()
+    coarse = [_wrap(m0.dm, m0), _wrap(dm1, m0)]; fine = _wrap(dm2, m0)
+
+    g = _poisson(_wrap(dm2, m0)); g.preconditioner = "gamg"; g.solve()
+    c = _poisson(fine)
+    custom_mg.set_custom_fmg(c, coarse, builder="barycentric")
+    c.solve()
+
+    assert c.snes.getKSP().getPC().getType() == "mg"
+    assert c.snes.getConvergedReason() > 0
+    assert _rel_l2(c.Unknowns.u.data, g.Unknowns.u.data) < 1e-4
+
+
+@pytest.mark.mpi(min_size=2)
+def test_parallel_custom_fmg_stokes_velocity_block():
+    """Custom-P on the SolCx velocity block converges in few MG iters in parallel
+    and matches the analytic velocity to the GAMG reference's accuracy."""
+    assert uw.mpi.size > 1
+    fine = _box(0.25, refinement=2, q=3)
+    coarse = [_wrap(d, fine, q=3) for d in fine.dm_hierarchy[:-1]]
+
+    def solcx(mesh):
+        sol = A.SolCx(mesh, eta_A=1.0, eta_B=1e6, x_c=0.5, n=1)
+        s = uw.systems.Stokes(mesh)
+        s.constitutive_model = uw.constitutive_models.ViscousFlowModel
+        s.constitutive_model.Parameters.shear_viscosity_0 = sol.fn_viscosity
+        s.saddle_preconditioner = 1.0 / sol.fn_viscosity
+        s.bodyforce = sol.fn_bodyforce
+        s.add_dirichlet_bc((0.0, None), "Left"); s.add_dirichlet_bc((0.0, None), "Right")
+        s.add_dirichlet_bc((None, 0.0), "Bottom"); s.add_dirichlet_bc((None, 0.0), "Top")
+        s.petsc_use_pressure_nullspace = True; s.tolerance = 1e-8
+        s.petsc_options["snes_type"] = "ksponly"
+        return s, sol
+
+    sg, solg = solcx(fine); sg.preconditioner = "gamg"; sg.solve()
+    sc, sol = solcx(fine)
+    custom_mg.set_custom_fmg(sc, coarse, builder="barycentric", field_id=0)
+    sc.solve()
+    vksp = sc.snes.getKSP().getPC().getFieldSplitSubKSP()[0]
+
+    assert sc.snes.getConvergedReason() > 0
+    assert vksp.getPC().getType() == "mg"
+    assert vksp.getPC().getMGLevels() == len(coarse) + 1
+    assert vksp.getIterationNumber() <= 15           # geometric MG, not GAMG (~200)
+    assert sol.velocity_error(sc.u) < 2.0 * solg.velocity_error(sg.u) + 1e-6
+
+
+@pytest.mark.mpi(min_size=2)
+def test_legacy_path_serial_guard():
+    """The legacy finest-only set_custom_mg path is serial-only and must raise
+    loudly in parallel (it has no parallel reduced map)."""
+    assert uw.mpi.size > 1
+    s = _poisson(_box(0.25, refinement=2))
+    s.set_custom_mg([_box(0.285), _box(0.142)], kind="barycentric")
+    with pytest.raises(NotImplementedError):
+        s.solve()
diff --git a/tests/parallel/test_1017_custom_mg_serial_guard_mpi.py b/tests/parallel/test_1017_custom_mg_serial_guard_mpi.py
deleted file mode 100644
index f9862e65..00000000
--- a/tests/parallel/test_1017_custom_mg_serial_guard_mpi.py
+++ /dev/null
@@ -1,52 +0,0 @@
-"""Parallel guard for custom-P geometric MG (custom_mg).
-
-Custom-P transfers are serial-only (experimental): the reduced maps use rank-local
-DOF indices and the prolongations assemble as serial AIJ, so at np>1 they would
-silently build wrong P. set_custom_fmg / inject must therefore FAIL LOUDLY in
-parallel rather than produce incorrect results. Parallel (np>1) support is a
-designed fast-follow (nested co-partitioned, rank-local P + MPIAIJ).
-
-Run:
-    mpirun -n 2 python -m pytest --with-mpi tests/parallel/test_1017_custom_mg_serial_guard_mpi.py
-"""
-import pytest
-import sympy
-import underworld3 as uw
-from underworld3.utilities import custom_mg
-
-pytestmark = [pytest.mark.mpi(min_size=2), pytest.mark.timeout(120)]
-
-
-def _box(cellSize, refinement=None):
-    return uw.meshing.UnstructuredSimplexBox(
-        minCoords=(0.0, 0.0), maxCoords=(1.0, 1.0),
-        cellSize=cellSize, refinement=refinement, qdegree=2)
-
-
-def _poisson(mesh):
-    p = uw.systems.Poisson(mesh)
-    p.constitutive_model = uw.constitutive_models.DiffusionModel
-    p.constitutive_model.Parameters.diffusivity = 1
-    p.f = 0.0
-    p.add_dirichlet_bc(0.0, "Bottom"); p.add_dirichlet_bc(1.0, "Top")
-    return p
-
-
-@pytest.mark.mpi(min_size=2)
-def test_set_custom_fmg_raises_in_parallel():
-    """set_custom_fmg must raise NotImplementedError (not silently mis-build) np>1."""
-    assert uw.mpi.size > 1
-    s = _poisson(_box(0.25, refinement=2))
-    with pytest.raises(NotImplementedError):
-        custom_mg.set_custom_fmg(s, [_box(0.285), _box(0.142)], builder="barycentric")
-
-
-@pytest.mark.mpi(min_size=2)
-def test_inject_guard_blocks_legacy_path_in_parallel():
-    """The legacy set_custom_mg path must also fail loudly at solve() in parallel."""
-    assert uw.mpi.size > 1
-    s = _poisson(_box(0.25, refinement=2))
-    # legacy setter writes _custom_mg directly (no module guard); inject must catch it
-    s.set_custom_mg([_box(0.285), _box(0.142)], kind="barycentric")
-    with pytest.raises(NotImplementedError):
-        s.solve()

From af8e734b431da181cf8d5f208323678227937c30 Mon Sep 17 00:00:00 2001
From: lmoresi <louis.moresi@anu.edu.au>
Date: Mon, 29 Jun 2026 14:16:37 +1000
Subject: [PATCH 11/15] docs(custom_mg): record parallel np>1 done (Phase 1,
 Step 3)

Underworld development team with AI support from Claude Code
---
 .../GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md    | 40 ++++++++++++-------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md b/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md
index 901ff5c9..2cb670fc 100644
--- a/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md
+++ b/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md
@@ -120,16 +120,21 @@ non-nested parallel transfers; knockout-as-coarsening as an alternative Layer-2
 (nested *or* non-nested, uniform *or* SBR-refined) → geometric FMG via custom-P
 with BC-per-level reduction. It does not depend on Layer 2.
 
-Landed (committed, tested — `test_1014/1015/1016`, 18 pass):
+Landed (committed, tested — `test_1014/1015/1016/1017` serial 20 +
+`tests/parallel/test_1017_custom_mg_parallel_mpi.py` np2):
 - `CustomMGHierarchy`, `set_custom_fmg`, `sbr_refine`/`sbr_refine_where`;
 - automatic BC-per-level reduction + zero-column guard;
-- validated: scalar jump-coeff Poisson, 5-level (3 uniform + 2 SBR), 3 FMG iters
-  vs GAMG 46.
+- Stokes velocity-block injection; leak-free per-level reduction (copyDS, no
+  factory); parallel (np>1) nested co-partitioned transfers;
+- validated: scalar jump-coeff Poisson 5-level (3 uniform + 2 SBR) 3 FMG iters
+  vs GAMG 46; SolCx velocity block 6 iters vs GAMG ~198, np=1/2/4.
 
-**Current scope = experimental:** **serial**. Scalar / single-field-vector **and**
-Stokes velocity-block are supported; the throwaway-solver factory has been removed.
+**Current scope:** scalar / single-field-vector **and** Stokes velocity-block;
+**serial and parallel** (nested co-partitioned). Non-nested custom-P is serial-only
+(experimental). Hardening steps 1–3 are complete; what remains is test
+tier-classification + undrafting PR #290.
 
-Remaining hardening **before** this is a merge-ready general feature (in order):
+Hardening steps (all complete as of 2026-06-29):
 1. ~~**Stokes / saddle-point** (velocity-block injection).~~ **DONE** (2026-06-29).
    `set_custom_fmg(..., field_id=0)` drives custom-P geometric MG on the velocity
    sub-block. The sub-PC is unreachable until the monolithic Jacobian is assembled,
@@ -153,14 +158,21 @@ Remaining hardening **before** this is a merge-ready general feature (in order):
    solver's boundary labels — validated byte-identical to the old factory path,
    leak-free (no SNES / JIT). `set_custom_fmg` / `build` no longer take a
    `level_solver_factory`.
-3. **Parallel (np>1)** — `_reduced_map` / `_coarse_reduced_map` are serial (local
-   indices) and `P` assembles as serial AIJ. **Guarded** (2026-06-29):
-   `set_custom_fmg` / `inject_custom_mg` raise a clear `NotImplementedError` at
-   np>1 (`_require_serial`) so parallel cannot silently mis-build — test
-   `tests/parallel/test_1017_custom_mg_serial_guard_mpi.py`. The full
-   parallel-correct implementation (nested co-partitioned, rank-local point
-   location + ghost layer, MPIAIJ assembly with global-section reduction) remains
-   the designed fast-follow.
+3. ~~**Parallel (np>1)**.~~ **DONE** (2026-06-29). The hierarchy path builds
+   parallel-correct transfers on the nested co-partitioned hierarchy: each rank
+   builds its block of `P` rank-locally (ghost-inclusive coarse coords → every
+   owned fine node lands in a local coarse simplex, verified 0 misses np=2/4),
+   the reduced global numbering rides the DM global section
+   (`_level_dof_layout` scatters owned global indices out via `globalToLocal` —
+   constrained DOFs `-1`, ghosts resolve to the owner's global), and transfers
+   assemble as MPIAIJ (owned fine rows, global coarse cols incl. off-rank;
+   constrained coarse DOFs drop → reduced→reduced). Parallel zero-column guard
+   via `Pᵀ·1` + allreduce. Validated np=1/2/4: scalar Poisson 4 iters + Stokes
+   SolCx velocity block 6 iters, matching the GAMG reference and each other
+   across rank counts (`tests/parallel/test_1017_custom_mg_parallel_mpi.py`). The
+   **legacy** finest-only path (`set_custom_mg` / `_reduce_to_global`) stays
+   serial-only and raises loudly at np>1. Non-nested custom-P in parallel remains
+   out of scope (cross-rank point location).
 
 ## Explicit non-goals for this pass
 - Knockout (shown to pay full-fine assembly; structural value only) — not pursued now.

From 3686b7b84ca2491b848a2b22335922f3d36e28cd Mon Sep 17 00:00:00 2001
From: lmoresi <louis.moresi@anu.edu.au>
Date: Mon, 29 Jun 2026 14:43:10 +1000
Subject: [PATCH 12/15] feat(custom_mg): cover all solver families (SNES_Vector
 hook; Vector + Constrained tests)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Verify custom-P works for every solver family that consumes the mesh, so a
mesh-owned adapted hierarchy drives MG uniformly:

- SNES_Vector.solve was MISSING the inject_custom_mg hook (only SNES_Scalar
  and SNES_Stokes_SaddlePt had it) — so Vector_Projection silently stayed on
  GAMG. Add the hook (mirrors the scalar one). All 8 solver-subclass solve()
  overrides delegate to a base solve(), so this completes coverage:
  scalar branch -> Poisson/Darcy/Projection/AdvDiff/Diffusion;
  vector branch -> Vector_Projection; velocity-block -> Stokes/VE/Constrained/NS.

- test_1017: add SNES_Vector (Vector_Projection, top-level vector PC,
  field_id=None) and Stokes_Constrained (free-slip multipliers, grouped [p,h]
  split, field_id=0) serial tests — both drive custom-P 'mg', constrained
  matches analytic SolCx to 4.4e-4.

- parallel test: add Vector (passes np=2) and Constrained. The Constrained
  parallel test is SKIPPED: Stokes_Constrained is not parallel-safe yet — it
  segfaults at np>1 independently of custom-P (canonical
  test_1062_constrained_solcx also segfaults at np=2 under plain GAMG). It
  auto-enables when the constrained solver becomes parallel-ready.

Serial: custom_mg 22 + projections + constrained green. Parallel np=2:
scalar + Stokes-velocity + vector + legacy-guard pass.

Underworld development team with AI support from Claude Code
---
 .../cython/petsc_generic_snes_solvers.pyx     |  6 ++
 .../test_1017_custom_mg_parallel_mpi.py       | 59 ++++++++++++++++++
 tests/test_1017_custom_mg_stokes.py           | 61 +++++++++++++++++--
 3 files changed, 121 insertions(+), 5 deletions(-)

diff --git a/src/underworld3/cython/petsc_generic_snes_solvers.pyx b/src/underworld3/cython/petsc_generic_snes_solvers.pyx
index 58f21ab1..7c680a88 100644
--- a/src/underworld3/cython/petsc_generic_snes_solvers.pyx
+++ b/src/underworld3/cython/petsc_generic_snes_solvers.pyx
@@ -3724,6 +3724,12 @@ class SNES_Vector(SolverBaseClass):
         # Update constants (e.g. changed material params) before solve
         self._update_constants()
 
+        # Custom geometric-MG prolongation on the (top-level vector) PC, if
+        # registered via set_custom_fmg. Mirrors the SNES_Scalar hook.
+        if self._custom_mg is not None:
+            from underworld3.utilities.custom_mg import inject_custom_mg
+            inject_custom_mg(self)
+
         # solve
         self._snes_solve_with_retries(gvec, divergence_retries, verbose)
 
diff --git a/tests/parallel/test_1017_custom_mg_parallel_mpi.py b/tests/parallel/test_1017_custom_mg_parallel_mpi.py
index e57f2c3b..0437a59e 100644
--- a/tests/parallel/test_1017_custom_mg_parallel_mpi.py
+++ b/tests/parallel/test_1017_custom_mg_parallel_mpi.py
@@ -102,6 +102,65 @@ def solcx(mesh):
     assert sol.velocity_error(sc.u) < 2.0 * solg.velocity_error(sg.u) + 1e-6
 
 
+@pytest.mark.mpi(min_size=2)
+def test_parallel_custom_fmg_vector():
+    """SNES_Vector (Vector_Projection) custom-P drives a top-level vector PCMG in
+    parallel (field_id=None, ncomp=dim). Fine mesh has NO native hierarchy, so the
+    'mg' PC can only come from our custom-P."""
+    import sympy
+    assert uw.mpi.size > 1
+    m0 = _box(0.25); dm2 = m0.dm.refine().refine()
+    coarse = [_wrap(m0.dm, m0), _wrap(m0.dm.refine(), m0)]
+    fine = _wrap(dm2, m0)
+    x, y = fine.X
+    v = uw.discretisation.MeshVariable("Ucpp", fine, fine.dim, degree=2)
+    proj = uw.systems.Vector_Projection(fine, v)
+    proj.uw_function = sympy.Matrix([[sympy.sin(sympy.pi * x) * sympy.cos(sympy.pi * y),
+                                      sympy.cos(sympy.pi * x) * sympy.sin(sympy.pi * y)]])
+    proj.smoothing = 1.0e-3
+    proj.add_dirichlet_bc((0.0, 0.0), "Bottom")
+    proj.add_dirichlet_bc((0.0, 0.0), "Top")
+    custom_mg.set_custom_fmg(proj, coarse, builder="barycentric")
+    proj.solve()
+    assert proj.snes.getKSP().getPC().getType() == "mg"
+    assert proj.snes.getConvergedReason() > 0
+
+
+@pytest.mark.skip(reason="Stokes_Constrained is not parallel-safe yet — it "
+                  "segfaults at np>1 independently of custom-P (the canonical "
+                  "test_1062_constrained_solcx also segfaults at np=2, plain GAMG). "
+                  "custom-P on the constrained velocity block works in SERIAL "
+                  "(see test_1017_custom_mg_stokes.test_custom_fmg_stokes_constrained); "
+                  "this auto-enables once the constrained solver is parallel-ready.")
+@pytest.mark.mpi(min_size=2)
+def test_parallel_custom_fmg_stokes_constrained():
+    """Stokes_Constrained (free-slip multipliers, grouped [p,h] split) custom-P on
+    the velocity block in parallel — velocity must match the analytic SolCx."""
+    import sympy
+    assert uw.mpi.size > 1
+    m0 = _box(0.25, q=3); dm2 = m0.dm.refine().refine()
+    coarse = [_wrap(m0.dm, m0, q=3), _wrap(m0.dm.refine(), m0, q=3)]
+    fine = _wrap(dm2, m0, q=3)
+    sol = A.SolCx(fine, eta_A=1.0, eta_B=1.0e6, x_c=0.5, n=1)
+    s = uw.systems.Stokes_Constrained(fine)
+    s.constitutive_model = uw.constitutive_models.ViscousFlowModel
+    s.constitutive_model.Parameters.shear_viscosity_0 = sol.fn_viscosity
+    s.bodyforce = sol.fn_bodyforce
+    s.add_constraint_bc("Left",   g=0.0, normal=sympy.Matrix([[-1.0, 0.0]]))
+    s.add_constraint_bc("Right",  g=0.0, normal=sympy.Matrix([[1.0, 0.0]]))
+    s.add_constraint_bc("Bottom", g=0.0, normal=sympy.Matrix([[0.0, -1.0]]))
+    s.add_constraint_bc("Top",    g=0.0, normal=sympy.Matrix([[0.0, 1.0]]))
+    s.petsc_use_pressure_nullspace = True
+    s.tolerance = 1.0e-9
+    s.petsc_options["snes_type"] = "ksponly"
+    custom_mg.set_custom_fmg(s, coarse, builder="barycentric", field_id=0)
+    s.solve()
+    vksp = s.snes.getKSP().getPC().getFieldSplitSubKSP()[0]
+    assert s.snes.getConvergedReason() > 0
+    assert vksp.getPC().getType() == "mg"
+    assert sol.velocity_error(s.u) < 5.0e-3
+
+
 @pytest.mark.mpi(min_size=2)
 def test_legacy_path_serial_guard():
     """The legacy finest-only set_custom_mg path is serial-only and must raise
diff --git a/tests/test_1017_custom_mg_stokes.py b/tests/test_1017_custom_mg_stokes.py
index 3ee4954e..5046d71d 100644
--- a/tests/test_1017_custom_mg_stokes.py
+++ b/tests/test_1017_custom_mg_stokes.py
@@ -1,9 +1,12 @@
-"""Layer-1 generalized FMG hierarchy on the STOKES velocity block (Phase-1 Step 1).
+"""Layer-1 generalized FMG hierarchy across the solver families that consume a mesh.
 
-Custom-built prolongations (barycentric / RBF) drive geometric multigrid on the
-velocity sub-block of the saddle-point solver, via set_custom_fmg(field_id=0).
-Validated on SolCx (eta_B=1e6): the velocity block converges in a handful of MG
-iterations (vs ~200 for GAMG) and the solution matches a GAMG reference.
+Custom-built prolongations (barycentric / RBF) drive geometric multigrid via
+set_custom_fmg, validated here on each solver-PC topology so the same adapted-mesh
+hierarchy works for every solver running on it:
+  - STOKES velocity sub-block (set_custom_fmg field_id=0) — SolCx eta_B=1e6;
+  - SNES_Vector (Vector_Projection) — top-level vector PC (field_id=None);
+  - Stokes_Constrained (free-slip via in-saddle multipliers) — velocity block under
+    the grouped [p, h] fieldsplit.
 
 The hard part (see custom_mg._install_velocity_block_transfers): the velocity
 sub-PC is unreachable until the monolithic Jacobian is assembled, so the install
@@ -105,3 +108,51 @@ def test_custom_fmg_velocity_block_rbf():
     assert vksp.getPC().getMGLevels() == len(coarse) + 1
     assert vksp.getIterationNumber() <= 25
     assert sol.velocity_error(s.u) < 1e-2
+
+
+def test_custom_fmg_vector_solver():
+    """SNES_Vector (Vector_Projection): custom-P on the top-level vector PC
+    (field_id=None) — the same scalar/single-field-vector path, ncomp=dim."""
+    import sympy
+    coarse, fine = _hierarchy()
+    x, y = fine.X
+    v = uw.discretisation.MeshVariable("Ucp", fine, fine.dim, degree=2)
+    proj = uw.systems.Vector_Projection(fine, v)
+    proj.uw_function = sympy.Matrix([[sympy.sin(sympy.pi * x) * sympy.cos(sympy.pi * y),
+                                      sympy.cos(sympy.pi * x) * sympy.sin(sympy.pi * y)]])
+    proj.smoothing = 1.0e-3
+    proj.add_dirichlet_bc((0.0, 0.0), "Bottom")
+    proj.add_dirichlet_bc((0.0, 0.0), "Top")
+    custom_mg.set_custom_fmg(proj, coarse, builder="barycentric")   # field_id=None
+    proj.solve()
+    assert proj.snes.getKSP().getPC().getType() == "mg"
+    assert proj.snes.getKSP().getPC().getMGLevels() == len(coarse) + 1
+    assert proj.snes.getConvergedReason() > 0
+
+
+def test_custom_fmg_stokes_constrained():
+    """Stokes_Constrained (free-slip via in-saddle multipliers): custom-P on the
+    velocity block under the grouped [p, h] fieldsplit. Velocity must still match
+    the analytic SolCx free-slip solution."""
+    import sympy
+    coarse, fine = _hierarchy()
+    sol = A.SolCx(fine, eta_A=1.0, eta_B=1.0e6, x_c=0.5, n=1)
+    s = uw.systems.Stokes_Constrained(fine)
+    s.constitutive_model = uw.constitutive_models.ViscousFlowModel
+    s.constitutive_model.Parameters.shear_viscosity_0 = sol.fn_viscosity
+    s.bodyforce = sol.fn_bodyforce
+    s.add_constraint_bc("Left",   g=0.0, normal=sympy.Matrix([[-1.0, 0.0]]))
+    s.add_constraint_bc("Right",  g=0.0, normal=sympy.Matrix([[1.0, 0.0]]))
+    s.add_constraint_bc("Bottom", g=0.0, normal=sympy.Matrix([[0.0, -1.0]]))
+    s.add_constraint_bc("Top",    g=0.0, normal=sympy.Matrix([[0.0, 1.0]]))
+    s.petsc_use_pressure_nullspace = True
+    s.tolerance = 1.0e-9
+    s.petsc_options["snes_type"] = "ksponly"
+    custom_mg.set_custom_fmg(s, coarse, builder="barycentric", field_id=0)
+    s.solve()
+    vksp = _vel_ksp(s)
+    assert s.snes.getConvergedReason() > 0
+    assert vksp.getPC().getType() == "mg"
+    assert vksp.getPC().getMGLevels() == len(coarse) + 1
+    assert vksp.getIterationNumber() <= 15
+    assert sol.velocity_error(s.u) < 5.0e-3

From 32fec5adb3584d54254d6b1f994c7acb6425e3f0 Mon Sep 17 00:00:00 2001
From: lmoresi <louis.moresi@anu.edu.au>
Date: Mon, 29 Jun 2026 14:43:39 +1000
Subject: [PATCH 13/15] docs(custom_mg): solver-family coverage matrix +
 constrained-parallel limitation

Underworld development team with AI support from Claude Code
---
 .../GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md      | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md b/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md
index 2cb670fc..8da85492 100644
--- a/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md
+++ b/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md
@@ -134,6 +134,23 @@ Landed (committed, tested — `test_1014/1015/1016/1017` serial 20 +
 (experimental). Hardening steps 1–3 are complete; what remains is test
 tier-classification + undrafting PR #290.
 
+**Solver-family coverage** — the install keys off the PC topology, so one of two
+branches covers every solver that consumes the mesh (all `solve()` overrides
+delegate to a hooked base `solve()`; the `inject_custom_mg` hook lives in
+`SNES_Scalar.solve`, `SNES_Vector.solve`, `SNES_Stokes_SaddlePt.solve`):
+
+| Solver family | Branch (`field_id`) | Serial | Parallel |
+|---|---|---|---|
+| `SNES_Scalar` (Poisson, Darcy, Projection, AdvDiff, Diffusion) | top-level (`None`) | ✓ | ✓ |
+| `SNES_Vector` (Vector_Projection, displacement) | top-level (`None`) | ✓ | ✓ |
+| `SNES_Stokes` / VE / NavierStokes | velocity block (`0`) | ✓ | ✓ |
+| `SNES_Stokes_Constrained` (in-saddle multipliers) | velocity block (`0`) | ✓ | **skip** ¹ |
+
+¹ The constrained solver is **not parallel-safe today** — it segfaults at np>1
+*independently of custom-P* (the canonical `test_1062_constrained_solcx` also
+segfaults at np=2 under plain GAMG). custom-P on the constrained velocity block
+works in serial; the parallel test auto-enables once the solver is parallel-ready.
+
 Hardening steps (all complete as of 2026-06-29):
 1. ~~**Stokes / saddle-point** (velocity-block injection).~~ **DONE** (2026-06-29).
    `set_custom_fmg(..., field_id=0)` drives custom-P geometric MG on the velocity

From d7a4ca17e0e78c5a233c89b008805a8ce559956b Mon Sep 17 00:00:00 2001
From: lmoresi <louis.moresi@anu.edu.au>
Date: Mon, 29 Jun 2026 15:01:36 +1000
Subject: [PATCH 14/15] docs(custom_mg): link constrained-parallel segfault to
 issue #291

Underworld development team with AI support from Claude Code
---
 .../design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md         | 8 +++++---
 tests/parallel/test_1017_custom_mg_parallel_mpi.py        | 5 +++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md b/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md
index 8da85492..4380f1d0 100644
--- a/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md
+++ b/docs/developer/design/GENERALIZED_FMG_HIERARCHY_AND_ADAPT.md
@@ -147,9 +147,11 @@ delegate to a hooked base `solve()`; the `inject_custom_mg` hook lives in
 | `SNES_Stokes_Constrained` (in-saddle multipliers) | velocity block (`0`) | ✓ | **skip** ¹ |
 
 ¹ The constrained solver is **not parallel-safe today** — it segfaults at np>1
-*independently of custom-P* (the canonical `test_1062_constrained_solcx` also
-segfaults at np=2 under plain GAMG). custom-P on the constrained velocity block
-works in serial; the parallel test auto-enables once the solver is parallel-ready.
+*independently of custom-P*, in the interior-multiplier section reduction
+(`_constrain_interior_multipliers_in_section`; **issue #291**; the canonical
+`test_1062_constrained_solcx` also segfaults at np=2 under plain GAMG; workaround
+`_reduce_interior_multiplier = False`). custom-P on the constrained velocity block
+works in serial; the parallel test auto-enables once #291 is fixed.
 
 Hardening steps (all complete as of 2026-06-29):
 1. ~~**Stokes / saddle-point** (velocity-block injection).~~ **DONE** (2026-06-29).
diff --git a/tests/parallel/test_1017_custom_mg_parallel_mpi.py b/tests/parallel/test_1017_custom_mg_parallel_mpi.py
index 0437a59e..3a3e29a5 100644
--- a/tests/parallel/test_1017_custom_mg_parallel_mpi.py
+++ b/tests/parallel/test_1017_custom_mg_parallel_mpi.py
@@ -127,11 +127,12 @@ def test_parallel_custom_fmg_vector():
 
 
 @pytest.mark.skip(reason="Stokes_Constrained is not parallel-safe yet — it "
-                  "segfaults at np>1 independently of custom-P (the canonical "
+                  "segfaults at np>1 independently of custom-P, in the "
+                  "interior-multiplier section reduction (issue #291; canonical "
                   "test_1062_constrained_solcx also segfaults at np=2, plain GAMG). "
                   "custom-P on the constrained velocity block works in SERIAL "
                   "(see test_1017_custom_mg_stokes.test_custom_fmg_stokes_constrained); "
-                  "this auto-enables once the constrained solver is parallel-ready.")
+                  "this auto-enables once #291 is fixed.")
 @pytest.mark.mpi(min_size=2)
 def test_parallel_custom_fmg_stokes_constrained():
     """Stokes_Constrained (free-slip multipliers, grouped [p,h] split) custom-P on

From 6d6d85ebdccbeba0a0839ad2ea2a45091b97e1e9 Mon Sep 17 00:00:00 2001
From: lmoresi <louis.moresi@anu.edu.au>
Date: Mon, 29 Jun 2026 21:09:42 +1000
Subject: [PATCH 15/15] fix(custom_mg): wire assembled Jacobian into KSP before
 velocity-block fieldsplit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Stokes velocity-block injection forces computeJacobian, then reaches the
fieldsplit BEFORE the SNES solve. SNESSolve normally wires the freshly-assembled
Jacobian into the KSP/outer-PC; doing it ourselves was missing, so for some
configurations (e.g. Stokes on an SBR-refined child mesh with a mesh-variable
bodyforce) the outer PC carried an UNASSEMBLED operator and PCSetUp failed with
"Matrix must be set first" (err73). Fix: snes.getKSP().setOperators(J, Pmat)
after forcing assembly, before outer_pc.setUp().

Found via the Layer-2 adapt-on-top prototype (Stokes on a fault-refined child).
Diagnosed: ksp.A set but asm=False while J.assembled=True -> operator not wired.

New regression test_custom_fmg_stokes_on_sbr_child (Stokes velocity-block custom-P
on an SBR child with a mesh-variable bodyforce) — fails pre-fix, passes after.
test_1014/1015/1016/1017 green (serial); parallel np2 unchanged.

Underworld development team with AI support from Claude Code
---
 src/underworld3/utilities/custom_mg.py |  6 +++++
 tests/test_1017_custom_mg_stokes.py    | 34 ++++++++++++++++++++++++++
 2 files changed, 40 insertions(+)

diff --git a/src/underworld3/utilities/custom_mg.py b/src/underworld3/utilities/custom_mg.py
index f8863940..32023429 100644
--- a/src/underworld3/utilities/custom_mg.py
+++ b/src/underworld3/utilities/custom_mg.py
@@ -471,6 +471,12 @@ def _install_velocity_block_transfers(solver, Ps, verbose=False):
         snes.setFromOptions()
     solver.dm.restoreGlobalVec(x0)
 
+    # Wire the freshly-assembled Jacobian into the KSP/outer-PC. SNESSolve does
+    # this lazily, but we reach the fieldsplit BEFORE the solve — without it the
+    # outer PC can carry an unassembled operator and PCSetUp fails with
+    # "Matrix must be set first" (err73) for some configurations.
+    snes.getKSP().setOperators(J, Pmat)
+
     # 2. split -> reach the velocity sub-KSP / sub-PC (field 0)
     outer_pc = snes.getKSP().getPC()
     outer_pc.setUp()
diff --git a/tests/test_1017_custom_mg_stokes.py b/tests/test_1017_custom_mg_stokes.py
index 5046d71d..bab16eb0 100644
--- a/tests/test_1017_custom_mg_stokes.py
+++ b/tests/test_1017_custom_mg_stokes.py
@@ -156,3 +156,37 @@ def test_custom_fmg_stokes_constrained():
     assert vksp.getPC().getMGLevels() == len(coarse) + 1
     assert vksp.getIterationNumber() <= 15
     assert sol.velocity_error(s.u) < 5.0e-3
+
+
+def test_custom_fmg_stokes_on_sbr_child():
+    """Stokes velocity-block custom-P on an SBR-refined CHILD mesh whose bodyforce
+    depends on a mesh variable (the Layer-2 adapt-on-top scenario). Regression for
+    the operator-not-wired bug: forcing the Jacobian before the fieldsplit left the
+    KSP carrying an unassembled operator -> PCSetUp 'Matrix must be set first'
+    (err73). Fixed by wiring the assembled Jacobian into the KSP before reaching
+    the velocity sub-PC."""
+    import sympy
+    base = uw.meshing.UnstructuredSimplexBox(
+        minCoords=(0, 0), maxCoords=(1, 1), cellSize=0.25, regular=True,
+        refinement=2, qdegree=3)
+    child = _wrap(custom_mg.sbr_refine_where(
+        base.dm_hierarchy[-1], lambda c: abs(c[0] - 0.5) < 0.15), base)
+    coarse = [_wrap(d, base) for d in base.dm_hierarchy]   # static base levels
+
+    Tp = uw.discretisation.MeshVariable("Tp", child, 1, degree=2)   # proxy field
+    Tp.data[:, 0] = 1.0
+    s = uw.systems.Stokes(child)
+    s.constitutive_model = uw.constitutive_models.ViscousFlowModel
+    s.constitutive_model.Parameters.shear_viscosity_0 = 1.0
+    s.saddle_preconditioner = 1.0
+    s.bodyforce = sympy.Matrix([[0.0], [1.0e4 * Tp.sym[0]]])   # mesh-variable bodyforce
+    s.add_dirichlet_bc((0.0, 0.0), "Bottom"); s.add_dirichlet_bc((0.0, 0.0), "Top")
+    s.add_dirichlet_bc((0.0, None), "Left"); s.add_dirichlet_bc((0.0, None), "Right")
+    s.petsc_use_pressure_nullspace = True
+    s.petsc_options["snes_type"] = "ksponly"
+    custom_mg.set_custom_fmg(s, coarse, builder="barycentric", field_id=0)
+    s.solve()
+    vksp = _vel_ksp(s)
+    assert s.snes.getConvergedReason() > 0
+    assert vksp.getPC().getType() == "mg"
+    assert vksp.getPC().getMGLevels() == len(coarse) + 1