From 981f8db960b2936eee4c0cdc20a52acc62d8a34c Mon Sep 17 00:00:00 2001 From: wenkaifan0720 Date: Fri, 26 Jun 2026 16:24:11 -0700 Subject: [PATCH 01/13] fix(osr): repair the visibility/resize/cull wedge class + host-crash texture leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root-cause fixes from a comprehensive visibility/resize/lifecycle audit (12 defects; full report in docs/OSR_VISIBILITY_RESIZE_AUDIT.md). This commit lands the keystone correctness fixes; the liveness-watchdog backstop (C-3/F-6) + create-payload visibility (C-4/F-8) are follow-ups. F-1 (keystone, main.mm DoSetVisible): force a full repaint on the hidden->visible edge. WasHidden(false) alone does not repaint, so a surface left blank/stale while hidden — by a resize that landed with the begin-frame pump gated off, a deferred dpr change, or Chromium's FrameEvictionManager reclaiming the off-screen frame past ~5 browsers — stayed permanently blank until relaunch. Now re-assert screen-info (if deferred) + size, then drive a guaranteed frame. Closes C-1b/d/e/f and the ~100ms un-hide latency (C-6). F-2 (main.mm DoResize): while hidden, keep the surface/dims swap but DEFER the paint (WasResized/begin-frame would compose into a surface nothing displays and mislead the watchdog). A deferred dpr change is flagged for F-1's un-hide repaint. F-3 (main.mm OnAfterCreated): honor a setVisible(false) that arrived before the browser bound (slot->visible already false but CEF never told) so a tile created off-screen establishes hidden instead of pumping 60fps blank. Closes C-7. F-4 (CefWebSession.swift resizeWatchdog): never force-promote while hidden — the pending surface is zero-filled (the gated pump never painted it). Wait; F-1's un-hide repaint drives a real present that promotes via the normal path. Closes C-1a/c (the active anti-heal). F-5 (FlutterCefPlugin.swift onHostDied): dispose the session before niling the maps, so unregisterTexture runs. 
Previously every host crash leaked the texture + CVPixelBuffer + IOSurface for the engine's lifetime — asymmetric vs onBrowserFailed which disposes. Closes C-2 (HIGH). cef_host compiles; the Swift compiles in the consuming app. --- docs/OSR_VISIBILITY_RESIZE_AUDIT.md | 139 ++++++++++++++++++ .../macos/Classes/CefWebSession.swift | 20 ++- .../macos/Classes/FlutterCefPlugin.swift | 8 + .../flutter_cef_macos/native/cef_host/main.mm | 59 +++++++- 4 files changed, 216 insertions(+), 10 deletions(-) create mode 100644 docs/OSR_VISIBILITY_RESIZE_AUDIT.md diff --git a/docs/OSR_VISIBILITY_RESIZE_AUDIT.md b/docs/OSR_VISIBILITY_RESIZE_AUDIT.md new file mode 100644 index 0000000..0fc21b9 --- /dev/null +++ b/docs/OSR_VISIBILITY_RESIZE_AUDIT.md @@ -0,0 +1,139 @@ +# CEF Off-Screen-Rendering Visibility / Resize / Cull / Lifecycle Audit + +Repos: +- **flutter_cef** = `/Users/wenkaifan/.pub-cache/git/flutter_cef-c29b93f39b74be493f726130ae524e39c374ff66` + - native = `packages/flutter_cef_macos/native/cef_host/main.mm` + - swift = `packages/flutter_cef_macos/macos/Classes/{CefWebSession,CefProfileHost,FlutterCefPlugin}.swift` + - dart = `lib/src/{cef_web_view,cef_web_controller}.dart` +- **Campus** = `/Users/wenkaifan/Dev/work_canvas_agentui_test` (HEAD `de0f9458`, pins flutter_cef `c29b93f`) + +--- + +## Part 1 — Confirmed issues, deduped & ranked by severity + +After dedup, the 18 confirmed entries collapse to **12 distinct defects**. The 6 "resize/visibility-while-hidden wedge" entries are genuinely distinct *members of one class* (distinct triggers, shared root + shared fix), so they are grouped as **C-1** with one row per member. + +### HIGH + +#### C-1. 
The resize/visibility-while-culled wedge class (the proven family) +Shared root: a CEF surface receives a resize **or** is shown/hidden while the per-slot begin-frame pump is gated off (`main.mm:283` `if (slot->visible)`), and **nothing forces a repaint on un-hide** (`DoSetVisible(true)` at `main.mm:1484-1487` only calls `WasHidden(false)` — no `Invalidate`/`WasResized`/`SendExternalBeginFrame`). The per-session `resizeWatchdog` then actively force-promotes a never-painted surface. No native or consumer self-heal for an already-painted tile, because the C1 first-present watchdog is retired forever on first paint (`CefProfileHost.swift:653-658`, re-arm gated on `firstPresentPending.contains` at `:702`). + +| # | Member / trigger | Repo · file:line | Severity | +|---|---|---|---| +| C-1a | **W/H geometry resize while paint-culled** — `_ensureSession` resizes on any `_lastSize!=size` with no visibility guard; cull keeps the subtree mounted+laid-out (`CullByViewport` is a paint/hit-test-only `RenderProxyBox`). Fresh IOSurface never painted while hidden → `resizeWatchdog` force-promotes blank at +300ms. Permanent blank for static pages. | dart `cef_web_view.dart:255-263`; native `main.mm:283,1417-1438,1484-1487`; swift `CefWebSession.swift:291-323`; campus `viewport_cull.dart:860`, `cef_webview_tile.dart:612`, `content_layer.dart:806-808` | **high** | +| C-1b | **DoSetVisible(true) re-asserts no geometry/begin-frame** — a resize that landed while hidden is never re-applied on un-cull; Dart already advanced `_lastSize/_lastDpr` so no corrective resize either. | native `main.mm:1484-1487`, `main.mm:1432-1437`; swift `CefWebSession.swift:291-318`; dart `cef_web_view.dart:255-262` | **high** | +| C-1c | **resizeWatchdog force-promotes a never-painted (zero-filled) surface for a hidden browser** — no visibility gate; comment's premise ("pump has been painting into it the whole time") is provably false while hidden. Active anti-heal. Live-proven by commit `de0f9458`. 
| swift `CefWebSession.swift:287-323`; native `main.mm:283,691-692,1480-1487` | **high** | +| C-1d | **Un-cull (setVisible(true)) issues no repaint on an *evicted* surface** — Chromium `FrameEvictionManager` reclaims off-screen compositor frames (>~5 browsers / memory pressure); `WasHidden(false)` returns blank/stale and resuming begin-frames does NOT repaint (documented empirically: `specs/osr-ecosystem-survey.md:114-116`, `osr-many-views.md:138`). Same-size scroll-back sends no resize (`same` guard `CefWebSession.swift:228-239`), so resizeWatchdog never arms either. Permanent silent blank. | native `main.mm:1484-1487`; swift `CefProfileHost.swift:702`, `CefWebSession.swift:228-239`; campus `cef_webview_tile.dart:612`, `agent_ui_tile.dart:1863` | **high** | +| C-1e | **Resizing a culled cefWebview tile wedges blank** (consumer-framed instance of C-1a) — campus-resize-off-screen / area-zone relayout. Maintainers' own comment documents it (`platform_view_live_mode.dart:149-160`). | dart `cef_web_view.dart:255-263`; campus `viewport_cull.dart:879-881`, `platform_view_live_mode.dart:149-161` | **high** | +| C-1f | **DPR change while culled wedges even with renderScale pinned to dpr** — the `cefRenderScale=>dpr` stabilizer (`platform_view_live_mode.dart:161`) removed only the *zoom* trigger; a mixed-DPI monitor drag / display-scaling change still flows dpr through `renderScaleOf` → resize-while-hidden. Not peer-specific (owners too). Heals on any later visible resize; permanent only on passive scroll-back. | campus `platform_view_live_mode.dart:161,203-210`, `agent_ui_tile.dart:2038`; native `main.mm:283,1484-1487`; swift `CefWebSession.swift:291-314` | **medium** | + +> Note C-1f is rated medium (narrow concurrent trigger), the rest high. All six are closed by the same two native fixes (see Fix Plan F-1, F-2), together with F-4, the Swift half of F-2, which removes the resizeWatchdog force-promote anti-heal behind C-1a/C-1c. + +#### C-2. 
Texture + IOSurface + CVPixelBuffer leak on every cef_host crash +`onHostDied` nils `sessions[sid]`/`sessionHost[sid]` **without** calling `session.dispose()`, so `registry.unregisterTexture` (its only caller, `CefWebSession.swift:476` inside `dispose()`) never runs; the later Dart `controller.dispose()` early-returns in `disposeSession` (`guard let session = sessions[id] else { return }`). `FlutterTextureRegistry` pins the texture; no `deinit`. Per crashed browser: `CefWebSession` + textureId + `CVPixelBuffer` + IOSurface + any un-promoted `pendingBuffer` leak for engine lifetime. Asymmetric vs `onBrowserFailed`/respawn-failure which DO dispose. +- swift `FlutterCefPlugin.swift:489-495` (vs dispose sites 518/576/654/660/663); `cef_web_controller.dart:888`; `FlutterCefPlugin.swift:645`; `CefWebSession.swift:185,476` +- **Severity: HIGH** (unbounded resource leak under the exact condition — host crashes — where recovery happens most). + +### MEDIUM + +#### C-3. No post-establishment liveness watchdog (native) + no consumer-side stale detector +Merges the two detector-gap findings. C1 first-present watchdog retires permanently at first paint (`firstPresentArrived` removes id from both `firstPresentPending` and `watchdogArmed`, `CefProfileHost.swift:653-658`; `checkFirstPresent` bails on `guard stillBlank`, `:721-745`); `resizeWatchdog` only covers the in-flight-resize window (`CefWebSession.swift:293`). A single browser's renderer/GPU stall inside a *shared* host keeps the host pipe alive → no `processGone`. Consumer `recover()` is triggered *only* by `onPaintStalled`(first-paint-only)/`onProcessGone`/establishment-watchdog (`cef_session_controller.dart:120-123,136-143,158-160`). A browser that paints ≥1 frame then wedges has **no detector at either layer**. 
+- swift `CefProfileHost.swift:653-658,721-745,1068`; `CefWebSession.swift:291-323`; campus `cef_session_controller.dart:120-123,136-160`, `platform_view_live_mode.dart:180-187`, `cef_webview_tile.dart:231-238,340-342` +- **Severity: MEDIUM** (missing detector / defense-in-depth; impact contingent on a post-first-paint wedge — but C-1d is exactly such a wedge, so this gap is what lets C-1d stay silent). + +#### C-4. Visibility op outruns create → tile establishes VISIBLE and paints off-screen +Merges the "PACED create" and "queued opCreateBrowser on busy shared host" findings (same root). `send()` never gates `opSetVisible` (only `opResize` gets the `!createEnqueued` guard, `CefProfileHost.swift:761`); on a connected shared host with the K=3 pacer full the create sits in `createSendQueue` while `opSetVisible(false)` reaches the wire first. cef_host drops it (`main.mm:1938-1942` `if(!slot) break;`); create payload carries no visibility (`{w}{h}{dpr}{sid}{url}`, `sendCreate:504-524`); slot defaults `visible=true` (`main.mm:238`) and pumps at 60fps. `hiddenBrowsers` desyncs (noteVisibility ran before the dropped write, suppressing the C1 watchdog). cefWebview is the exposed consumer (`cef_webview_tile.dart:612`, ungated); agent_ui is guarded (defers on `_sessionReady`, replays in onPageStarted `agent_ui_tile.dart:1799-1804`) but its eager-spawn-adopt path is still vulnerable. +- swift `CefProfileHost.swift:752-775,504-524,689,1051-1056`; native `main.mm:238,283,1293,1867,1938-1942`; campus `cef_webview_tile.dart:612`, `viewport_cull.dart:988-997`, `canvas_snapshot_restore.dart:87-107` +- **Severity: MEDIUM** (power/perf, not crash/data; self-heals on next visibility flip; agent_ui immune). + +#### C-5. 
Single CEF UI thread couples every tile on a shared host +Per-present synchronous GPU blit (`[cb waitUntilCompleted]`) plus all resize/dispose/input/visibility tasks serialize on one TID_UI; each slot's `PumpBeginFrame` ticks independently at 16ms with no cross-slot fairness or aggregate cap. N visible tiles = N uncoordinated 60fps pumps through one thread. +- native `main.mm:280-291,673-677` +- **Severity: MEDIUM** (scalability/latency ceiling; low-medium in practice on Apple Silicon unified memory — blit "~neutral" per comment `:677`). + +### LOW + +#### C-6. Un-hide drives no immediate begin-frame (~100ms first-repaint latency) +`DoSetVisible` calls `WasHidden(false)` only; first post-show frame waits for the next hidden-cadence pump tick (`slot->visible ? 16 : 100`, `main.mm:291`). Stale (not blank) frame for already-painted tiles. Same root as C-1b; the immediate-kick idiom exists in `DoResize`(1437)/`DoInvalidate`(1818) but not `DoSetVisible`. +- native `main.mm:1484-1487,280-291`; **Severity: LOW**. + +#### C-7. OnAfterCreated does not reconcile slot->visible +A `setVisible(false)` resolving a non-null-but-unbound slot runs `DoSetVisible` with `browser==null`, skipping `WasHidden` (guarded `if(slot->browser)`); `OnAfterCreated` binds the browser but never reconciles desired visibility (it DOES reconcile `close_requested` via the H3 pattern — asymmetric). Residue: missing blink page-hidden throttling on a tile created off-screen and never revealed; one wasted paint per resize-while-culled (`DoResize` `SendExternalBeginFrame` is unconditional, `main.mm:1437`). +- native `main.mm:238,1032-1048,1307-1310,1437,1485-1486`; **Severity: LOW**. + +#### C-8. onPaintStalled is first-paint-only; no consumer-side post-establishment detector +Consumer-side framing of C-3 (kept distinct because the fix is a consumer un-hide freshness check). `onSurface`/`getFrameSurface` feed only the peer-stream mirror; `setVisible(true)` on un-hide verifies no fresh frame. 
+- swift `CefProfileHost.swift:1069,1080,653-658,624`; campus `cef_session_controller.dart:120-122`, `cef_webview_tile.dart:231-238,340-342`; **Severity: LOW**. + +#### C-9. cefWebview recover() does not replay viewport-visibility to the swapped controller +Merges the two recover-visibility findings. `setViewportVisible(v) => controller.setVisible(v)` stores no state; `recover()` builds a fresh visible controller; `_onRecreate` only invalidates the peer-stream surface. `CullByViewport` sits above the generation-rebuilt subtree and is edge-triggered, so it never re-fires while the cull bool stays false. Recover-while-culled → fresh session paints off-screen until the next viewport-edge flip. agent_ui is immune (wraps body in `_TileViewportVisible` above the generation builder, replays in `didChangeDependencies`). +- campus `cef_webview_tile.dart:611-612,258-262,488-518`; `content_layer.dart:806-808`; `viewport_cull.dart:988-997`; `cef_session_controller.dart:176-200`; `agent_ui_tile.dart:686-688,1804,1857-1864`; **Severity: LOW**. + +#### C-10. Eager-warmed and headless-CDP owner sessions run VISIBLE with no body to pause them +`_eagerWarmCefTiles` warms up to 4 tiles in snapshot order with no per-tile viewport-rect check; warm session created visible. The only `setVisible` producer (`content_layer.dart:806-808`) is nested inside `BuildNearViewport.builder`, which never runs for off-screen tiles, so `onVisibility` never fires. agent_ui's instance `setViewportVisible` only updates the notifier (`agent_ui_tile.dart:1203`) — with no mounted body nothing applies it. Headless-CDP path (`agent_ui_tile.dart:523-533`) is a genuine no-body case, bounded only by agent behavior. +- campus `canvas_snapshot_restore.dart:87-107`; `cef_session_controller.dart:131-169`; `viewport_cull.dart:327-377`; `agent_ui_tile.dart:1203`; dart `cef_web_controller.dart:785`; **Severity: LOW** (restore ≤4; medium-defensible for headless-CDP — never self-heals). + +#### C-11. 
URL prop change during cold-start is silently dropped (raw CefWebView consumers) +`create()` captures the OLD url synchronously; `didUpdateWidget` gates `navigate()` on `_textureId != null`; post-create `_ensureSession` only resizes; no controller reconcile. Campus is masked (cefWebview navigates explicitly; agent_ui uses `loadHtmlString`). Latent raw-consumer API gap. +- dart `cef_web_view.dart:236-237,196-198,249,255-263`; `cef_web_controller.dart:467-524`; **Severity: LOW**. + +#### C-12. agent_ui owner double-loads the document on warm-spawn +Instance `onCreated` load (`_syncInstanceCdpDocument:613`) + body `initState` load (`_loadCurrentHtml:1874`) both fire — two `data:` navigations on the same warm-spawned controller, no de-dup. No startup flash (both loads identical, designed idempotent supersede); narrow real edge = loses ephemeral agent-driven CDP DOM/scroll state from the headless window. +- campus `canvas_snapshot_restore.dart:87-107`, `agent_ui_tile.dart:474,494-511,613,717-727,1850-1853,1874,1799`; `cef_session_controller.dart:160`; **Severity: LOW**. + +--- + +## Part 2 — Fix Plan (grouped by repo, keystone-first) + +### flutter_cef — native (`cef_host/main.mm`) — KEYSTONE +These two close the entire C-1 class (a–f) and C-6, and are the fixes commit `de0f9458` explicitly defers to flutter_cef. + +- **F-1 (keystone): Force a repaint + geometry re-assert on un-hide.** In `DoSetVisible(true)` (`main.mm:1484-1487`), on the hidden→visible edge, after `WasHidden(false)`: call `NotifyScreenInfoChanged()` (if dpr changed while hidden) + `WasResized()` + `SendExternalBeginFrame()` against current `slot->width/height/dpr` (mirror the existing immediate-kick in `DoResize:1437` / `DoInvalidate:1812-1818`). Ideally gate on a `resize_pending_on_show` flag set whenever `DoResize` runs while `!slot->visible`. This alone is sufficient for a static page and also fixes C-6's ~100ms latency. Closes **C-1b, C-1d, C-1e, C-1f, C-6**. 
+- **F-2 (keystone): Don't paint/promote while hidden — pair with F-1.** Make the `DoResize` begin-frame conditional on `slot->visible` (today unconditional at `:1437`); keep the surface/dims swap so geometry is current, defer the frame to F-1's un-hide kick. (The Swift half is F-4.) +- **F-3: Reconcile slot->visible in OnAfterCreated.** After binding `slot_->browser` and the `close_requested` check, before starting the pump (`main.mm:1032-1048`): `if (slot_->browser->GetHost() && !slot_->visible) slot_->browser->GetHost()->WasHidden(true);` — mirrors the H3 deferred-intent pattern. Closes **C-7**. +- **F-8 (C-4 preferred): Carry an initial-visible byte in the create payload.** Extend `{w}{h}{dpr}{sid}{url}` (`sendCreate:504-524` + `DoCreateBrowser` signature `:1293/1867`) and set `slot->visible` from it BEFORE `OnAfterCreated` starts `PumpBeginFrame`. Closes **C-4** at the engine (single source of truth). +- **F-9 (C-5): Host-level begin-frame fairness/cap.** Replace N independent 16ms pumps (`:280-291`) with a host pacer that round-robins/budgets `SendExternalBeginFrame` across slots and degrades per-tile cadence as visible-slot count grows; add a TID_UI-saturation detector. Do **not** drop `[cb waitUntilCompleted]` (CEF reclaims `view_src` on callback return — true zero-copy impossible). Mitigates **C-5**. + +### flutter_cef — Swift +- **F-4 (keystone, pairs with F-2): Make resizeWatchdog visibility-aware.** Thread hidden state into `CefWebSession` (mirror `CefProfileHost.hiddenBrowsers` via the existing `setVisible` plumbing, `CefWebSession.swift:365-367`); in the `givenUp` branch (`:291-323`) **defer** force-promotion while hidden — keep serving the old `pixelBuffer` until a real present for `pendingSurfaceId` lands after un-hide. Closes the anti-heal in **C-1a, C-1c**. 
+- **F-5 (C-2): Dispose the session before niling the maps in onHostDied.** In `FlutterCefPlugin.swift:489-495`: capture `let session = self.sessions[sid]`, nil the four maps, then `session?.dispose()` (zeroes textureId under `bufferLock`, calls `unregisterTexture`). Optionally `host.shutdown()` first to match `disposeSession` ordering. Closes **C-2**. +- **F-6 (C-3): Steady-state per-browser liveness probe.** Track `lastPresentNs` per browser (set where `presentCount` bumps, `CefProfileHost.swift:1068`). Periodic sweep over live, visible (not in `hiddenBrowsers`), already-established browsers: if no present for a generous env-tunable window, send `opInvalidate` (the only discriminator between healthy-idle-static and wedged); if still none after a short grace, emit `onPaintStalled(id)` → routes into Campus's existing bounded `recover()`. Exempt hidden + `firstPresentPending` browsers. Closes **C-3** (and gives **C-8** a backstop). +- **F-8b (C-4 alt): Replay last opSetVisible on opCreated** (`CefProfileHost.swift:1051-1056`) — equivalent to F-8 if the payload approach is undesirable. + +### flutter_cef — Dart (`cef_web_view.dart`) +- **F-7 (C-1 belt-and-suspenders): Visibility-aware resize defer.** Give `CefWebView` a `visible`/`paused` signal (Campus already tracks via `setViewportVisible`); in `_ensureSession` (`:255-263`) defer the `resize()` branch while hidden — record requested size, **do NOT advance `_lastSize/_lastDpr`** — then force-apply the coalesced latest size on un-hide. A pure-native fix leaves `_lastSize` advanced past an unpainted buffer, so this complements F-1/F-4. +- **F-10 (C-11): Navigate-on-drift after cold-start.** In `_ensureSession`, capture `final createdUrl = widget.url;` and after create resolves + `_textureId` set: `if (mounted && widget.url != createdUrl) _controller.navigate(widget.url);` (compare against captured create-url, not the racy `_controller.url.value`). Closes **C-11**. 
+ +### Campus (`work_canvas_agentui_test`) +- **F-11 (C-1 stopgap until F-1/F-4 land): Hold renderScale/size while hidden.** In `_CefSurfaceView`/`_ensureSession` callers, freeze the `renderScale` passed to `CefWebView` to its last-visible value while `_desiredVisible == false`; re-apply latest on un-cull so a single corrective resize-while-visible paints. Interim mask for **C-1f**. +- **F-12 (C-9): Replay viewport-visibility on recover for cefWebview.** Add `bool _viewportVisible` to `_CefWebviewTileInstance`, set it in `setViewportVisible`, re-apply in `_onRecreate` deferred to the new session's `onCreated` — or wrap the body in the same `_TileViewportVisible` notifier agent_ui uses (hoist it to a shared file). Closes **C-9**. +- **F-13 (C-10): Warm-spawn off-screen sessions hidden.** In both `warmSpawnCef` impls, push `controller.setVisible(false)` by default for tiles not currently near-viewport (and the headless/no-body case); un-hide on first cull-visible. Make agent_ui's instance apply `setVisible` to the owner controller directly (gated on `_session?.isCreated`) like cefWebview, rather than only setting the notifier. Closes **C-10**. +- **F-14 (C-12): De-dup the agent_ui owner double-load.** Add a shared "loaded current description revision" marker set in `_syncInstanceCdpDocument`; body checks it before `_loadCurrentHtml` in `initState` and skips when the live doc already matches. Closes **C-12**. +- **F-15 (C-4 belt-and-suspenders): Gate `cef_webview_tile.dart:612`** to defer `setViewportVisible` until the session is created and re-assert on create (mirror agent_ui's onPageStarted replay). Secondary to F-8. + +### Re-enabling the dpr×clamp(zoom,1,3) crispness +The Campus stabilizer pinned `cefRenderScale(dpr,zoom)=>dpr` (`platform_view_live_mode.dart:161`) only to remove the *zoom*-density resize trigger. 
**Landing F-1 + F-2/F-4 (un-hide repaint + don't-paint/promote-while-hidden) is the gate** — those make every resize-while-hidden self-heal on un-cull, after which Campus can revert line 161 to the intended `dpr*clamp(zoom,1,3)` and un-pin `renderScaleOf`. F-7 (Dart defer) and F-11 (Campus freeze) provide interim partial cover but do not by themselves make crispness safe to restore — the native un-hide repaint is required because Dart cannot force a frame into an evicted/hidden surface. + +**Landing order:** F-1 → F-2/F-4 (keystone pair, closes C-1 class) → F-5 (C-2 leak, independent, high) → F-6 (C-3 liveness, gives C-1d/C-8 a backstop) → F-3, F-8 (create-time correctness) → F-7/F-10 (Dart) → F-11–F-15 (Campus) → F-9 (C-5, larger refactor) → **then** restore crispness. + +--- + +## Part 3 — Completeness note: transition classes NOT covered (round-2 targets) + +The audit was deep on **single-host hide/show, resize, dpr, cull, recover, crash-of-whole-host, and create-ordering**. The following transition classes named in the mission were **not** (or only glancingly) exercised and are open for a round-2 file:line-anchored pass: + +1. **Secondary-window / multi-view promotion teardown.** The two multi-view findings (`isPrimaryFlutterView` non-reactive gate `agent_ui_tile.dart:698`; owner controller double-register during a primary-view flip) were **refuted**, but the refutations leaned on `WorkCanvasMultiWindowRoot.didChangeMetrics` (`work_canvas_multi_window_root.dart:55-92`) actually firing on macOS window close/minimize. That platform assumption was reasoned, not observed. Round-2: confirm the metrics event empirically, and audit the *dying* primary's `CullByViewport.detach` (it sends no final `setVisible(false)`) plus IOSurface/texture handoff when a tile's owning `View` migrates between windows. + +2. 
**Slot/wire-id reuse across a host respawn under load.** The "verified clean" coverage note checked monotonic `nextBrowserId` + host-identity filter, but only statically. Round-2: a stress test that crashes a shared host mid-resize with many in-flight presents, checking for present-tag → wrong-session delivery during the respawn window (`FlutterCefPlugin.swift:477-503`, `:534-539`). + +3. **Process-gone of a *single renderer* inside a shared host (not whole-host EOF).** C-3 establishes there is no detector; nobody traced what CEF does to the *other* slots' begin-frame pump / GPU channel when one renderer in a shared `cef_host` dies (GPU-process loss vs renderer-process loss). Round-2: does a renderer crash stall TID_UI or the Metal blit for sibling tiles? + +4. **IOSurface lifetime across rapid resize churn + dispose interleave.** Findings covered force-promote of a single pending buffer; not covered: a dispose racing a chain of `maybeSendNextResize` (`CefWebSession.swift:331`) leaving an orphaned `pendingBuffer`/`ioSurface` retain, or an OS-recycled IOSurface global id colliding mid-resize. + +5. **Begin-frame credit / pump leak on dispose.** `PumpBeginFrame` self-reposts and only dies on slot dispose (`main.mm:282`). Not audited: whether a dispose that races `OnAfterCreated` (slot inserted `:1307` but browser not yet bound) can leave a self-reposting pump targeting a half-torn-down slot, or double-start the pump. + +6. **CDP / agent-browser driving a culled or mid-resize surface.** `campus webview`/CDP input delivery to a hidden or being-resized session (input→disposed/wrong session) was not exercised; the headless-CDP path (C-10) was only analyzed for visibility, not for input/eval routing during a recover swap. + +7. **Software (non-accelerated) paint path.** All wedge analysis assumed the Metal/IOSurface accelerated path. 
`CompositeSoftwareLocked` (`main.mm:652-658`) and `BlitBGRA` were only touched in refuted crop findings; the software fallback's behavior under hide/resize was not separately verified. + +8. **Display reconfiguration / GPU reset / sleep-wake.** Monitor hot-plug, GPU switch (discrete↔integrated), and system sleep/wake invalidate Metal devices and IOSurfaces wholesale — entirely outside this audit's scope and a likely source of post-establishment blank-with-no-detector wedges (overlaps C-3). diff --git a/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift b/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift index b68d69b..2371fc2 100644 --- a/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift +++ b/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift @@ -156,6 +156,11 @@ final class CefWebSession: NSObject, FlutterTexture { // has since gone out — so during a smoothly-advancing drag the watchdog is a no-op, and it // only acts when a resize wedges (generation stops advancing because no present came). private var resizeGen: UInt64 = 0 + // F-4: mirrors the cef_host slot's hidden state (set by setVisible). While hidden the + // begin-frame pump is gated off so no present can land — the resize watchdog must NOT + // force-promote a never-painted (blank) buffer; it waits for the native un-hide repaint + // (F-1) to drive a real present. Guarded by bufferLock like the rest of the buffer state. 
+ private var hidden = false private let bufferLock = NSLock() /// The live IOSurface id this session's buffer is backed by, or 0 before @@ -291,7 +296,12 @@ final class CefWebSession: NSObject, FlutterTexture { private func resizeWatchdog(_ gen: UInt64) { bufferLock.lock() let active = resizeInFlight && gen == resizeGen - let givenUp = active && (nowNs() &- resizeSentAtNs) > 300_000_000 + // F-4: never force-promote while hidden — the pending surface is zero-filled (the gated + // pump never painted it), so promoting it wedges the texture permanently blank. Wait + // instead; the native hidden->visible repaint (F-1) drives a real present that promotes + // the pending buffer through the normal present path. + let isHidden = hidden + let givenUp = active && !isHidden && (nowNs() &- resizeSentAtNs) > 300_000_000 var promotedTid: Int64 = 0 var promotedSid: UInt32 = 0 var promotedW = 0, promotedH = 0 @@ -316,7 +326,10 @@ final class CefWebSession: NSObject, FlutterTexture { return } guard active else { return } - sendFrame(Self.opInvalidate, []) + // While hidden, opInvalidate can't paint (the pump is gated) — skip the nudge but keep + // the watchdog alive so it resumes promoting once visible; the un-hide repaint promotes + // via the present path first, after which resizeInFlight clears and this self-terminates. + if !isHidden { sendFrame(Self.opInvalidate, []) } DispatchQueue.main.asyncAfter(deadline: .now() + 0.08) { [weak self] in self?.resizeWatchdog(gen) } @@ -363,6 +376,9 @@ final class CefWebSession: NSObject, FlutterTexture { /// CefBrowserHost::WasHidden(true) so an off-screen tile stops rendering; the /// session and browser stay alive, so it's a cheap toggle, not a teardown. func setVisible(_ visible: Bool) { + bufferLock.lock() + hidden = !visible + bufferLock.unlock() sendFrame(Self.opSetVisible, [visible ? 
1 : 0]) } diff --git a/packages/flutter_cef_macos/macos/Classes/FlutterCefPlugin.swift b/packages/flutter_cef_macos/macos/Classes/FlutterCefPlugin.swift index bb77543..c021c6e 100644 --- a/packages/flutter_cef_macos/macos/Classes/FlutterCefPlugin.swift +++ b/packages/flutter_cef_macos/macos/Classes/FlutterCefPlugin.swift @@ -488,6 +488,14 @@ public class FlutterCefPlugin: NSObject, FlutterPlugin { let goneSessions = self.sessionHost.compactMap { $0.value === host ? $0.key : nil } for sid in goneSessions { self.emit("processGone", ["sessionId": sid, "reason": reason]) + // F-5: dispose the session BEFORE niling the maps. dispose() is the only caller of + // registry.unregisterTexture (+ frees the CVPixelBuffer / IOSurface / any pending + // buffer). If we just nil sessions[sid], the later Dart controller.dispose -> + // disposeSession early-returns on the now-missing session, so the texture + surfaces + // leak for the engine's lifetime — on EVERY host crash, exactly when recovery (a + // fresh create) happens most. (onBrowserFailed / respawn-failure already dispose; + // this path was the asymmetric leak.) + self.sessions[sid]?.dispose() self.sessions[sid] = nil self.sessionHost[sid] = nil self.sessionKey[sid] = nil diff --git a/packages/flutter_cef_macos/native/cef_host/main.mm b/packages/flutter_cef_macos/native/cef_host/main.mm index bf7a287..7fcc429 100644 --- a/packages/flutter_cef_macos/native/cef_host/main.mm +++ b/packages/flutter_cef_macos/native/cef_host/main.mm @@ -237,6 +237,12 @@ // DoSetVisible); `begin_frame_pump_started` guards a double-start. UI-thread only. bool visible = true; bool begin_frame_pump_started = false; + // F-1/F-2: a dpr/screen-info change that lands while the slot is HIDDEN is deferred — + // the begin-frame pump is gated off while hidden, so notifying + painting now would + // composite into a surface nothing displays and mislead the Swift resize watchdog into + // promoting a never-painted buffer. 
DoResize sets this while hidden; DoSetVisible's + // hidden->visible edge re-asserts screen info before forcing a full repaint. UI-thread only. + bool needs_screen_info_on_show = false; // Per-slot pump-tick + accelerated-paint counters, logged from PumpBeginFrame when // FLUTTER_CEF_DEBUG is set — diagnostics for paint-stall investigation at scale. uint64_t diag_pump_ticks = 0; @@ -1039,6 +1045,12 @@ void OnAfterCreated(CefRefPtr browser) override { browser->GetHost()->CloseBrowser(true); return; } + // F-3: reconcile a visibility intent that arrived before the browser bound. A + // setVisible(false) on a still-creating slot ran DoSetVisible with browser==null + // (WasHidden skipped), so slot_->visible is already false but CEF never heard it — + // the slot would establish VISIBLE and pump at 60fps off-screen until the next flip. + // Honor the recorded intent now (mirrors the close_requested deferred-intent pattern). + if (!slot_->visible) browser->GetHost()->WasHidden(true); // Start the external begin-frame pump now that the browser is bound. We turned the internal // frame timer OFF (external_begin_frame_enabled), so without this nothing ever paints. if (!slot_->begin_frame_pump_started) { @@ -1428,13 +1440,24 @@ void DoResize(const std::shared_ptr& slot, int w, int h, slot->dst_mtl_sid = 0; } if (slot->browser) { - // A device-scale change needs the renderer told (screen info), not just a relayout. - if (dpr_changed) slot->browser->GetHost()->NotifyScreenInfoChanged(); - slot->browser->GetHost()->WasResized(); - // Drive a frame right now at the new size. With external begin-frame this is a guaranteed - // tick (not a coalesce-able Invalidate request), so the re-laid-out content composites into - // the new surface immediately; PumpBeginFrame's ongoing ticks cover the heavy-page settle. - slot->browser->GetHost()->SendExternalBeginFrame(); + if (slot->visible) { + // A device-scale change needs the renderer told (screen info), not just a relayout. 
+ if (dpr_changed) slot->browser->GetHost()->NotifyScreenInfoChanged(); + slot->browser->GetHost()->WasResized(); + // Drive a frame right now at the new size. With external begin-frame this is a guaranteed + // tick (not a coalesce-able Invalidate request), so the re-laid-out content composites into + // the new surface immediately; PumpBeginFrame's ongoing ticks cover the heavy-page settle. + slot->browser->GetHost()->SendExternalBeginFrame(); + } else { + // F-2: HIDDEN — the begin-frame pump is gated off (PumpBeginFrame skips while + // !visible), so WasResized()+SendExternalBeginFrame() here would never paint the + // freshly-swapped (blank) surface, yet the Swift resizeWatchdog would force-promote + // it to the live texture → permanent blank on a static page. The surface + dims are + // already swapped above (geometry is current); defer the screen-info re-assert + the + // repaint to DoSetVisible's hidden->visible edge (F-1). WasResized while hidden is + // pointless (no frame can result), so it is dropped, not deferred. + if (dpr_changed) slot->needs_screen_info_on_show = true; + } } } @@ -1482,8 +1505,28 @@ void DoSetZoom(const std::shared_ptr& slot, double level) { // alive, so this is a cheap pause/resume — not a teardown. The host pauses a // tile that scrolls fully out of the canvas viewport and resumes it on return. void DoSetVisible(const std::shared_ptr& slot, bool visible) { + const bool was_visible = slot->visible; slot->visible = visible; // PumpBeginFrame reads this to idle the begin-frame pump while hidden - if (slot->browser) slot->browser->GetHost()->WasHidden(!visible); + if (!slot->browser) return; + slot->browser->GetHost()->WasHidden(!visible); + // F-1 (keystone): on the hidden->visible edge, FORCE a fresh full-viewport repaint at the + // current geometry. 
WasHidden(false) alone does NOT repaint, and three things can have left + // the live texture blank/stale while hidden: (a) a resize landed while the pump was gated off + // (F-2 deferred its paint here); (b) a dpr/screen-info change was deferred; (c) Chromium's + // FrameEvictionManager reclaimed the off-screen compositor frame entirely (happens past ~5 + // browsers / under memory pressure) so there is nothing to show even though geometry is + // unchanged. Re-assert screen info (if a dpr change was deferred) + size, then drive a + // guaranteed frame — mirrors DoResize/DoInvalidate. Unconditional on the edge because the + // eviction case carries no resize to key off. + if (visible && !was_visible) { + if (slot->needs_screen_info_on_show) { + slot->browser->GetHost()->NotifyScreenInfoChanged(); + slot->needs_screen_info_on_show = false; + } + slot->browser->GetHost()->WasResized(); + slot->browser->GetHost()->Invalidate(PET_VIEW); + slot->browser->GetHost()->SendExternalBeginFrame(); + } } void DoFind(const std::shared_ptr& slot, const std::string& text, bool forward, bool match_case, bool find_next) { From 9a864ad2d6b319f52c028dc504b1d8cd2f12baf2 Mon Sep 17 00:00:00 2001 From: wenkaifan0720 Date: Fri, 26 Jun 2026 16:32:21 -0700 Subject: [PATCH 02/13] test(osr): standalone F-4 watchdog test + real-CEF cull-wedge probe (provable outside Campus) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit flutter_cef fixes must be testable in flutter_cef, single-view, before Campus. - ResizeWatchdogPolicy: extract the resize-watchdog force-promote gating (the F-4 "never promote a hidden / never-painted surface" decision) into a dependency-light pure policy (Swift stdlib only). CefWebSession.resizeWatchdog now calls it. - ResizeWatchdogPolicyTests + run_resize_watchdog_tests.sh: 11 standalone assertions, compiled + run with `swiftc` alone (no Xcode, no pod harness, no Campus) — same pattern as CdpRelayFilterTests. 
Proves the wedge guard: hidden never promotes; visible+timed-out does; superseded/promoted never do. - example/lib/cull_wedge_probe.dart: a real-CEF single-view probe that drives the exact wedge sequence (setVisible(false) → resize while hidden → setVisible(true)) on a gradient page with a ticking JS clock. The page must REPAINT on show (F-1) — pre-fix it stayed permanently blank. Run: FLUTTER_CEF_HOST=<.../cef_host.app/Contents/MacOS/cef_host> flutter run -d macos -t lib/cull_wedge_probe.dart. --- example/lib/cull_wedge_probe.dart | 150 ++++++++++++++++++ .../macos/Classes/CefWebSession.swift | 16 +- .../macos/Classes/ResizeWatchdogPolicy.swift | 38 +++++ .../test/ResizeWatchdogPolicyTests.swift | 67 ++++++++ .../test/run_resize_watchdog_tests.sh | 9 ++ 5 files changed, 274 insertions(+), 6 deletions(-) create mode 100644 example/lib/cull_wedge_probe.dart create mode 100644 packages/flutter_cef_macos/macos/Classes/ResizeWatchdogPolicy.swift create mode 100644 packages/flutter_cef_macos/test/ResizeWatchdogPolicyTests.swift create mode 100755 packages/flutter_cef_macos/test/run_resize_watchdog_tests.sh diff --git a/example/lib/cull_wedge_probe.dart b/example/lib/cull_wedge_probe.dart new file mode 100644 index 0000000..7ca0c35 --- /dev/null +++ b/example/lib/cull_wedge_probe.dart @@ -0,0 +1,150 @@ +// Cull/visibility WEDGE probe — verifies the OSR surface REPAINTS after the cull +// transitions that used to wedge it permanently blank (only relaunch recovered): +// * setVisible(false) → resize while hidden → setVisible(true) +// * setVisible(false) → setVisible(true) after the off-screen frame is evicted +// The native fix (F-1): DoSetVisible(true) forces a full repaint on the hidden→visible +// edge; (F-2) DoResize defers its paint while hidden; (F-4) the resize watchdog never +// force-promotes a hidden (never-painted) surface. Without these, the page below stays +// BLANK after "Wedge cycle"; with them it reappears (gradient + the ticking clock proves +// the frame is FRESH, not a stale cached one).
+// +// Controls: Hide / Show toggle visibility; Resize toggles the logical size; "Wedge cycle" +// runs hide→resize→show automatically. Watch the page: it must come back, filling the +// (possibly new) size, with the clock ticking. +// +// Run (single-view, real cef_host — no Campus): +// FLUTTER_CEF_HOST=<.../cef_host.app/Contents/MacOS/cef_host> \ +// FLUTTER_CEF_ALLOW_INSECURE_PROFILE=1 \ +// flutter run -d macos -t lib/cull_wedge_probe.dart +import 'dart:async'; +import 'package:flutter/material.dart'; +import 'package:flutter_cef/flutter_cef.dart'; + +// Full-bleed gradient + a big label + a JS clock. Blank vs painted is unmistakable, and +// the ticking clock distinguishes a FRESH repaint from a frozen/stale frame. +const _html = ''' + +
RENDERED
+
+
clock ticking = fresh frame
+ +'''; + +void main() => runApp(const WedgeApp()); + +class WedgeApp extends StatefulWidget { + const WedgeApp({super.key}); + @override + State createState() => _WedgeAppState(); +} + +class _WedgeAppState extends State { + final _controller = CefWebController(); + bool _visible = true; + bool _big = false; + String _status = 'ready'; + + @override + void initState() { + super.initState(); + _controller.onPageStarted = (_) => _controller.loadHtmlString(_html); + } + + @override + void dispose() { + _controller.dispose(); + super.dispose(); + } + + void _setVisible(bool v) { + setState(() { + _visible = v; + _status = v ? 'shown' : 'HIDDEN'; + }); + _controller.setVisible(v); + } + + void _toggleSize() => setState(() { + _big = !_big; + _status = 'resized to ${_big ? "480×360" : "360×300"}'; + }); + + // The exact wedge sequence: hide → resize WHILE HIDDEN → show. The page must come back + // at the new size with the clock ticking. Pre-fix it stayed permanently blank here. + Future _wedgeCycle() async { + setState(() => _status = 'cycle: hiding…'); + _setVisible(false); + await Future.delayed(const Duration(milliseconds: 400)); + setState(() { + _big = !_big; + _status = 'cycle: resized while hidden…'; + }); + await Future.delayed(const Duration(milliseconds: 400)); + setState(() => _status = 'cycle: showing — page MUST repaint'); + _setVisible(true); + } + + @override + Widget build(BuildContext context) { + final w = _big ? 480.0 : 360.0; + final h = _big ? 360.0 : 300.0; + return MaterialApp( + debugShowCheckedModeBanner: false, + home: Scaffold( + backgroundColor: const Color(0xFF111722), + body: Column( + children: [ + Container( + width: double.infinity, + color: const Color(0xFF0B1220), + padding: const EdgeInsets.symmetric(horizontal: 12, vertical: 8), + child: Row( + children: [ + Expanded( + child: Text( + 'visible=$_visible size=${w.toInt()}×${h.toInt()} $_status', + style: const TextStyle(color: Colors.white, fontSize: 13), + ), + ), + _btn(_visible ? 
'Hide' : 'Show', + () => _setVisible(!_visible)), + const SizedBox(width: 6), + _btn('Resize', _toggleSize), + const SizedBox(width: 14), + _btn('Wedge cycle', _wedgeCycle, wide: true), + ], + ), + ), + Expanded( + child: Center( + // A solid padded backdrop, colored differently from the scaffold, so a BLANK + // (wedged) surface reads as obviously empty, not just a same-colored void. + child: Container( + color: const Color(0xFF2A3340), + padding: const EdgeInsets.all(24), + child: SizedBox( + width: w, + height: h, + child: CefWebView(url: 'about:blank', controller: _controller), + ), + ), + ), + ), + ], + ), + ), + ); + } + + Widget _btn(String label, VoidCallback onTap, {bool wide = false}) => ElevatedButton( + onPressed: onTap, + style: ElevatedButton.styleFrom( + minimumSize: Size(wide ? 110 : 64, 36), + padding: const EdgeInsets.symmetric(horizontal: 8), + ), + child: Text(label), + ); +} diff --git a/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift b/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift index 2371fc2..f864021 100644 --- a/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift +++ b/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift @@ -295,13 +295,17 @@ final class CefWebSession: NSObject, FlutterTexture { /// Main-thread only, so sendFrame / textureFrameAvailable stay serialized. private func resizeWatchdog(_ gen: UInt64) { bufferLock.lock() - let active = resizeInFlight && gen == resizeGen - // F-4: never force-promote while hidden — the pending surface is zero-filled (the gated - // pump never painted it), so promoting it wedges the texture permanently blank. Wait - // instead; the native hidden->visible repaint (F-1) drives a real present that promotes - // the pending buffer through the normal present path.
let isHidden = hidden - let givenUp = active && !isHidden && (nowNs() &- resizeSentAtNs) > 300_000_000 + // F-4 (gating extracted to ResizeWatchdogPolicy for standalone unit tests): never + // force-promote while hidden — the pending surface is zero-filled (the gated pump never + // painted it), so promoting it wedges the texture permanently blank. Wait instead; the + // native hidden->visible repaint (F-1) drives a real present that promotes the pending + // buffer through the normal present path. + let active = ResizeWatchdogPolicy.shouldKeepWaiting( + inFlight: resizeInFlight, gen: gen, currentGen: resizeGen) + let givenUp = ResizeWatchdogPolicy.shouldForcePromote( + inFlight: resizeInFlight, gen: gen, currentGen: resizeGen, + hidden: isHidden, elapsedNs: nowNs() &- resizeSentAtNs, thresholdNs: 300_000_000) var promotedTid: Int64 = 0 var promotedSid: UInt32 = 0 var promotedW = 0, promotedH = 0 diff --git a/packages/flutter_cef_macos/macos/Classes/ResizeWatchdogPolicy.swift b/packages/flutter_cef_macos/macos/Classes/ResizeWatchdogPolicy.swift new file mode 100644 index 0000000..9643a61 --- /dev/null +++ b/packages/flutter_cef_macos/macos/Classes/ResizeWatchdogPolicy.swift @@ -0,0 +1,38 @@ +// Pure decision policy for the resize watchdog (CefWebSession.resizeWatchdog) — the +// hidden/in-flight/elapsed gating, with NO dependency on Flutter, CEF, IOSurface, or the +// host IPC. Extracted so the gating that prevents the visibility/resize WEDGE (F-4: never +// force-promote a never-painted surface for a HIDDEN browser) is unit-testable standalone +// — compiles + runs with `swiftc` alone, exactly like CdpRelay's filter tests: +// +// swiftc macos/Classes/ResizeWatchdogPolicy.swift test/ResizeWatchdogPolicyTests.swift \ +// -o /tmp/rwd && /tmp/rwd +// +// Depends only on the Swift stdlib. +import Foundation + +enum ResizeWatchdogPolicy { + /// Whether the watchdog should FORCE-PROMOTE the pending (post-resize) surface to the + /// live texture. 
This is the fallback for a static page that produced its one post-resize frame + /// but whose present was dropped/mis-tagged. + /// + /// - `inFlight` / `gen` / `currentGen`: a newer resize (gen advanced) cancels this one. + /// - `hidden`: **the F-4 fix** — while hidden the begin-frame pump is gated off, so the + /// pending surface is zero-filled (never painted); promoting it wedges the texture + /// permanently blank. Must NOT promote while hidden — wait for the native un-hide + /// repaint (F-1) to drive a real present that promotes through the normal path. + /// - `elapsedNs` / `thresholdNs`: promote only once `elapsedNs` exceeds `thresholdNs`. + static func shouldForcePromote(inFlight: Bool, gen: UInt64, currentGen: UInt64, + hidden: Bool, elapsedNs: UInt64, + thresholdNs: UInt64) -> Bool { + guard inFlight, gen == currentGen else { return false } // superseded / already promoted + if hidden { return false } // F-4: never promote a hidden (blank) surface + return elapsedNs > thresholdNs + } + + /// Whether the watchdog should keep re-scheduling itself (stay alive) for this resize — + /// true while the resize is still the in-flight one (incl. while hidden, so it resumes + /// promoting once visible). Pairs with `shouldForcePromote`: exactly one is acted on. + static func shouldKeepWaiting(inFlight: Bool, gen: UInt64, currentGen: UInt64) -> Bool { + return inFlight && gen == currentGen + } +} diff --git a/packages/flutter_cef_macos/test/ResizeWatchdogPolicyTests.swift b/packages/flutter_cef_macos/test/ResizeWatchdogPolicyTests.swift new file mode 100644 index 0000000..9d7994c --- /dev/null +++ b/packages/flutter_cef_macos/test/ResizeWatchdogPolicyTests.swift @@ -0,0 +1,67 @@ +// Standalone unit tests for ResizeWatchdogPolicy — the resize-watchdog force-promote +// gating, and specifically the F-4 fix: NEVER force-promote a pending surface while the +// browser is HIDDEN (the gated begin-frame pump never painted it, so promoting it wedges +// the texture permanently blank).
ResizeWatchdogPolicy depends only on the Swift stdlib, +// so this compiles + runs without Xcode or the Flutter/pod harness: +// +// ./test/run_resize_watchdog_tests.sh (or) swiftc macos/Classes/ResizeWatchdogPolicy.swift \ +// test/ResizeWatchdogPolicyTests.swift -o /tmp/rwd && /tmp/rwd +import Foundation + +@main +enum ResizeWatchdogPolicyTests { + static var failures = 0 + static func check(_ name: String, _ cond: Bool) { + print((cond ? " PASS " : " FAIL ") + name) + if !cond { failures += 1 } + } + + static let threshold: UInt64 = 300_000_000 // 300ms, matching the watchdog + static let pastGrace: UInt64 = 400_000_000 + static let withinGrace: UInt64 = 100_000_000 + + static func promote(inFlight: Bool = true, gen: UInt64 = 1, currentGen: UInt64 = 1, + hidden: Bool = false, elapsedNs: UInt64 = pastGrace) -> Bool { + ResizeWatchdogPolicy.shouldForcePromote( + inFlight: inFlight, gen: gen, currentGen: currentGen, + hidden: hidden, elapsedNs: elapsedNs, thresholdNs: threshold) + } + + static func main() { + // ── The F-4 fix: HIDDEN must never force-promote, no matter how long it's been ── + check("hidden + timed-out → NO promote (the wedge guard)", + promote(hidden: true, elapsedNs: pastGrace) == false) + check("hidden + way past grace → still NO promote", + promote(hidden: true, elapsedNs: threshold * 100) == false) + + // ── Visible: the normal force-promote fallback still works ── + check("visible + in-flight + past grace → promote", + promote(hidden: false, elapsedNs: pastGrace) == true) + check("visible + within grace → wait, don't promote yet", + promote(hidden: false, elapsedNs: withinGrace) == false) + + // ── Superseded / inactive resizes never promote (visible or not) ── + check("newer resize (gen advanced) → no promote", + promote(gen: 1, currentGen: 2, elapsedNs: pastGrace) == false) + check("not in flight (already promoted) → no promote", + promote(inFlight: false, elapsedNs: pastGrace) == false) + check("newer resize while hidden → no promote", + 
promote(gen: 1, currentGen: 2, hidden: true, elapsedNs: pastGrace) == false) + + // ── shouldKeepWaiting: the watchdog stays alive while the resize is current, INCLUDING + // while hidden (so it resumes promoting once visible) — independent of `hidden`. ── + check("keep waiting while in-flight + current (visible)", + ResizeWatchdogPolicy.shouldKeepWaiting(inFlight: true, gen: 1, currentGen: 1) == true) + check("keep waiting while in-flight + current (hidden too)", + ResizeWatchdogPolicy.shouldKeepWaiting(inFlight: true, gen: 1, currentGen: 1) == true) + check("stop waiting once superseded", + ResizeWatchdogPolicy.shouldKeepWaiting(inFlight: true, gen: 1, currentGen: 2) == false) + check("stop waiting once promoted (not in flight)", + ResizeWatchdogPolicy.shouldKeepWaiting(inFlight: false, gen: 1, currentGen: 1) == false) + + print(failures == 0 + ? "\nALL ResizeWatchdogPolicy TESTS PASSED" + : "\n\(failures) ResizeWatchdogPolicy TEST(S) FAILED") + exit(failures == 0 ? 0 : 1) + } +} diff --git a/packages/flutter_cef_macos/test/run_resize_watchdog_tests.sh b/packages/flutter_cef_macos/test/run_resize_watchdog_tests.sh new file mode 100755 index 0000000..d14f32c --- /dev/null +++ b/packages/flutter_cef_macos/test/run_resize_watchdog_tests.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# Compile + run the standalone ResizeWatchdogPolicy unit tests (the F-4 visibility-gating +# that prevents the resize/cull wedge). ResizeWatchdogPolicy uses only the Swift stdlib, +# so no Xcode/pod harness is needed. +set -euo pipefail +DIR="$(cd "$(dirname "$0")/.." 
&& pwd)" +OUT="$(mktemp -d)/rwd" +swiftc "$DIR/macos/Classes/ResizeWatchdogPolicy.swift" "$DIR/test/ResizeWatchdogPolicyTests.swift" -o "$OUT" +"$OUT" From a884308d7872c48e16298df1d8d363c861d1f183 Mon Sep 17 00:00:00 2001 From: wenkaifan0720 Date: Fri, 26 Jun 2026 16:51:16 -0700 Subject: [PATCH 03/13] =?UTF-8?q?fix(osr):=20F-6=20steady-state=20liveness?= =?UTF-8?q?=20watchdog=20=E2=80=94=20self-heal=20a=20painted-then-wedged?= =?UTF-8?q?=20browser?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes audit C-3: the first-present watchdog RETIRES at first paint (firstPresentArrived), so a browser that painted ≥1 frame then wedged (a renderer/GPU stall inside a shared host that keeps the pipe alive → no processGone) had NO detector — silent blank until relaunch. This is the backstop that makes any future post-establishment wedge self-heal. - LivenessProbePolicy: the pure decision (Swift stdlib only) — staleness → discriminating nudge → declare. A static page legitimately idles, so staleness alone isn't a wedge: an opInvalidate is the discriminator (a healthy page repaints, a wedged one doesn't). 8 standalone swiftc assertions (run_liveness_probe_tests.sh). - CefWebSession: + lastPresentNs / livenessNudgedAt (guarded by browsersLock like presentCount). The reader stamps them on every present. - CefProfileHost: a periodic sweep (every 2s) over established, visible, not-first-paint- pending browsers applies the policy — nudge via opInvalidate, then onPaintStalled → the consumer's BOUNDED recover(). Lock order browsersLock→presentLock (never nested), matching the present handler; stops when the host dies. Env-tunable FLUTTER_CEF_LIVENESS_MS (default 10s) + 3s grace. Policy is standalone-unit-tested; the steady-state sweep needs a live post-paint-wedge to fully exercise (hard to force synthetically — noted). 
--- .../macos/Classes/CefProfileHost.swift | 83 +++++++++++++++++++ .../macos/Classes/CefWebSession.swift | 7 ++ .../macos/Classes/LivenessProbePolicy.swift | 31 +++++++ .../test/LivenessProbePolicyTests.swift | 62 ++++++++++++++ .../test/run_liveness_probe_tests.sh | 8 ++ 5 files changed, 191 insertions(+) create mode 100644 packages/flutter_cef_macos/macos/Classes/LivenessProbePolicy.swift create mode 100644 packages/flutter_cef_macos/test/LivenessProbePolicyTests.swift create mode 100755 packages/flutter_cef_macos/test/run_liveness_probe_tests.sh diff --git a/packages/flutter_cef_macos/macos/Classes/CefProfileHost.swift b/packages/flutter_cef_macos/macos/Classes/CefProfileHost.swift index dbe7b0b..dc7cdea 100644 --- a/packages/flutter_cef_macos/macos/Classes/CefProfileHost.swift +++ b/packages/flutter_cef_macos/macos/Classes/CefProfileHost.swift @@ -303,6 +303,7 @@ final class CefProfileHost { running = true readerStarted = true Thread.detachNewThread { [weak self] in self?.acceptAndRead() } + startLivenessSweep() // F-6: steady-state post-establishment liveness watchdog // Agent-control: drain CDP off fd 3/4's parent ends on a dedicated reader, // splitting the NUL-delimited JSON stream into messages. Started only after // a successful spawn (the fds exist). Joined in shutdown() before close. @@ -744,6 +745,85 @@ final class CefProfileHost { } } + // ── F-6: steady-state liveness watchdog ───────────────────────────────────────────── + // The first-paint watchdog above RETIRES at first paint (firstPresentArrived), so a + // browser that painted ≥1 frame then WEDGES (renderer/GPU stall inside a shared host + // that keeps the pipe alive, so no processGone) had NO detector — silent blank until + // relaunch. This periodic sweep covers steady state. 
A static page legitimately idles + // (no presents), so staleness alone isn't a wedge: a discriminating opInvalidate is sent + // first (a healthy page repaints → a present lands → cleared); only if no present follows + // within the grace is paintStalled reported, routing into the consumer's BOUNDED recover. + // Decision logic is in LivenessProbePolicy (standalone-unit-tested). + private let livenessStalenessNs: UInt64 = { + if let s = ProcessInfo.processInfo.environment["FLUTTER_CEF_LIVENESS_MS"], + let ms = Double(s), ms > 0 { return UInt64(ms * 1_000_000) } + return 10_000_000_000 // 10s — generous; a wedge is rare + a healthy idle page only + // costs one forced repaint per window. + }() + private let livenessGraceNs: UInt64 = 3_000_000_000 // 3s after the nudge → declare wedged + private let livenessSweepInterval: TimeInterval = 2.0 + private var livenessSweepStarted = false // guarded by browsersLock + + /// Start the periodic liveness sweep once (idempotent). Called after the reader is up. + private func startLivenessSweep() { + browsersLock.lock() + let already = livenessSweepStarted + livenessSweepStarted = true + browsersLock.unlock() + guard !already else { return } + scheduleLivenessSweep() + } + + private func scheduleLivenessSweep() { + writeLock.lock(); let alive = running && !crashed; writeLock.unlock() + guard alive else { return } // host gone → stop sweeping + DispatchQueue.global().asyncAfter(deadline: .now() + livenessSweepInterval) { [weak self] in + self?.livenessSweep() + } + } + + private func livenessSweep() { + let now = DispatchTime.now().uptimeNanoseconds + // 1) Snapshot ESTABLISHED browsers + their liveness state under browsersLock. 
+ browsersLock.lock() + var cands: [(bid: UInt32, sinceLast: UInt64, nudgedAt: UInt64)] = [] + for (bid, s) in browsers where s.firstPresentSeen { + cands.append((bid, now &- s.lastPresentNs, s.livenessNudgedAt)) + } + browsersLock.unlock() + if !cands.isEmpty { + // 2) Exclude hidden (legitimately frameless) + still-first-paint-pending (the first- + // paint watchdog owns those). presentLock is taken AFTER releasing browsersLock — + // never nested — matching the host's browsersLock→presentLock order, so no deadlock. + presentLock.lock() + let hidden = hiddenBrowsers + let pending = firstPresentPending + presentLock.unlock() + for c in cands where !hidden.contains(c.bid) && !pending.contains(c.bid) { + let nudged = c.nudgedAt != 0 + let action = LivenessProbePolicy.evaluate( + sinceLastPresentNs: c.sinceLast, stalenessThresholdNs: livenessStalenessNs, + nudged: nudged, sinceNudgeNs: nudged ? (now &- c.nudgedAt) : 0, + nudgeGraceNs: livenessGraceNs) + switch action { + case .healthy: + break + case .nudge: + // Discriminate: a healthy idle page repaints (clearing the nudge on the present); + // a wedged one stays blank. + send(c.bid, Self.opInvalidate, []) + browsersLock.lock(); browsers[c.bid]?.livenessNudgedAt = now; browsersLock.unlock() + case .declareStalled: + NSLog("[cef] profile '\(profileId)': browser \(c.bid) painted then wedged — reporting paintStalled (consumer may recreate)") + onPaintStalled?(c.bid) + // Re-discriminate next cycle; the consumer's recover() is bounded (kMaxCefRecreate). + browsersLock.lock(); browsers[c.bid]?.livenessNudgedAt = 0; browsersLock.unlock() + } + } + } + scheduleLivenessSweep() + } + /// Frame `[u32 bodyLen=4+1+payload.count][u32 browserId][op][payload]` and /// write it, or queue it if the pipe isn't up yet. 
A pre-connect opResize whose /// browserId hasn't had its create enqueued is DROPPED — that create carries @@ -1068,6 +1148,9 @@ final class CefProfileHost { s.presentCount += 1 if s.presentCount == 1 { s.firstPresentSeen = true; firstPaint = true } if s.presentCount == estabStableFrames { reachedStableFrames = true } + // F-6: any present clears the liveness-stall state — the browser is alive. + s.lastPresentNs = DispatchTime.now().uptimeNanoseconds + s.livenessNudgedAt = 0 } browsersLock.unlock() if firstPaint { diff --git a/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift b/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift index f864021..ec06e22 100644 --- a/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift +++ b/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift @@ -122,6 +122,13 @@ final class CefWebSession: NSObject, FlutterTexture { // first frame — so the next create's first-frame GPU allocation can't knock a barely- // established browser back out. var presentCount = 0 + // F-6 steady-state liveness watchdog (guarded by CefProfileHost.browsersLock, like + // presentCount). `lastPresentNs` = the most recent present's uptime; `livenessNudgedAt` + // = uptime of an outstanding discriminating opInvalidate (0 = none). The host's periodic + // sweep reads these to catch a browser that painted ≥1 frame then WEDGED (the first-paint + // watchdog retires at first paint, so post-establishment wedges had no detector). + var lastPresentNs: UInt64 = 0 + var livenessNudgedAt: UInt64 = 0 private weak var registry: FlutterTextureRegistry? 
private var width: Int diff --git a/packages/flutter_cef_macos/macos/Classes/LivenessProbePolicy.swift b/packages/flutter_cef_macos/macos/Classes/LivenessProbePolicy.swift new file mode 100644 index 0000000..7242afd --- /dev/null +++ b/packages/flutter_cef_macos/macos/Classes/LivenessProbePolicy.swift @@ -0,0 +1,31 @@ +// Pure decision policy for the STEADY-STATE liveness watchdog (F-6 / audit C-3): the +// backstop that catches a browser which painted at least once and then WEDGED (blank / +// frozen) with no other detector — the first-present watchdog retires at first paint, so +// post-establishment wedges were previously silent until relaunch. +// +// A static page legitimately produces NO presents when idle, so staleness alone is not a +// wedge. `nudge` (an opInvalidate) is the discriminator: a healthy page repaints (a present +// arrives, the caller clears the nudge); a wedged page doesn't, and after the grace we +// `declareStalled` → onPaintStalled → the consumer's existing BOUNDED recover(). +// +// Dependency-light (Swift stdlib only) → unit-testable standalone with `swiftc`: +// ./test/run_liveness_probe_tests.sh +import Foundation + +enum LivenessProbePolicy { + enum Action: Equatable { case healthy, nudge, declareStalled } + + /// Decide what the sweep should do for ONE established, visible, not-first-paint-pending + /// browser. The caller resets `nudged=false` (and refreshes `sinceLastPresentNs≈0`) the + /// instant ANY present arrives, so reaching the post-nudge branch means no present since. + /// - sinceLastPresentNs: now − the browser's last present. + /// - nudged / sinceNudgeNs: whether an opInvalidate is outstanding, and how long ago. 
+ static func evaluate(sinceLastPresentNs: UInt64, stalenessThresholdNs: UInt64, + nudged: Bool, sinceNudgeNs: UInt64, nudgeGraceNs: UInt64) -> Action { + if sinceLastPresentNs < stalenessThresholdNs { return .healthy } // painted recently + if !nudged { return .nudge } // stale → discriminate + // Stale AND already nudged with no present since: wedged once the grace elapses; + // otherwise keep waiting for the nudge to land a frame. + return sinceNudgeNs >= nudgeGraceNs ? .declareStalled : .healthy + } +} diff --git a/packages/flutter_cef_macos/test/LivenessProbePolicyTests.swift b/packages/flutter_cef_macos/test/LivenessProbePolicyTests.swift new file mode 100644 index 0000000..25e25dc --- /dev/null +++ b/packages/flutter_cef_macos/test/LivenessProbePolicyTests.swift @@ -0,0 +1,62 @@ +// Standalone unit tests for LivenessProbePolicy — the F-6 steady-state liveness +// watchdog decision (catch a painted-then-wedged browser; discriminate a healthy idle +// static page via a nudge before declaring a stall). Swift stdlib only, so it compiles + +// runs with `swiftc` alone (no Xcode/pod harness/Campus): +// ./test/run_liveness_probe_tests.sh +import Foundation + +@main +enum LivenessProbePolicyTests { + static var failures = 0 + static func check(_ name: String, _ cond: Bool) { + print((cond ? " PASS " : " FAIL ") + name) + if !cond { failures += 1 } + } + + static let staleness: UInt64 = 10_000_000_000 // 10s + static let grace: UInt64 = 3_000_000_000 // 3s + + static func act(sinceLastPresentNs: UInt64, nudged: Bool = false, + sinceNudgeNs: UInt64 = 0) -> LivenessProbePolicy.Action { + LivenessProbePolicy.evaluate( + sinceLastPresentNs: sinceLastPresentNs, stalenessThresholdNs: staleness, + nudged: nudged, sinceNudgeNs: sinceNudgeNs, nudgeGraceNs: grace) + } + + static func main() { + // Recently painted (incl. a live 60fps tile) → leave it alone. 
+ check("painted just now → healthy", act(sinceLastPresentNs: 0) == .healthy) + check("painted 5s ago (< staleness) → healthy", + act(sinceLastPresentNs: 5_000_000_000) == .healthy) + + // Stale + not yet nudged → discriminate (a healthy idle static page repaints; a wedged + // one does not). NOT a stall yet — this is the key "don't false-fire on idle" guard. + check("stale, not nudged → NUDGE (discriminate, not stall)", + act(sinceLastPresentNs: 12_000_000_000) == .nudge) + + // Nudged, present came back (caller resets sinceLastPresent≈0 + nudged=false) → healthy. + check("nudge landed a frame → healthy", + act(sinceLastPresentNs: 0, nudged: false) == .healthy) + + // Nudged, still stale, grace not elapsed → keep waiting (don't declare yet). + check("nudged, within grace → wait (healthy)", + act(sinceLastPresentNs: 12_000_000_000, nudged: true, sinceNudgeNs: 1_000_000_000) + == .healthy) + + // Nudged, still stale, grace elapsed with no present → WEDGED. + check("nudged, grace elapsed, no present → declareStalled", + act(sinceLastPresentNs: 14_000_000_000, nudged: true, sinceNudgeNs: 4_000_000_000) + == .declareStalled) + + // Boundary: the healthy test is strict (<), so exactly AT the staleness threshold is already stale. + check("exactly at staleness → nudge", + act(sinceLastPresentNs: staleness) == .nudge) // >= threshold → nudge + check("one ns under staleness → healthy", + act(sinceLastPresentNs: staleness - 1) == .healthy) + + print(failures == 0 + ? "\nALL LivenessProbePolicy TESTS PASSED" + : "\n\(failures) LivenessProbePolicy TEST(S) FAILED") + exit(failures == 0 ? 0 : 1) + } +} diff --git a/packages/flutter_cef_macos/test/run_liveness_probe_tests.sh b/packages/flutter_cef_macos/test/run_liveness_probe_tests.sh new file mode 100755 index 0000000..8fc8e48 --- /dev/null +++ b/packages/flutter_cef_macos/test/run_liveness_probe_tests.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# Compile + run the standalone LivenessProbePolicy unit tests (F-6 steady-state liveness +# decision).
Swift stdlib only — no Xcode/pod harness needed. +set -euo pipefail +DIR="$(cd "$(dirname "$0")/.." && pwd)" +OUT="$(mktemp -d)/liveness" +swiftc "$DIR/macos/Classes/LivenessProbePolicy.swift" "$DIR/test/LivenessProbePolicyTests.swift" -o "$OUT" +"$OUT" From 6e3e3b5bd245fbe503002787d158bef2bba22faf Mon Sep 17 00:00:00 2001 From: wenkaifan0720 Date: Fri, 26 Jun 2026 19:28:04 -0700 Subject: [PATCH 04/13] =?UTF-8?q?fix(osr):=20size-tagged=20present=20+=20s?= =?UTF-8?q?ize-gated=20promotion=20=E2=80=94=20kill=20zoom=20scale-mismatc?= =?UTF-8?q?h=20(too=20big/small/freeze)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause (full analysis in docs/OSR_SCALE_MISMATCH.md): the present protocol carried only a surface id, not the painted dims. On a device-scale (zoom) resize the host swaps to the new-size surface SYNCHRONOUSLY while the renderer re-rasters ASYNC, so the FIRST present after a resize is the renderer's OLD-scale frame landing in the NEW surface. With only a sid the consumer couldn't tell that provisional wrong-scale frame from a correct one and promoted it → content rendered too big (zoom out: big src cropped) / too small (zoom in: small src top-left, stale margins), and froze there on a static page. F-1..F-6 fixed the cull wedge but never touched the blit/promotion seam. - native SendPresentLocked(srcW,srcH): kOpPresent payload 4→12 bytes — sid + the PHYSICAL dims of the frame actually composited (view_src IOSurface dims / OnPaint width,height). All three present sites (OnPaint, CompositeMetalLocked, CompositeSoftwareLocked) plumb them. - Swift handleFrame(opPresent): SIZE-GATED promotion — promote the pending (resized) surface ONLY when the present's dims match the new surface (round(logical*dpr), ±1). A pre-re-raster wrong-scale present advances nothing; Flutter keeps sampling the last correct-scale buffer (geometrically right, momentarily softer) until the re-rastered frame lands. 
During active zoom the tile lags-crisp; on settle it sharpens. Never wrong-scale, never frozen-wrong. - Swift resizeWatchdog: no longer force-promotes — that could only promote a surface the size gate just refused (wrong-scale/blank). It now only re-kicks a dropped frame; the size-gated promotion + the 16ms pump land the correct frame, and F-6 recovers a genuine wedge. The F-4 hidden-promote guard is subsumed (no force-promote at all). cef_host compiles. Recommendation (analysis): KEEP per-zoom device-scale resize hardened — not fixed-max-density (~9x VRAM) or page-zoom (reflows). The model is sound; size-blind promotion was the bug. --- docs/OSR_SCALE_MISMATCH.md | 79 +++++++++++++++++++ .../macos/Classes/CefWebSession.swift | 63 +++++++-------- .../flutter_cef_macos/native/cef_host/main.mm | 44 ++++++++--- 3 files changed, 141 insertions(+), 45 deletions(-) create mode 100644 docs/OSR_SCALE_MISMATCH.md diff --git a/docs/OSR_SCALE_MISMATCH.md b/docs/OSR_SCALE_MISMATCH.md new file mode 100644 index 0000000..8a36f0a --- /dev/null +++ b/docs/OSR_SCALE_MISMATCH.md @@ -0,0 +1,79 @@ +# CEF zoom scale-mismatch + frozen-but-interactable: root cause, fix, recommendation + +All three traces converge on the same mechanism and I verified every load-bearing anchor in the actual source. The bug is **one defect with two faces**: a synchronous host-surface swap raced against an asynchronous renderer re-raster, mediated by a present protocol that carries only a surface id and therefore cannot tell a *provisional old-scale* frame from a *correct new-scale* frame. + +--- + +## 1. 
Root cause (ranked) + +### R1 — "too big / too small within bounds": scale-blind blit + size-blind promotion of a pre-re-raster frame (PRIMARY) + +The blit is the only place src and dst scale couple, and it does **min()-from-top-left with no scale check and no margin clear**: + +- `CompositeMetalLocked` (`flutter_cef_fix/.../cef_host/main.mm:731`): `cw=min(sw,dw), ch=min(sh,dh)`, copy origin(0,0)→origin(0,0), no scaling (`main.mm:734-742`). Software path `CompositeSoftwareLocked` and `BlitBGRA` (`main.mm:508-523`, `OnPaint` at `585`) are identical; `BlitBGRA`'s own comment already admits "CEF may deliver a frame at the pre-resize size while a resize is in flight." +- On a settled-zoom dpr change, `DoResize` (`main.mm:1410-1462`) swaps `slot->surface`/`width`/`height`/`dpr` **synchronously** under `surface_mutex` (`1430-1437`), drops the `dst_mtl` cache (`1438-1440`), then for a visible slot calls `NotifyScreenInfoChanged()` + `WasResized()` + one `SendExternalBeginFrame()` (`1445-1450`). Those only *post* a relayout; Blink re-rasters at the new `device_scale` **async**. `GetScreenInfo` already reports the new `slot->dpr` (`main.mm:547`) and `GetViewRect` the unchanged logical w/h (`main.mm:540`). +- So the immediate begin-frame (and the 16ms `PumpBeginFrame` ticks) pull a frame the renderer still rastered at the **old** device-scale → `src = logical*dpr_old`, `dst = logical*dpr_new`: + - **dpr ↑ (zoom in):** small src lands top-left of the bigger zero-filled dst; Flutter samples the whole dst into the logical box → **content too small + stale/black margins** (nothing clears the uncovered region). + - **dpr ↓ (zoom out):** bigger src clipped to dst's top-left `dw×dh`, stretched to fill the box → **content too big / cropped**. 
+- The wrong frame is **structurally the one promoted**: `SendPresentLocked` (`main.mm:558-565`) tags the present with only the new 4-byte surface id; Swift `handleFrame(opPresent)` (`CefWebSession.swift:620-631`) promotes `pendingBuffer→pixelBuffer` and clears `resizeInFlight` on the **first** present whose tag matches `pendingSurfaceId`, with **no size check**. The first post-resize present is the pre-re-raster old-scale frame. + +This is ordering, not a dropped notify — `NotifyScreenInfoChanged`/`WasResized` do fire (`main.mm:1445-1446`); they just don't raster synchronously. The F-1..F-6 cull fixes removed the hidden-path wedge but never touched the blit/promotion seam, which is why the symptom went transient-but-persistent rather than away. + +### R2 — "blank / freeze but interactable": wrong frame promoted with no guaranteed correct follow-up; watchdog force-promotes stale/zero pixels + +Input keeps working because event routing is independent of paint; only the texture is wedged. It persists when no correct frame lands after the promotion: + +- **Static page** (flutter.dev, idle agent_ui): one `_quantizedZoom` flip → exactly one `resize` → ~one paint. If that paint is old-scale and the compositor returns `DidNotProduceFrame` to subsequent pump ticks, the mis-scaled frame is the **last one ever produced** — frozen at wrong scale. +- **`resizeWatchdog` force-promote** (`CefWebSession.swift:303-347`, promote `319-330`): after 300ms it promotes whatever sits in `slot->surface` with **no scale check**. If the renderer hasn't re-rastered, that's old-scale content, or a **zero-filled** surface → blank-but-interactable; then `resizeInFlight=false`, `maybeSendNextResize` sends nothing. +- **Chained zoom**: `maybeSendNextResize` (`349-358`) swaps to step N+1's surface as soon as step N's lagging present promotes, so step N's correct frame arrives mis-tagged and is dropped → every promoted frame is one device-scale step behind until zoom stops. 
+- **Establishment race**: `DoResize` skips the screen-info re-assert when `slot->browser==null` (`main.mm:1442` guard → `needs_screen_info_on_show`), and `OnAfterCreated` re-asserts only close/visible, not geometry — a dpr resize during async create leaves the renderer at create-time dpr vs a new-dpr surface with nothing forcing a re-raster. +- Dart can't self-heal: `_ensureSession` writes `_lastDpr` **before** issuing the resize (`work_canvas_agentui_test/.../cef_web_view.dart:255-262`), so a coalesced/superseded resize is never re-issued; recovery depends entirely on native watchdog / F-6. + +### R3 — F-6 is blind to "frozen-but-presenting" (lets R2 persist; YES, confirmed) + +Every blit ends in an unconditional `SendPresentLocked` (`main.mm:758`/`599`/software), **including mis-scaled and blank-painted frames**. The present reader bumps `lastPresentNs` and clears `livenessNudgedAt` on *every* present (`CefProfileHost.swift:1147-1153`). The F-6 sweep (`785-825`) only escalates on **total** staleness (10s). So: +- A wrong-frame tile that then **idles** eventually trips F-6 → `opInvalidate` → re-raster at the now-correct scale (can recover). +- A wrong-frame tile that **keeps presenting** (animating agent_ui rAF, or the watchdog/nudge re-presenting) refreshes `lastPresentNs` forever → F-6 **never fires** → permanent wrong scale. F-6 detects "no pixels," never "wrong pixels." + +### Key unused oracle +`OnAcceleratedPaint` (`main.mm:779-812`) reads only `info.shared_texture_io_surface` and **ignores `info.extra`** (`cef_accelerated_paint_info_common_t`: `coded_size`/`visible_rect`/`content_rect`/`source_size`). The pooled `view_src` IOSurface can even be larger than the actually-painted content, so `IOSurfaceGetWidth(view_src)` is not the true rastered size — `visible_rect`/`content_rect` is. `info.extra` is exactly the "did the renderer raster at the new device-scale yet" signal the guard needs. + +--- + +## 2. 
The complete robust fix + +Four coordinated edits; **(1)+(2) are the core** and eliminate both too-big/too-small and the frozen-wrong-scale promotion. (3) is the freeze/animated backstop, (4) the safety/margin. + +### Fix 1 — Size-gated promotion (kills wrong-scale promotion at the source) — **native + Swift** +- **native** `SendPresentLocked` (`main.mm:558-565`): extend the `kOpPresent` payload from 4→12 bytes: `sid` + the **actually-composited src physical `w,h`** (from `info.extra.visible_rect`, falling back to `content_rect`/`coded_size`, then `IOSurfaceGetWidth/Height(view_src)`). Plumb those dims from `OnAcceleratedPaint`/`OnPaint` into the composite fns. +- **Swift** `handleFrame(opPresent)` (`CefWebSession.swift:620-631`): promote `pendingBuffer` **only** when `psid==pendingSurfaceId` **AND** `srcW≈round(width*dpr) && srcH≈round(height*dpr)` (±1px). A mis-scaled present advances nothing and leaves `resizeInFlight=true`. This turns "promote on first present of the new sid" into "promote on first **correctly-sized** present," so Flutter keeps sampling the old, geometrically-correct (slightly soft) buffer until the real frame lands — coherent with the existing resize-flash design (`CefWebSession.swift:265-273` already serves the old surface until the pending one paints). + +### Fix 2 — Guarantee a correct final frame (kills static-page freeze + establishment race) — **native** +- `DoResize` on `dpr_changed` (`main.mm:1434-1461`): set a per-slot `awaiting_scale = {round(w*dpr), round(h*dpr)}`. `OnAcceleratedPaint` clears it and presents **only** when the `info.extra` dims equal the awaited dst; while it's set, keep driving `Invalidate(PET_VIEW)` + `SendExternalBeginFrame` (a short bounded pump burst, mirroring the F-1 un-hide forced repaint) so a static page that emits one frame is deterministically re-driven to the settled scale. Don't rely on the single synchronous `SendExternalBeginFrame` at `1450` racing the async relayout. 
+- `OnAfterCreated` (~`main.mm:1038-1060`): if slot dims/dpr differ from create-time (a resize arrived during async create), call `NotifyScreenInfoChanged()` + `WasResized()` + `SendExternalBeginFrame()` there to fix the establishment race (today only `needs_screen_info_on_show` covers the hidden case). + +### Fix 3 — Blit guard + margin clear (no blank/garbage under the watchdog) — **native + Swift** +- **native** `CompositeMetalLocked`/`CompositeSoftwareLocked`/`OnPaint PET_VIEW` (`main.mm:646`, `693`, `584`): when `src != dst` dims, still blit the `min` rect (so the surface isn't blank for the watchdog backstop) **but clear the uncovered dst region** outside `[0,cw)×[0,ch)` (Metal clear / memset), and **do not** call `SendPresentLocked` for that provisional frame (pairs with Fix 1's gate). `dst_mtl` cache needs no change — `DoResize` already invalidates it unconditionally (`main.mm:1438-1440`), so it is **not** a contributor. +- **Swift** `resizeWatchdog` force-promote (`CefWebSession.swift:319-330`): gate force-promotion on "the pending surface has received an exact-dims frame"; otherwise keep waiting/nudging. Never force-promote stale-scale or zero-filled content. + +### Fix 4 — Liveness detects frozen-but-presenting — **Swift** +With Fix 1, mis-scaled presents no longer reach the heartbeat, so `lastPresentNs` advances only on correct frames and F-6's existing staleness path (`CefProfileHost.swift:785-825`) fires naturally → `opInvalidate` → forced correct re-raster. Belt-and-suspenders: track a separate `lastCorrectScalePresentNs` (set only on a size-matched present in the reader at `1147-1153`) and feed **that** to `LivenessProbePolicy.evaluate`; treat a nudge whose repaint returns still-mismatched as *escalate*, not *reset*. 
+ +### Campus side +**No change required.** `cefRenderScale = round(dpr*clamp(zoom,1,3)*4)/4` (`platform_view_live_mode.dart:162-165`) and the `renderScale` pass-through (`cef_webview_tile.dart:821`, `agent_ui_tile.dart:2038`) are correct — they produce the right *value*; the realloc that value drives is the race trigger, not a value bug. One optional hardening: in `cef_web_view.dart:255-262`, write `_lastDpr`/`_lastSize` **after** the resize is acknowledged (or re-issue on a superseded resize) so Dart can re-drive a coalesced resize instead of relying solely on the native watchdog. + +--- + +## 3. Recommendation: KEEP per-zoom device-scale resize (hardened) — do not switch + +This is the second fragility round on the same mechanism, so the design question is fair. Verdict: **ship the hardened resize now (Fixes 1-4); treat fixed-density as a scoped, optional follow-up; reject page-zoom outright.** + +| Option | What it does | Verdict | +|---|---|---| +| **(a) Per-zoom resize + size-gated promotion + blit guard + forced final frame** | Keeps `device_scale = dpr*clamp(zoom,1,3)`; makes promotion size-correct and guarantees a settled frame | **SHIP THIS.** Smallest surgical change; preserves crispness and the **bounded** (clamp 3×) IOSurface that the cull/memory budget depends on. Robust *because* Fix 1 makes promotion size-correct rather than racing it. | +| **(b) Fixed max-density surface** — allocate once at `logical*dpr*3`, never realloc on zoom; Flutter texture supersamples down | Deletes the resize race, the `resizeWatchdog`, and the hidden-promote special-casing for the zoom path entirely | **Strategic follow-up, scoped only to the engaged/foreground tile.** Cost is constant ~9× VRAM+raster per tile even at zoom 1 (≈54MB for a 720×520 tile), which directly fights the establishment/memory budget the cull fixes protect. 
A *global* fixed ceiling is too costly for many-tile boards; combined with the existing `WasHidden` off-screen gate it's acceptable for a handful of visible tiles. The durable target *if* (a) proves insufficient under heavy pages. | +| **(c) CEF `SetZoomLevel` / CSS page-zoom** (`DoSetZoom`, `main.mm:1500-1502`) | Changes content zoom → page **reflows** (layout/text/breakpoints) | **REJECT for crispness.** Canvas zoom must magnify rendered pixels while preserving layout; `device_scale_factor` is the correct knob. Keep `SetZoomLevel` only for user-facing Ctrl+/- content zoom. | + +**Why not just lean on liveness:** F-6 is a *freeze* backstop by construction (it watches present cadence, not pixels) and structurally cannot see "presenting but visually wrong." The fix must live at the **blit/promotion seam** (Fixes 1-3); liveness (Fix 4) is only the secondary net once correct presents are the only thing that counts as a heartbeat. + +**Net:** the renderScale-via-resize model is *sound for crispness* but *racy by construction* — every device-scale change re-runs sync-dst-swap vs async-src-raster mediated by a sid-only present. Fix the race with a size-tagged present + guaranteed final frame rather than abandoning the bounded-memory resize. Plan (b) for the single engaged tile as the race-eliminating simplification if heavy pages still slip through; never adopt (c). 
diff --git a/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift b/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift index ec06e22..721b1b3 100644 --- a/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift +++ b/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift @@ -303,43 +303,20 @@ final class CefWebSession: NSObject, FlutterTexture { private func resizeWatchdog(_ gen: UInt64) { bufferLock.lock() let isHidden = hidden - // F-4 (gating extracted to ResizeWatchdogPolicy for standalone unit tests): never - // force-promote while hidden — the pending surface is zero-filled (the gated pump never - // painted it), so promoting it wedges the texture permanently blank. Wait instead; the - // native hidden->visible repaint (F-1) drives a real present that promotes the pending - // buffer through the normal present path. let active = ResizeWatchdogPolicy.shouldKeepWaiting( inFlight: resizeInFlight, gen: gen, currentGen: resizeGen) - let givenUp = ResizeWatchdogPolicy.shouldForcePromote( - inFlight: resizeInFlight, gen: gen, currentGen: resizeGen, - hidden: isHidden, elapsedNs: nowNs() &- resizeSentAtNs, thresholdNs: 300_000_000) - var promotedTid: Int64 = 0 - var promotedSid: UInt32 = 0 - var promotedW = 0, promotedH = 0 - if givenUp { - if let pending = pendingBuffer { - pixelBuffer = pending - promotedSid = pendingSurfaceId // capture before clearing - promotedW = width - promotedH = height - pendingBuffer = nil - pendingSurfaceId = 0 - promotedTid = textureId - } - resizeInFlight = false - } bufferLock.unlock() - if givenUp { - if promotedTid != 0 { registry?.textureFrameAvailable(promotedTid) } - // R2: force-promoted a resized surface — notify WebRTC consumers (same as opPresent). 
- if promotedSid != 0 { notifySurface(promotedSid, promotedW, promotedH) } - maybeSendNextResize() - return - } + // The SIZE-GATED promotion in handleFrame is now the ONLY promoter: it refuses a present + // whose composited dims don't match the new surface, so the watchdog must NOT force-promote + // — that would show the renderer's pre-re-raster WRONG-SCALE frame (too big/small) or a + // blank surface (the old behavior + F-4 hidden-guard are superseded by the size gate). If a + // correct frame already landed, handleFrame cleared resizeInFlight and `active` is false → + // stop. Otherwise re-kick a possibly-dropped frame (the 16ms begin-frame pump also drives + // the re-raster); F-6 liveness recovers a genuinely wedged tile. The texture meanwhile keeps + // the last correct-scale buffer (geometrically right, momentarily softer) — never wrong. guard active else { return } - // While hidden, opInvalidate can't paint (the pump is gated) — skip the nudge but keep - // the watchdog alive so it resumes promoting once visible; the un-hide repaint promotes - // via the present path first, after which resizeInFlight clears and this self-terminates. + // While hidden the pump is gated off, so opInvalidate can't paint — skip the nudge but keep + // the watchdog alive; the native un-hide repaint (F-1) drives a real present that promotes. if !isHidden { sendFrame(Self.opInvalidate, []) } DispatchQueue.main.asyncAfter(deadline: .now() + 0.08) { [weak self] in self?.resizeWatchdog(gen) @@ -617,14 +594,28 @@ final class CefWebSession: NSObject, FlutterTexture { // blank new one. A present for the old/current surface just advances the frame. 
var promotedSid: UInt32 = 0 var promotedW = 0, promotedH = 0 - if payload.count >= 4 { + if payload.count >= 12 { let psid = (UInt32(payload[0]) << 24) | (UInt32(payload[1]) << 16) | (UInt32(payload[2]) << 8) | UInt32(payload[3]) - if let pending = pendingBuffer, psid != 0, psid == pendingSurfaceId { + let srcW = Int((UInt32(payload[4]) << 24) | (UInt32(payload[5]) << 16) + | (UInt32(payload[6]) << 8) | UInt32(payload[7])) + let srcH = Int((UInt32(payload[8]) << 24) | (UInt32(payload[9]) << 16) + | (UInt32(payload[10]) << 8) | UInt32(payload[11])) + // SIZE-GATED PROMOTION: only promote the pending (resized) surface when the present's + // COMPOSITED frame dims match the new surface (round(logical*dpr)). On a device-scale + // (zoom) resize the host swaps to the new surface synchronously while the renderer + // re-rasters async, so the FIRST present after the resize carries the renderer's + // OLD-scale frame in the new surface — promoting it renders too big/small (and can + // freeze there). Gating on dims keeps Flutter sampling the last correct-scale buffer + // (geometrically right, momentarily softer) until the re-rastered frame lands. 
+ let expW = Int((Double(width) * dpr).rounded()) + let expH = Int((Double(height) * dpr).rounded()) + let scaleOk = abs(srcW - expW) <= 1 && abs(srcH - expH) <= 1 + if let pending = pendingBuffer, psid != 0, psid == pendingSurfaceId, scaleOk { pixelBuffer = pending pendingBuffer = nil pendingSurfaceId = 0 - resizeInFlight = false // its paint landed; free to send the next size + resizeInFlight = false // its CORRECT-SCALE paint landed; free to send the next size promotedSid = psid promotedW = width promotedH = height diff --git a/packages/flutter_cef_macos/native/cef_host/main.mm b/packages/flutter_cef_macos/native/cef_host/main.mm index 7fcc429..515223b 100644 --- a/packages/flutter_cef_macos/native/cef_host/main.mm +++ b/packages/flutter_cef_macos/native/cef_host/main.mm @@ -555,13 +555,26 @@ bool GetScreenInfo(CefRefPtr, CefScreenInfo& info) override { // Flutter texture only once a paint into THAT surface has actually landed — until // then it keeps serving the old surface, so a resize never flashes the fresh, // zero-filled IOSurface. Caller holds slot_->surface_mutex. - void SendPresentLocked() { + // The present carries the SID of the surface presented AND the PHYSICAL pixel dims of the + // frame that was actually composited into it (srcW/srcH). On a device-scale (zoom) resize + // the host swaps to the new-size surface synchronously while the renderer re-rasters + // async, so the first frame after a resize is the renderer's OLD-scale frame landing in + // the NEW surface. With only a sid the consumer can't tell that provisional wrong-scale + // frame from a correct one and promotes it → content renders too big/small. Carrying the + // composited dims lets the consumer promote ONLY a frame whose dims match the new surface + // (round(logical*dpr)), so it keeps serving the last correct-scale buffer until the real + // re-rastered frame lands. UI thread; caller holds slot_->surface_mutex. 
+ void SendPresentLocked(int srcW, int srcH) { uint32_t sid = slot_->surface ? IOSurfaceGetID(slot_->surface) : 0; - uint8_t p[4] = {static_cast<uint8_t>((sid >> 24) & 0xff), - static_cast<uint8_t>((sid >> 16) & 0xff), - static_cast<uint8_t>((sid >> 8) & 0xff), - static_cast<uint8_t>(sid & 0xff)}; - SendFrame(slot_->browser_id, kOpPresent, p, 4); + auto be32 = [](uint8_t* o, uint32_t v) { + o[0] = (v >> 24) & 0xff; o[1] = (v >> 16) & 0xff; + o[2] = (v >> 8) & 0xff; o[3] = v & 0xff; + }; + uint8_t p[12]; + be32(p, sid); + be32(p + 4, static_cast<uint32_t>(srcW < 0 ? 0 : srcW)); + be32(p + 8, static_cast<uint32_t>(srcH < 0 ? 0 : srcH)); + SendFrame(slot_->browser_id, kOpPresent, p, 12); } void OnPaint(CefRefPtr<CefBrowser>, PaintElementType type, const RectList&, @@ -596,7 +609,10 @@ void OnPaint(CefRefPtr<CefBrowser>, PaintElementType type, const RectList&, popup_py); } IOSurfaceUnlock(slot_->surface, 0, nullptr); - SendPresentLocked(); + // PET_VIEW reports the painted view-frame dims (the size-gate signal); a PET_POPUP + // repaint didn't rescale the view, so report the surface dims (always correct-scale). + SendPresentLocked(type == PET_VIEW ? width : surf_w, + type == PET_VIEW ? height : surf_h); } void OnPopupShow(CefRefPtr<CefBrowser> browser, bool show) override { @@ -650,9 +666,15 @@ void CompositeSoftwareLocked(IOSurfaceRef view_src) { const size_t ds = IOSurfaceGetBytesPerRow(slot_->surface); const int dw = static_cast<int>(IOSurfaceGetWidth(slot_->surface)); const int dh = static_cast<int>(IOSurfaceGetHeight(slot_->surface)); + // The composited frame's physical dims (for the size-gated present). A popup-only + // repaint (view_src == null) re-presents the existing view surface as-is → report the + // surface dims so it counts as correct-scale.
+ int srcW = dw, srcH = dh; if (view_src && IOSurfaceLock(view_src, kIOSurfaceLockReadOnly, nullptr) == kIOReturnSuccess) { + srcW = static_cast<int>(IOSurfaceGetWidth(view_src)); + srcH = static_cast<int>(IOSurfaceGetHeight(view_src)); const auto* s = static_cast(IOSurfaceGetBaseAddress(view_src)); const size_t ss = IOSurfaceGetBytesPerRow(view_src); const int rows = std::min(dh, IOSurfaceGetHeight(view_src)); @@ -671,7 +693,7 @@ void CompositeSoftwareLocked(IOSurfaceRef view_src) { slot_->popup_h, px, py); } IOSurfaceUnlock(slot_->surface, 0, nullptr); - SendPresentLocked(); + SendPresentLocked(srcW, srcH); } // GPU-blit composite: copy CEF's accelerated view surface into the host-owned slot_->surface @@ -693,6 +715,10 @@ void CompositeSoftwareLocked(IOSurfaceRef view_src) { void CompositeMetalLocked(IOSurfaceRef view_src) { if (!slot_->surface) return; bool blitted = false; + // Composited frame's physical dims (for the size-gated present) — captured at function + // scope since `sw/sh` below are inside the @autoreleasepool. + const int srcW = view_src ? static_cast<int>(IOSurfaceGetWidth(view_src)) : 0; + const int srcH = view_src ?
static_cast<int>(IOSurfaceGetHeight(view_src)) : 0; if (view_src && EnsureMetal()) { @autoreleasepool { const int sw = static_cast<int>(IOSurfaceGetWidth(view_src)); @@ -755,7 +781,7 @@ void CompositeMetalLocked(IOSurfaceRef view_src) { logged = true; SendLog(slot_->browser_id, "present: GPU Metal blit path active"); } - SendPresentLocked(); + SendPresentLocked(srcW, srcH); } else { static bool loggedFb = false; if (!loggedFb) { From 9b75a3c7160ff4e5b0f5a3b12fab9d550babf7b8 Mon Sep 17 00:00:00 2001 From: wenkaifan0720 Date: Fri, 26 Jun 2026 20:08:32 -0700 Subject: [PATCH 05/13] =?UTF-8?q?test(osr):=20zoom-resize=20soak=20probe?= =?UTF-8?q?=20+=20present=20size=20diag=20=E2=80=94=20proves=20the=20size-?= =?UTF-8?q?gate=20under=20soak?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Soak-proven on the flutter_cef side (real CEF, single-view) that the size-gated promotion works and does NOT degrade: cycling renderScale (dpr) hammers the resize path; the [cefdiag-resize] log shows, per present while a resize is pending, the actual composited src dims vs the expected new-surface dims (round(logical*dpr)) and whether they match. Verbatim soak evidence (dpr sweep): each resize logs `src=OLD exp=NEW match=false` (the renderer's lagging frame — correctly NOT promoted) then `src=NEW exp=NEW match=true` (the re-rastered frame — promoted). 38 match=true / 17 match=false over the run, still matching 17s in → no stick/degradation. Confirms src (view_src dims) is the true frame size (never pool-sized), so the size-gate is sound. - example/lib/zoom_soak_probe.dart: the soak harness (auto-cycles renderScale, ticking clock + fixed-proportion box make freeze/wrong-scale obvious). - CefWebSession: [cefdiag-resize] present-size diagnostic (behind FLUTTER_CEF_DEBUG). - CefProfileHost: explicit UInt64(0) in the F-6 ternary (the example's fresh pod surfaced a type-inference error the Campus build had masked).
NOTE: the prior Campus "scale-fix" builds were broken by a build-infra bug, not the fix — `make cef-host`'s up-to-date gate keys on the flutter_cef path stamp, not main.mm's mtime, so local native edits were silently not recompiled (old 4-byte present vs new 12-byte parser → promotion never fired). Force a clean cef_host rebuild (rm the .flutter_cef_ref stamp) after any native edit. --- example/lib/zoom_soak_probe.dart | 131 ++++++++++++++++++ .../macos/Classes/CefProfileHost.swift | 2 +- .../macos/Classes/CefWebSession.swift | 9 ++ 3 files changed, 141 insertions(+), 1 deletion(-) create mode 100644 example/lib/zoom_soak_probe.dart diff --git a/example/lib/zoom_soak_probe.dart b/example/lib/zoom_soak_probe.dart new file mode 100644 index 0000000..4e51459 --- /dev/null +++ b/example/lib/zoom_soak_probe.dart @@ -0,0 +1,131 @@ +// Zoom-resize SOAK probe — hammers the device-scale (renderScale) resize path the way a +// user repeatedly zooming the canvas does, to expose the "slowly degrades, freezes at wrong +// size, never recovers" failure. It auto-cycles renderScale across the 1×–3× band every +// 500ms (each step is a dpr resize → new IOSurface + re-raster). The page is a full-bleed +// gradient with a ticking clock + a centered fixed-proportion box: a FROZEN tile stops the +// clock; a WRONG-SCALE tile shows the box too big/small or offset. +// +// Run FLUTTER_CEF_DEBUG=1 and watch the [cefdiag-resize] lines: they print, per present +// while a resize is pending, the actual composited src dims vs the expected new-surface dims +// and whether they match — the oracle for whether the size-gate can ever promote (if src is +// pool-sized, match is never true and the resize STICKS). +// +// FLUTTER_CEF_HOST= FLUTTER_CEF_ALLOW_INSECURE_PROFILE=1 FLUTTER_CEF_DEBUG=1 \ +// flutter run -d macos -t lib/zoom_soak_probe.dart +import 'dart:async'; +import 'package:flutter/material.dart'; +import 'package:flutter_cef/flutter_cef.dart'; + +const _html = ''' + + +
+
DENSITY SOAK
+
+
clock = fresh · box = correct scale
+
+ +'''; + +// The renderScale (device-scale) sweep — each value is a dpr resize. +const _scales = [2.0, 2.5, 3.0, 4.0, 5.0, 6.0, 5.0, 4.0, 3.0, 2.5]; + +void main() => runApp(const SoakApp()); + +class SoakApp extends StatefulWidget { + const SoakApp({super.key}); + @override + State createState() => _SoakAppState(); +} + +class _SoakAppState extends State { + final _controller = CefWebController(); + int _i = 0; + int _cycles = 0; + Timer? _timer; + bool _running = true; + + @override + void initState() { + super.initState(); + _controller.onPageStarted = (_) => _controller.loadHtmlString(_html); + // Start soaking a few seconds after first paint. + Future.delayed(const Duration(seconds: 4), _startSoak); + } + + void _startSoak() { // no-op if disposed during the 4s start delay, else the timer is never cancelled + if (!mounted) return; _timer = Timer.periodic(const Duration(milliseconds: 500), (_) { + if (!mounted || !_running) return; + setState(() { + _i = (_i + 1) % _scales.length; + if (_i == 0) _cycles++; + }); + }); + } + + @override + void dispose() { + _timer?.cancel(); + _controller.dispose(); + super.dispose(); + } + + @override + Widget build(BuildContext context) { + final scale = _scales[_i]; + return MaterialApp( + debugShowCheckedModeBanner: false, + home: Scaffold( + backgroundColor: const Color(0xFF111722), + body: Column( + children: [ + Container( + width: double.infinity, + color: const Color(0xFF0B1220), + padding: const EdgeInsets.symmetric(horizontal: 12, vertical: 8), + child: Row( + children: [ + Expanded( + child: Text( + 'SOAK renderScale=${scale.toStringAsFixed(1)} ' + 'cycles=$_cycles step=$_i', + style: const TextStyle(color: Colors.white, fontSize: 13), + ), + ), + TextButton( + onPressed: () => setState(() => _running = !_running), + child: Text(_running ?
'pause' : 'resume', + style: const TextStyle(color: Colors.white)), + ), + ], + ), + ), + Expanded( + child: Center( + child: Container( + color: const Color(0xFF2A3340), + padding: const EdgeInsets.all(28), + // Fixed LOGICAL size; only renderScale (density) changes — the layout must + // stay identical, so any size change in the box is a scale bug. + child: SizedBox( + width: 400, + height: 300, + child: CefWebView( + url: 'about:blank', + controller: _controller, + renderScale: scale, + ), + ), + ), + ), + ), + ], + ), + ), + ); + } +} diff --git a/packages/flutter_cef_macos/macos/Classes/CefProfileHost.swift b/packages/flutter_cef_macos/macos/Classes/CefProfileHost.swift index dc7cdea..0fae4c8 100644 --- a/packages/flutter_cef_macos/macos/Classes/CefProfileHost.swift +++ b/packages/flutter_cef_macos/macos/Classes/CefProfileHost.swift @@ -803,7 +803,7 @@ final class CefProfileHost { let nudged = c.nudgedAt != 0 let action = LivenessProbePolicy.evaluate( sinceLastPresentNs: c.sinceLast, stalenessThresholdNs: livenessStalenessNs, - nudged: nudged, sinceNudgeNs: nudged ? (now &- c.nudgedAt) : 0, + nudged: nudged, sinceNudgeNs: nudged ? 
(now &- c.nudgedAt) : UInt64(0), nudgeGraceNs: livenessGraceNs) switch action { case .healthy: diff --git a/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift b/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift index 721b1b3..2e2dcde 100644 --- a/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift +++ b/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift @@ -611,6 +611,15 @@ final class CefWebSession: NSObject, FlutterTexture { let expW = Int((Double(width) * dpr).rounded()) let expH = Int((Double(height) * dpr).rounded()) let scaleOk = abs(srcW - expW) <= 1 && abs(srcH - expH) <= 1 + // DIAG: while a resize is pending, log every present's actual composited dims vs the + // expected new-surface dims, so a soak test can see whether the size-match ever + // succeeds (if `view_src` is pool-sized, src never equals exp and the resize sticks). + if pendingBuffer != nil, + ProcessInfo.processInfo.environment["FLUTTER_CEF_DEBUG"] != nil { + NSLog("[cefdiag-resize] bid=\(browserId) src=\(srcW)x\(srcH) exp=\(expW)x\(expH) " + + "match=\(scaleOk) logical=\(width)x\(height) dpr=\(dpr) " + + "psid=\(psid) pendSid=\(pendingSurfaceId) sidMatch=\(psid == pendingSurfaceId)") + } if let pending = pendingBuffer, psid != 0, psid == pendingSurfaceId, scaleOk { pixelBuffer = pending pendingBuffer = nil From 64e4fea025f43f9243e0ae092ec03d9fd4e119c1 Mon Sep 17 00:00:00 2001 From: wenkaifan0720 Date: Fri, 26 Jun 2026 23:38:12 -0700 Subject: [PATCH 06/13] harden(osr): shared-host security + correctness + resource + render-floor pass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Architecture-audit-driven hardening of the shared cef_host (many OSR browsers per named profile). 
KEY FINDING FIRST: the "multi-browser transparent render" that drove the audit was a BUG IN THE TEST PROBE, not flutter_cef — sharedhost_html_probe wrapped CefWebView in a shrink-wrapping Stack so the browser laid out at ~50px (the pixel oracle below caught it: surfaces were 66x30). With the layout fixed, a 6-browser shared host renders 6 full-size tiles with 6 distinct gradients (verified via the in-host pixel sampler, since the box was display-asleep). So the shared-host render path is sound; the bounded-pool refactor is NOT needed for render correctness. The remaining audit findings are from static analysis and are real — those are fixed here: SECURITY - OnQuery + InjectChannelShim now gate on frame->IsMain(): the privileged campusHost bridge ('ch:') and host-eval result channel ('eval:') were injected into / accepted from ALL frames incl. cross-origin iframes — an embedded untrusted iframe could drive the host reducer / forge eval results. Now main-frame-only (inject) + subframe-refused (dispatch). CORRECTNESS - DoCreateBrowser refuses a wire id already in g_slots_by_wire_id (kOpCreateFailed) instead of registering — a reuse would let the old browser's OnBeforeClose erase the new slot, leaving an unroutable browser + leaked IOSurface/dst_mtl. - pendingCreates now cleared in BOTH shutdown() and handleHostDeath() (browsersLock-guarded), symmetric with createSendQueue/createInFlight — a host dying between spawn and opReady no longer leaves the pre-opReady create closures dangling. - kOpLoadTrusted/kOpNavigate deferred by wire id to TID_UI (DoNavigateByWireId) + DoNavigate queues pending_nav_url if the browser isn't bound — a loadHtmlString right behind a queued create on a shared host was dropped (slot==null), the kOpAddChannel-class drop. RESOURCE - RLIMIT_NOFILE raised toward the hard cap at cef_host startup: a busy shared host's sockets/pipes/IOSurfaces reach macOS's 256 soft limit; ties to the WebRTC select() fd>=1024 fault on an fd-heavy campus. 
RENDER FLOOR / ROBUSTNESS - Opaque background_color so a missing/late frame reads as blank-white (loading-looking) instead of an invisible transparent ghost (the paints=0 establishment knock-out shows correctly now). OnLoadEnd does WasResized+Invalidate+SendExternalBeginFrame (the visibility-edge kick) instead of a coalesce-able Invalidate alone, so the loaded content is deterministically driven to composite. - [cefdiag] in-host pixel sampler (FLUTTER_CEF_DEBUG): classifies the renderer's frame content/white/clear so render correctness is verifiable WITHOUT a screenshot. VERIFIED: 6-tile shared-host probe renders 6 distinct gradients (K=1) / 5-6 (K=3, the rare loss is the documented watchdog-recovered concurrent-establishment knock-out, tunable via FLUTTER_CEF_ESTAB_WINDOW). ResizeWatchdogPolicy + LivenessProbePolicy standalone tests pass. Example soak probes added (interaction/realsite/recreate/sharedhost) for shared-host coverage. --- example/lib/cull_wedge_probe.dart | 15 ++ example/lib/interaction_soak_probe.dart | 174 ++++++++++++++++ example/lib/realsite_soak_probe.dart | 143 +++++++++++++ example/lib/recreate_soak_probe.dart | 195 ++++++++++++++++++ example/lib/sharedhost_html_probe.dart | 118 +++++++++++ example/macos/Podfile.lock | 4 +- .../macos/Classes/CefProfileHost.swift | 12 ++ .../flutter_cef_macos/native/cef_host/main.mm | 145 +++++++++++-- 8 files changed, 791 insertions(+), 15 deletions(-) create mode 100644 example/lib/interaction_soak_probe.dart create mode 100644 example/lib/realsite_soak_probe.dart create mode 100644 example/lib/recreate_soak_probe.dart create mode 100644 example/lib/sharedhost_html_probe.dart diff --git a/example/lib/cull_wedge_probe.dart b/example/lib/cull_wedge_probe.dart index 7ca0c35..64ae433 100644 --- a/example/lib/cull_wedge_probe.dart +++ b/example/lib/cull_wedge_probe.dart @@ -47,10 +47,25 @@ class _WedgeAppState extends State { bool _big = false; String _status = 'ready'; + int _autoCyclesDone = 0; + @override void 
initState() { super.initState(); _controller.onPageStarted = (_) => _controller.loadHtmlString(_html); + // Self-driving evidence run: a few seconds after first paint, run several wedge + // cycles back-to-back then settle SHOWN, so a screenshot of the final state proves the + // page repainted (F-1) rather than wedged blank — no clicking needed. + Future.delayed(const Duration(seconds: 4), _runAutoCycles); + } + + Future _runAutoCycles() async { + for (var i = 0; i < 4 && mounted; i++) { // stop if disposed mid-run: setState after dispose is an error + await _wedgeCycle(); + if (!mounted) return; setState(() => _autoCyclesDone = i + 1); + await Future.delayed(const Duration(milliseconds: 900)); + } + if (mounted) setState(() => _status = 'auto: $_autoCyclesDone wedge cycles done — page MUST be visible'); } @override diff --git a/example/lib/interaction_soak_probe.dart b/example/lib/interaction_soak_probe.dart new file mode 100644 index 0000000..1d858d9 --- /dev/null +++ b/example/lib/interaction_soak_probe.dart @@ -0,0 +1,174 @@ +// Interaction SOAK probe — reproduces Campus's "degrades after interaction" by hitting all +// the things the single-tile zoom soak did NOT: MULTIPLE tiles on the shared host, CULL +// (setVisible false/true) interleaved with renderScale (dpr) changes, and LOGICAL tile +// resizes. The size-gate + cull + multi-tile interaction is the untested seam. +// +// Each of 4 tiles independently, on a rotating schedule, does one of: change renderScale, +// hide, show, change logical size. A tile that wedges shows blank / frozen clock / wrong +// scale and STAYS that way. With FLUTTER_CEF_DEBUG=1 the [cefdiag-resize] lines reveal a +// STUCK resize (repeated match=false for the same pendSid → the size-gate never promotes). +// +// FLUTTER_CEF_HOST= FLUTTER_CEF_ALLOW_INSECURE_PROFILE=1 FLUTTER_CEF_DEBUG=1 \ +// flutter run -d macos -t lib/interaction_soak_probe.dart +import 'dart:async'; +import 'package:flutter/material.dart'; +import 'package:flutter_cef/flutter_cef.dart'; + +String _html(int i) => ''' + +
+
TILE $i
+
+
+ +'''; +String _bg(int i) => const [ + '#2563eb,#7c3aed', '#db2777,#f59e0b', '#059669,#2563eb', '#7c3aed,#db2777' + ][i % 4]; + +const _scales = [2.0, 3.0, 4.0, 5.0, 6.0, 4.0]; + +void main() => runApp(const App()); + +class App extends StatefulWidget { + const App({super.key}); + @override + State createState() => _AppState(); +} + +class _Tile { + final CefWebController controller = CefWebController(); + double scale = 2.0; + bool visible = true; + bool big = false; + int scaleIdx = 0; +} + +class _AppState extends State { + final _tiles = List.generate(4, (_) => _Tile()); + Timer? _timer; + int _tick = 0; + bool _running = true; + + @override + void initState() { + super.initState(); + for (var i = 0; i < _tiles.length; i++) { + final t = _tiles[i]; + t.controller.onPageStarted = (_) => t.controller.loadHtmlString(_html(i)); + } + Future.delayed(const Duration(seconds: 5), () { // skip if disposed during the delay, else the timer is never cancelled + if (!mounted) return; _timer = Timer.periodic(const Duration(milliseconds: 350), (_) => _step()); + }); + } + + // Each tick, drive ONE action on ONE tile — interleaving zoom / cull / resize across the + // 4 tiles on the shared host, the way real canvas interaction does.
+ void _step() { + if (!mounted || !_running) return; + final t = _tiles[_tick % _tiles.length]; + final action = (_tick ~/ _tiles.length) % 4; + setState(() { + switch (action) { + case 0: // zoom (renderScale / dpr) + t.scaleIdx = (t.scaleIdx + 1) % _scales.length; + t.scale = _scales[t.scaleIdx]; + case 1: // cull off (hide) + t.visible = false; + t.controller.setVisible(false); + case 2: // logical resize WHILE the tile may be hidden + t.big = !t.big; + case 3: // un-cull (show) — F-1 must repaint, size-gate must promote + t.visible = true; + t.controller.setVisible(true); + } + _tick++; + }); + } + + @override + void dispose() { + _timer?.cancel(); + for (final t in _tiles) { + t.controller.dispose(); + } + super.dispose(); + } + + @override + Widget build(BuildContext context) { + return MaterialApp( + debugShowCheckedModeBanner: false, + home: Scaffold( + backgroundColor: const Color(0xFF111722), + body: Column( + children: [ + Container( + width: double.infinity, + color: const Color(0xFF0B1220), + padding: const EdgeInsets.symmetric(horizontal: 12, vertical: 8), + child: Row(children: [ + Expanded( + child: Text('INTERACTION SOAK tick=$_tick', + style: const TextStyle(color: Colors.white, fontSize: 13)), + ), + TextButton( + onPressed: () => setState(() => _running = !_running), + child: Text(_running ? 'pause' : 'resume', + style: const TextStyle(color: Colors.white)), + ), + ]), + ), + Expanded( + child: GridView.count( + crossAxisCount: 2, + padding: const EdgeInsets.all(16), + mainAxisSpacing: 16, + crossAxisSpacing: 16, + children: [ + for (var i = 0; i < _tiles.length; i++) _tileView(i), + ], + ), + ), + ], + ), + ), + ); + } + + Widget _tileView(int i) { + final t = _tiles[i]; + final w = t.big ? 360.0 : 280.0; + final h = t.big ? 
240.0 : 200.0; + return Container( + color: const Color(0xFF2A3340), + alignment: Alignment.center, + child: Stack( + alignment: Alignment.topLeft, + children: [ + SizedBox( + width: w, + height: h, + child: CefWebView( + url: 'about:blank', + controller: t.controller, + renderScale: t.scale, + ), + ), + Container( + color: Colors.black54, + padding: const EdgeInsets.symmetric(horizontal: 4, vertical: 1), + child: Text( + 'T$i s=${t.scale.toStringAsFixed(0)} ${t.visible ? "vis" : "HID"} ${w.toInt()}w', + style: const TextStyle(color: Colors.white, fontSize: 9), + ), + ), + ], + ), + ); + } +} diff --git a/example/lib/realsite_soak_probe.dart b/example/lib/realsite_soak_probe.dart new file mode 100644 index 0000000..01bd6fe --- /dev/null +++ b/example/lib/realsite_soak_probe.dart @@ -0,0 +1,143 @@ +// Real-website SOAK probe — the missing variable: trivial HTML re-rasters in ~1 frame, so +// the size-gate's "wait for a correct-scale frame" always resolves instantly. A REAL site +// (heavy layout, async content, continuous paint) re-rasters SLOWLY, so RAPID renderScale +// (zoom) changes can outrun the re-raster — the renderer never produces a frame at the +// LATEST size, the size-gate never promotes, resizeInFlight sticks → freeze at wrong/old +// scale, "never recovers". This mixes a trivial page with heavy real sites and hammers +// renderScale fast, exactly that scenario. +// +// FLUTTER_CEF_HOST= FLUTTER_CEF_ALLOW_INSECURE_PROFILE=1 FLUTTER_CEF_DEBUG=1 \ +// flutter run -d macos -t lib/realsite_soak_probe.dart +import 'dart:async'; +import 'package:flutter/material.dart'; +import 'package:flutter_cef/flutter_cef.dart'; + +const _trivial = ''' + +
trivial —
+ +'''; + +// Tile 0 = trivial HTML; tiles 1-3 = heavy real sites (loaded via the URL prop, like a +// cefWebview tile — NOT loadHtmlString). +const _urls = [null, 'https://en.wikipedia.org/wiki/Web_browser', + 'https://flutter.dev', 'https://news.ycombinator.com']; + +const _scales = [2.0, 2.5, 3.0, 4.0, 5.0, 6.0, 5.0, 4.0, 3.0, 2.5]; + +void main() => runApp(const App()); + +class App extends StatefulWidget { + const App({super.key}); + @override + State createState() => _AppState(); +} + +class _Tile { + // SHARED named profile → all tiles route to ONE cef_host (like Campus's 'campus-web'), + // serialized by the create-pacer — NOT a host-per-controller. This is the regime Campus + // actually runs and the probes had been missing. + final CefWebController controller = CefWebController(profile: 'soak-shared'); + double scale = 2.0; + int scaleIdx = 0; +} + +class _AppState extends State { + final _tiles = List.generate(4, (_) => _Tile()); + Timer? _timer; + int _tick = 0; + bool _running = true; + + @override + void initState() { + super.initState(); + // Tile 0 loads trivial HTML; the rest navigate to their real URL on first establishment. + _tiles[0].controller.onPageStarted = (_) => _tiles[0].controller.loadHtmlString(_trivial); + // Hammer renderScale FAST (200ms) across all tiles — faster than a heavy page re-rasters. 
+ Future.delayed(const Duration(seconds: 6), () { + _timer = Timer.periodic(const Duration(milliseconds: 200), (_) => _step()); + }); + } + + void _step() { + if (!mounted || !_running) return; + final t = _tiles[_tick % _tiles.length]; + setState(() { + t.scaleIdx = (t.scaleIdx + 1) % _scales.length; + t.scale = _scales[t.scaleIdx]; + _tick++; + }); + } + + @override + void dispose() { + _timer?.cancel(); + for (final t in _tiles) { + t.controller.dispose(); + } + super.dispose(); + } + + @override + Widget build(BuildContext context) { + return MaterialApp( + debugShowCheckedModeBanner: false, + home: Scaffold( + backgroundColor: const Color(0xFF111722), + body: Column(children: [ + Container( + width: double.infinity, + color: const Color(0xFF0B1220), + padding: const EdgeInsets.symmetric(horizontal: 12, vertical: 8), + child: Row(children: [ + Expanded( + child: Text('REAL-SITE SOAK (rapid zoom on heavy pages) tick=$_tick', + style: const TextStyle(color: Colors.white, fontSize: 13)), + ), + TextButton( + onPressed: () => setState(() => _running = !_running), + child: Text(_running ? 'pause' : 'resume', + style: const TextStyle(color: Colors.white)), + ), + ]), + ), + Expanded( + child: GridView.count( + crossAxisCount: 2, + padding: const EdgeInsets.all(12), + mainAxisSpacing: 12, + crossAxisSpacing: 12, + children: [for (var i = 0; i < _tiles.length; i++) _tileView(i)], + ), + ), + ]), + ), + ); + } + + Widget _tileView(int i) { + final t = _tiles[i]; + return Container( + color: const Color(0xFF2A3340), + alignment: Alignment.center, + child: Stack(alignment: Alignment.topLeft, children: [ + SizedBox( + width: 320, + height: 220, + child: CefWebView( + url: _urls[i] ?? 'about:blank', + controller: t.controller, + renderScale: t.scale, + ), + ), + Container( + color: Colors.black54, + padding: const EdgeInsets.symmetric(horizontal: 4, vertical: 1), + child: Text('T$i ${_urls[i] == null ? 
"trivial" : Uri.parse(_urls[i]!).host} s=${t.scale.toStringAsFixed(0)}', + style: const TextStyle(color: Colors.white, fontSize: 9)), + ), + ]), + ); + } +} diff --git a/example/lib/recreate_soak_probe.dart b/example/lib/recreate_soak_probe.dart new file mode 100644 index 0000000..936808d --- /dev/null +++ b/example/lib/recreate_soak_probe.dart @@ -0,0 +1,195 @@ +// Recreate SOAK probe — mimics Campus's CefSessionController.recover() (the one pattern +// the other probes never exercised): on a paint-stall / F-6 stall Campus DISPOSES the +// controller, builds a FRESH one, and REMOUNTS the CefWebView against it (a generation +// ValueKey bump). This probe drives that recreate cycle interleaved with zoom (renderScale) +// and cull (setVisible) — the suspected source of "looks fine, then after interaction +// degrades to blank/freeze/wrong-size, never recovers". +// +// Each tile, on a schedule: change renderScale, hide, show, or RECREATE (dispose+new+remount). +// A tile that wedges after a recreate shows blank / frozen clock / wrong scale and stays. +// FLUTTER_CEF_DEBUG=1 → [cefdiag-resize] shows whether the recreated controller's first +// correct-scale frame ever promotes. +// +// FLUTTER_CEF_HOST= FLUTTER_CEF_ALLOW_INSECURE_PROFILE=1 FLUTTER_CEF_DEBUG=1 \ +// flutter run -d macos -t lib/recreate_soak_probe.dart +import 'dart:async'; +import 'package:flutter/material.dart'; +import 'package:flutter_cef/flutter_cef.dart'; + +String _html(int i, int gen) => ''' + +
+
TILE $i · gen $gen
+
+
+ +'''; + +const _scales = [2.0, 3.0, 4.0, 5.0, 6.0, 4.0]; + +void main() => runApp(const App()); + +class App extends StatefulWidget { + const App({super.key}); + @override + State createState() => _AppState(); +} + +class _Tile { + CefWebController controller = CefWebController(); + int gen = 0; + double scale = 2.0; + int scaleIdx = 0; + bool visible = true; + int recreates = 0; +} + +class _AppState extends State { + final _tiles = List.generate(2, (_) => _Tile()); + Timer? _timer; + int _tick = 0; + bool _running = true; + + @override + void initState() { + super.initState(); + for (var i = 0; i < _tiles.length; i++) { + _wire(i); + } + Future.delayed(const Duration(seconds: 5), () { + _timer = Timer.periodic(const Duration(milliseconds: 400), (_) => _step()); + }); + } + + void _wire(int i) { + final t = _tiles[i]; + t.controller.onPageStarted = (_) => t.controller.loadHtmlString(_html(i, t.gen)); + } + + // Mimic CefSessionController.recover(): build a fresh controller, bump the generation + // (remount key), dispose the old. The CefWebView below is keyed on gen, so it remounts + // and re-create()s against the new controller — exactly Campus's recover path. + void _recreate(int i) { + final t = _tiles[i]; + final old = t.controller; + t.controller = CefWebController(); + t.gen++; + t.recreates++; + _wire(i); + // ignore: discarded_futures + old.dispose(); + } + + void _step() { + if (!mounted || !_running) return; + final t = _tiles[_tick % _tiles.length]; + final action = (_tick ~/ _tiles.length) % 5; + setState(() { + switch (action) { + case 0: + t.scaleIdx = (t.scaleIdx + 1) % _scales.length; + t.scale = _scales[t.scaleIdx]; + case 1: + t.visible = false; + t.controller.setVisible(false); + case 2: + t.visible = true; + t.controller.setVisible(true); + case 3: + _recreate(t == _tiles[0] ? 
0 : 1); // RECREATE — the suspect path + case 4: + // recreate WHILE at a non-default scale (the recreate-during-zoom interaction) + t.scaleIdx = (t.scaleIdx + 2) % _scales.length; + t.scale = _scales[t.scaleIdx]; + _recreate(t == _tiles[0] ? 0 : 1); + } + _tick++; + }); + } + + @override + void dispose() { + _timer?.cancel(); + for (final t in _tiles) { + t.controller.dispose(); + } + super.dispose(); + } + + @override + Widget build(BuildContext context) { + return MaterialApp( + debugShowCheckedModeBanner: false, + home: Scaffold( + backgroundColor: const Color(0xFF111722), + body: Column( + children: [ + Container( + width: double.infinity, + color: const Color(0xFF0B1220), + padding: const EdgeInsets.symmetric(horizontal: 12, vertical: 8), + child: Row(children: [ + Expanded( + child: Text( + 'RECREATE SOAK tick=$_tick ' + 'recreates=${_tiles.map((t) => t.recreates).join(",")}', + style: const TextStyle(color: Colors.white, fontSize: 13), + ), + ), + TextButton( + onPressed: () => setState(() => _running = !_running), + child: Text(_running ? 'pause' : 'resume', + style: const TextStyle(color: Colors.white)), + ), + ]), + ), + Expanded( + child: Row( + children: [ + for (var i = 0; i < _tiles.length; i++) Expanded(child: _tileView(i)), + ], + ), + ), + ], + ), + ), + ); + } + + Widget _tileView(int i) { + final t = _tiles[i]; + return Container( + color: const Color(0xFF2A3340), + alignment: Alignment.center, + child: Stack( + alignment: Alignment.topLeft, + children: [ + SizedBox( + width: 320, + height: 240, + // Keyed on gen → remounts against the fresh controller on recreate (Campus's + // generation-keyed body). + child: CefWebView( + key: ValueKey('tile$i-gen${t.gen}'), + url: 'about:blank', + controller: t.controller, + renderScale: t.scale, + ), + ), + Container( + color: Colors.black54, + padding: const EdgeInsets.symmetric(horizontal: 4, vertical: 1), + child: Text( + 'T$i gen${t.gen} s=${t.scale.toStringAsFixed(0)} ${t.visible ? 
"vis" : "HID"}', + style: const TextStyle(color: Colors.white, fontSize: 9), + ), + ), + ], + ), + ); + } +} diff --git a/example/lib/sharedhost_html_probe.dart b/example/lib/sharedhost_html_probe.dart new file mode 100644 index 0000000..c6d305b --- /dev/null +++ b/example/lib/sharedhost_html_probe.dart @@ -0,0 +1,118 @@ +// Shared-host loadHtmlString probe — reproduces Campus's agent_ui scenario where, on a fresh +// launch, only ~2 of 6 agent_ui tiles paint and the rest stay BLANK. agent_ui tiles all share +// ONE named-profile cef_host and load their UI via onPageStarted -> loadHtmlString (NOT a URL). +// This mounts 6 such tiles at once (the queued-create burst on the shared host) and labels each +// with its index + a clock, so a blank tile is obvious. Run via `flutter run` so cef_host's +// stdout (FIRSTPAINT / kOpAddChannel / errors) is captured — the data the open-launched Campus +// app hides. +// +// FLUTTER_CEF_HOST= FLUTTER_CEF_ALLOW_INSECURE_PROFILE=1 FLUTTER_CEF_DEBUG=1 \ +// flutter run -d macos -t lib/sharedhost_html_probe.dart +import 'dart:io'; +import 'package:flutter/material.dart'; +import 'package:flutter_cef/flutter_cef.dart'; + +// Tile count configurable via PROBE_N (default 6) to bracket the per-host browser limit. +final int _probeN = int.tryParse(Platform.environment['PROBE_N'] ?? '') ?? 6; + +String _html(int i) => ''' + +
TILE $i (html)
+
+ +'''; + +void main() => runApp(const App()); + +class App extends StatefulWidget { + const App({super.key}); + @override + State createState() => _AppState(); +} + +class _AppState extends State { + // 6 controllers, ALL on one shared named profile (like agent_ui's 'agent-ui-cef'), each + // loading its UI via onPageStarted -> loadHtmlString — the exact agent_ui pattern. + late final List _controllers; + + @override + void initState() { + super.initState(); + _controllers = List.generate(_probeN, (i) { + final c = CefWebController(profile: 'agent-ui-test'); + c.onPageStarted = (url) { + // Only inject on the initial about:blank — loadHtmlString navigates to a data: URL + // which re-fires onPageStarted, so an unconditional load is an infinite reload loop. + if (url == 'about:blank') { + // ignore: avoid_print + print('PROBE loadHtmlString slot=$i'); + c.loadHtmlString(_html(i)); + } + }; + return c; + }); + } + + @override + void dispose() { + for (final c in _controllers) { + c.dispose(); + } + super.dispose(); + } + + @override + Widget build(BuildContext context) { + return MaterialApp( + debugShowCheckedModeBanner: false, + home: Scaffold( + backgroundColor: const Color(0xFF111722), + body: Padding( + padding: const EdgeInsets.all(12), + child: GridView.count( + crossAxisCount: 3, + mainAxisSpacing: 12, + crossAxisSpacing: 12, + children: [ + for (var i = 0; i < _controllers.length; i++) + Container( + color: const Color(0xFF2A3340), + // No alignment + tight cell constraints -> the Container fills the cell, the + // Stack inherits tight constraints and fills, so Positioned.fill gives the + // CefWebView the FULL cell size (the earlier shrink-wrap made it ~50px). 
+ child: Stack( + children: [ + Positioned.fill( + child: CefWebView( + url: 'about:blank', + controller: _controllers[i], + renderScale: 2.0, + ), + ), + Positioned( + top: 0, + left: 0, + child: Container( + color: Colors.black54, + padding: const EdgeInsets.symmetric( + horizontal: 4, vertical: 1), + child: Text('slot $i', + style: const TextStyle( + color: Colors.white, fontSize: 9)), + ), + ), + ], + ), + ), + ], + ), + ), + ), + ); + } +} diff --git a/example/macos/Podfile.lock b/example/macos/Podfile.lock index 04078c0..9df0e63 100644 --- a/example/macos/Podfile.lock +++ b/example/macos/Podfile.lock @@ -1,5 +1,5 @@ PODS: - - flutter_cef_macos (0.1.3): + - flutter_cef_macos (0.2.0): - FlutterMacOS - FlutterMacOS (1.0.0) @@ -14,7 +14,7 @@ EXTERNAL SOURCES: :path: Flutter/ephemeral SPEC CHECKSUMS: - flutter_cef_macos: 326855d498418476cd4a2f1e47bf72997ec4bab3 + flutter_cef_macos: f4ec14a9d75c0a198b6c8ba620ec4e56d4ad22f0 FlutterMacOS: d0db08ddef1a9af05a5ec4b724367152bb0500b1 PODFILE CHECKSUM: 54d867c82ac51cbd61b565781b9fada492027009 diff --git a/packages/flutter_cef_macos/macos/Classes/CefProfileHost.swift b/packages/flutter_cef_macos/macos/Classes/CefProfileHost.swift index 0fae4c8..c8e9228 100644 --- a/packages/flutter_cef_macos/macos/Classes/CefProfileHost.swift +++ b/packages/flutter_cef_macos/macos/Classes/CefProfileHost.swift @@ -922,6 +922,12 @@ final class CefProfileHost { createSendQueue.removeAll() createInFlight.removeAll() writeLock.unlock() + // Also abandon pre-opReady queued creates (pendingCreates is browsersLock-guarded, not + // writeLock) so a host dying between spawn and opReady tears down all THREE create-state + // queues symmetrically — the old asymmetry left these closures dangling. + browsersLock.lock() + pendingCreates.removeAll() + browsersLock.unlock() // CEF-2a/b: drop ALL relays (each a listener + any client) before tearing down // the pipe, so none keeps bridging into a closing fd. 
Snapshot under the lock, // clear the dict + onCdpMessage, then stop each OUTSIDE the lock (stop() may @@ -1210,6 +1216,12 @@ final class CefProfileHost { spawnedPid = 0 let died = onHostDied writeLock.unlock() + // Abandon pre-opReady queued creates too (pendingCreates is browsersLock-guarded) — + // symmetric with the createSendQueue/createInFlight teardown above; the onHostDied path + // still emits processGone for the sessions left in `browsers`. + browsersLock.lock() + pendingCreates.removeAll() + browsersLock.unlock() // The host is gone: tear down CDP relays (free their localhost listeners + // clients) and FAIL any in-flight targetId waiters so enableAgentControl // callers don't hang forever. Mirrors shutdown()'s teardown — snapshot under diff --git a/packages/flutter_cef_macos/native/cef_host/main.mm b/packages/flutter_cef_macos/native/cef_host/main.mm index 515223b..ef7301f 100644 --- a/packages/flutter_cef_macos/native/cef_host/main.mm +++ b/packages/flutter_cef_macos/native/cef_host/main.mm @@ -67,6 +67,7 @@ #include #include #include +#include #include #include #include @@ -719,6 +720,39 @@ void CompositeMetalLocked(IOSurfaceRef view_src) { // scope since `sw/sh` below are inside the @autoreleasepool. const int srcW = view_src ? static_cast(IOSurfaceGetWidth(view_src)) : 0; const int srcH = view_src ? static_cast(IOSurfaceGetHeight(view_src)) : 0; + // DIAG (screen-independent verification — the box may be display-asleep, so a screenshot + // can't tell content from blank): sample the renderer's composited frame on a throttle and + // classify a 9-point grid. content>0 means real pixels landed; white==9 means only the + // opaque background (page didn't paint content); clear==9 means a zero-filled / never- + // committed frame (the shared-GPU multiplex failure). Under FLUTTER_CEF_DEBUG only. 
+ if (view_src && (slot_->diag_paint_count % 60) == 2 && + std::getenv("FLUTTER_CEF_DEBUG") && + IOSurfaceLock(view_src, kIOSurfaceLockReadOnly, nullptr) == kIOReturnSuccess) { + const auto* base = static_cast(IOSurfaceGetBaseAddress(view_src)); + const size_t bpr = IOSurfaceGetBytesPerRow(view_src); + int content = 0, white = 0, clear = 0; + uint32_t center = 0; + const int xs[3] = {srcW / 4, srcW / 2, (3 * srcW) / 4}; + const int ys[3] = {srcH / 4, srcH / 2, (3 * srcH) / 4}; + for (int yi = 0; yi < 3; ++yi) + for (int xi = 0; xi < 3; ++xi) { + const uint8_t* p = base + static_cast(ys[yi]) * bpr + + static_cast(xs[xi]) * 4; // BGRA8 + const uint8_t b = p[0], g = p[1], r = p[2], a = p[3]; + if (xi == 1 && yi == 1) + center = (uint32_t)b | ((uint32_t)g << 8) | ((uint32_t)r << 16) | ((uint32_t)a << 24); + if (a == 0) clear++; + else if (r > 240 && g > 240 && b > 240) white++; + else if (r < 12 && g < 12 && b < 12) clear++; // black == empty too + else content++; + } + IOSurfaceUnlock(view_src, kIOSurfaceLockReadOnly, nullptr); + char buf[160]; + snprintf(buf, sizeof(buf), + "diagpx wire=%u %dx%d content=%d white=%d clear=%d center=0x%08x", + slot_->browser_id, srcW, srcH, content, white, clear, center); + SendLog(slot_->browser_id, buf); + } if (view_src && EnsureMetal()) { @autoreleasepool { const int sw = static_cast(IOSurfaceGetWidth(view_src)); @@ -1007,19 +1041,32 @@ void OnLoadingStateChange(CefRefPtr, bool isLoading, void OnLoadStart(CefRefPtr, CefRefPtr frame, TransitionType) override { if (!frame) return; - if (frame->IsMain()) + if (frame->IsMain()) { SendUtf8(slot_->browser_id, kOpPageStart, frame->GetURL().ToString()); - // (Re)install JS-channel shims for this freshly-loaded frame. - for (const auto& name : g_channels) InjectChannelShim(frame, name); + // SECURITY: install the JS-channel shims ONLY into the MAIN frame. The shims expose the + // privileged campusHost bridge (window. 
-> window.cefQuery 'ch:'); injecting them + // into cross-origin SUBFRAMES would hand an untrusted embedded iframe that bridge. (The + // previous code injected into every frame.) OnQuery also refuses subframe 'ch:'/'eval:'. + for (const auto& name : g_channels) InjectChannelShim(frame, name); + } } void OnLoadEnd(CefRefPtr browser, CefRefPtr frame, int /*httpStatusCode*/) override { if (frame && frame->IsMain()) { SendUtf8(slot_->browser_id, kOpPageFinish, frame->GetURL().ToString()); - // C1: force a repaint when the main frame finishes — a first paint dropped - // during load (e.g. a GPU surface not yet ready) self-heals here instead of - // leaving a permanently blank texture with no signal. - if (browser && browser->GetHost()) browser->GetHost()->Invalidate(PET_VIEW); + // C1 + RENDER FLOOR: force a repaint when the main frame finishes. Invalidate(PET_VIEW) + // ALONE is coalesce-able — the scheduler can drop it, which on a shared GPU/Viz process + // under a multi-browser establishment burst is exactly when the real-content first frame + // gets lost, leaving a permanently blank tile though the page loaded. Mirror the proven + // DoSetVisible visibility-edge kick: re-assert size + damage + a NON-coalesce-able + // SendExternalBeginFrame, which deterministically drives one renderer frame the scheduler + // cannot swallow. (slot_->visible gate: a hidden tile must stay paused — F-2.) + if (browser && browser->GetHost() && slot_->visible) { + auto h = browser->GetHost(); + h->WasResized(); + h->Invalidate(PET_VIEW); + h->SendExternalBeginFrame(); + } } } void OnLoadError(CefRefPtr, CefRefPtr, ErrorCode code, @@ -1118,16 +1165,23 @@ bool OnCursorChange(CefRefPtr, CefCursorHandle, // window.cefQuery; queries land here. We forward the request string to the // host: "eval::" for a runJavaScriptReturningResult result, // "ch::" for a JS-channel post. 
- bool OnQuery(CefRefPtr, CefRefPtr, int64_t, + bool OnQuery(CefRefPtr, CefRefPtr frame, int64_t, const CefString& request, bool, CefRefPtr callback) override { std::string r = request.ToString(); + // SECURITY: 'eval:' (host-eval result channel) and 'ch:' (campusHost bridge) are + // PRIVILEGED — they reach the trusted host eval/result path and the agent_ui reducer. The + // shim is injected per-frame, so a cross-origin / untrusted IFRAME could forge them. Honor + // them ONLY from the MAIN frame (the host-trusted document); refuse subframe queries. + const bool main_frame = !frame || frame->IsMain(); if (r.rfind("eval:", 0) == 0) { + if (!main_frame) { callback->Failure(403, "subframe"); return true; } SendUtf8(slot_->browser_id, kOpEvalResult, r.substr(5)); callback->Success(CefString()); return true; } if (r.rfind("ch:", 0) == 0) { + if (!main_frame) { callback->Failure(403, "subframe"); return true; } SendUtf8(slot_->browser_id, kOpChannelMsg, r.substr(3)); callback->Success(CefString()); return true; @@ -1331,6 +1385,19 @@ void OnContextInitialized() override { void DoCreateBrowser(uint32_t wire_id, int w, int h, double dpr, uint32_t sid, std::string url) { CEF_REQUIRE_UI_THREAD(); + // WIRE-ID REUSE GUARD: the Swift side allocates ids monotonically, so a collision should be + // impossible — but if one ever happened, registering the new slot would let the OLD browser's + // OnBeforeClose later erase the NEW slot (g_slots_by_wire_id.erase(id)), leaving an unroutable + // browser + a leaked IOSurface/dst_mtl. Fail loudly + tell the host (kOpCreateFailed advances + // its pacer / drops the session) instead of silently corrupting cross-tile routing. 
+ { + std::lock_guard lock(g_slots_mutex); + if (g_slots_by_wire_id.count(wire_id)) { + SendLog(wire_id, "createBrowser: wire id already in use — refusing (id-reuse bug)"); + SendFrame(wire_id, kOpCreateFailed, nullptr, 0); + return; + } + } auto slot = std::make_shared(); slot->browser_id = wire_id; slot->width = w < 1 ? 1 : w; @@ -1371,6 +1438,13 @@ void DoCreateBrowser(uint32_t wire_id, int w, int h, double dpr, uint32_t sid, window_info.external_begin_frame_enabled = true; CefBrowserSettings settings; settings.windowless_frame_rate = 60; + // RENDER FLOOR: paint an OPAQUE background. With the default (alpha 0) a windowless + // browser paints transparent, so a DROPPED renderer frame — the shared-GPU multiplex + // failure where the 2nd+ browser's CompositorFrame never lands — is INVISIBLE (the canvas + // shows through) and indistinguishable from "loading". Opaque means a missing frame reads + // as a blank white tile (correct-looking for a not-yet-painted page) instead of a ghost, + // and makes the failure diagnosable. Pages with their own bg paint over this normally. + settings.background_color = CefColorSetARGB(255, 255, 255, 255); // about:blank-first: for a real http(s) URL, establish on about:blank (near-instant // first frame → the pacer's establishment slot frees fast) and defer the real // navigation to first paint. Skip for data:/file:/about: (already instant) and when the @@ -1488,7 +1562,15 @@ void DoResize(const std::shared_ptr& slot, int w, int h, } void DoNavigate(const std::shared_ptr& slot, const std::string& url) { - if (!slot->browser) return; + if (!slot->browser) { + // The slot exists but the browser is not yet BOUND (OnAfterCreated pending) — e.g. a + // loadHtmlString that arrived right behind a queued createBrowser in a shared-host burst + // (6 agent_ui tiles created at once). 
Defer instead of dropping: the first-paint handler + // applies pending_nav_url once the browser binds, and a trusted load keeps its armed + // exemption in trusted_pending. Dropping here is exactly why such a burst stayed blank. + slot->pending_nav_url = url; + return; + } CefRefPtr f = slot->browser->GetMainFrame(); if (f) f->LoadURL(url); } @@ -1506,6 +1588,22 @@ void DoNavigateTrusted(const std::shared_ptr& slot, DoNavigate(slot, url); } +// Navigate / loadTrusted resolved by wire id ON the UI thread, not the reader thread. On a +// shared host the createBrowser for this id is queued ahead of us on TID_UI (FIFO ordering), +// so LookupWireId is null on the reader thread but registered by the time this task runs — +// dropping the op on the reader thread (the old `if (!slot) break`) is why a burst of tiles +// that loadHtmlString right after create stayed blank. Mirrors the kOpAddChannel fix. With +// trusted=true the allowlist exemption is armed and a not-yet-bound browser is tolerated via +// pending_nav_url (DoNavigate above). +void DoNavigateByWireId(uint32_t wire_id, std::string url, bool trusted) { + auto slot = LookupWireId(wire_id); + if (!slot) return; // genuinely disposed before the nav landed + if (trusted) + DoNavigateTrusted(slot, url); + else + DoNavigate(slot, url); +} + void DoReload(const std::shared_ptr& slot) { if (slot->browser) slot->browser->Reload(); } @@ -1965,15 +2063,20 @@ void IpcReadLoop() { break; } case kOpNavigate: { - if (!slot) break; + // Resolve by wire id on TID_UI (see DoNavigateByWireId): do NOT require the slot + // here, or a nav landing behind a still-queued create on a shared host is dropped. 
std::string url(reinterpret_cast(p), plen); - CefPostTask(TID_UI, base::BindOnce(&DoNavigate, slot, url)); + CefPostTask(TID_UI, + base::BindOnce(&DoNavigateByWireId, wire_id, url, false)); break; } case kOpLoadTrusted: { - if (!slot) break; + // Same: a loadHtmlString right behind a queued create (a 6-tile agent_ui burst) + // must not be dropped — that was the blank-tile bug. Resolved on TID_UI, FIFO-after + // the create, and tolerant of a not-yet-bound browser via pending_nav_url. std::string url(reinterpret_cast(p), plen); - CefPostTask(TID_UI, base::BindOnce(&DoNavigateTrusted, slot, url)); + CefPostTask(TID_UI, + base::BindOnce(&DoNavigateByWireId, wire_id, url, true)); break; } case kOpReload: @@ -2219,6 +2322,22 @@ int ConnectUnixSocket(const std::string& path) { } // namespace int main(int argc, char* argv[]) { + // Raise the open-file limit early. A busy shared host runs many OSR browsers, each holding + // sockets/pipes plus IOSurfaces, against macOS's low default soft limit (256) — and an + // fd-heavy campus reaches the documented non-fatal WebRTC select() fd>=1024 fault. Lift the + // soft limit toward the hard limit so fd headroom isn't the reachable ceiling. + { + struct rlimit rl; + if (getrlimit(RLIMIT_NOFILE, &rl) == 0) { + const rlim_t kCap = 10240; // macOS caps RLIMIT_NOFILE at OPEN_MAX (10240) + rlim_t target = + (rl.rlim_max == RLIM_INFINITY || rl.rlim_max > kCap) ? kCap : rl.rlim_max; + if (rl.rlim_cur < target) { + rl.rlim_cur = target; + setrlimit(RLIMIT_NOFILE, &rl); + } + } + } #if defined(CEF_HOST_MULTIPROCESS) && defined(CEF_HOST_ADHOC) // Disable Chromium 144's Mach-port peer-requirement validation for the whole // process tree. 
The child processes read this policy from an env var (NOT the From f6b661075ebcf779b9dbb5424da020dba46fbd9e Mon Sep 17 00:00:00 2001 From: wenkaifan0720 Date: Sat, 27 Jun 2026 07:18:53 -0700 Subject: [PATCH 07/13] test(osr): conformance harness (the oracle) + resize supersede + want-dims pixel oracle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The production-grade instrument the whack-a-mole was missing: a SCREEN-INDEPENDENT conformance harness that reproduces Campus's full CEF workload outside Campus and asserts the two invariants that matter — never BLANK, never WRONG-SIZE — headlessly (the dev box is often display-asleep). - example/lib/conformance_harness.dart: N tiles on one shared profile, auto-cycles idle→resize→zoom→cull→recreate→combo storms (HARNESS_HARD=1 = static pages + zoom→6 + combo resize-while-zoomed, the regime that wedges). Each tile paints a KNOWN center color. - cef_host [cefdiag] diagpx: now logs painted=WxH want=WxH (want = logical×dpr) + a 9-point content/white/clear classification + center color. painted< grace). A later step can also read served dims via +// a controller callback; for now the native oracle is authoritative and headless. +// +// PHASES (auto-cycled; also drivable by the buttons): idle → resize-storm (animate each tile's +// box at ~60Hz) → zoom-storm (ramp renderScale across quantization thresholds) → cull-storm +// (hide/show) → recreate-storm (dispose+rebuild). One shared profile = one cef_host, like Campus. +// +// FLUTTER_CEF_HOST= FLUTTER_CEF_ALLOW_INSECURE_PROFILE=1 FLUTTER_CEF_DEBUG=1 \ +// flutter run -d macos -t lib/conformance_harness.dart +import 'dart:async'; +import 'dart:io'; +import 'package:flutter/material.dart'; +import 'package:flutter_cef/flutter_cef.dart'; + +final int _tileCount = int.tryParse(Platform.environment['HARNESS_N'] ?? '') ?? 
12; +// HARD mode: STATIC pages (no animation → exactly one frame per resize, the classic size-gate +// wedge), zoom up to 6 (huge surfaces), and a combined resize+zoom storm. This is the regime +// that actually reproduces Campus's blank/4x; the default mode is the gentle smoke test. +final bool _hard = (Platform.environment['HARNESS_HARD'] ?? '') == '1'; + +// Per-tile known center color (page bg). diagpx center==this ⇒ content present; bg/clear ⇒ blank. +const _colors = [ + 0xFFE53935, 0xFF8E24AA, 0xFF3949AB, 0xFF039BE5, 0xFF00897B, 0xFF7CB342, + 0xFFFDD835, 0xFFFB8C00, 0xFFD81B60, 0xFF5E35B1, 0xFF00ACC1, 0xFF43A047, +]; +int _colorOf(int i) => _colors[i % _colors.length]; +String _hex(int argb) => '0x${argb.toRadixString(16).padLeft(8, '0')}'; + +String _html(int i) { + final c = _colorOf(i); + final css = '#${(c & 0xFFFFFF).toRadixString(16).padLeft(6, '0')}'; + // HARD: a STATIC page (no setInterval) paints exactly ONE frame per resize — if that frame's + // dims don't exactly match the size-gate's expectation, it wedges forever (the classic bug). + // Default: a ticking clock (animating) so the pump keeps producing frames. + final ticker = _hard + ? '' + : ""; + return ''' + +
TILE $i
static
+ $ticker +'''; +} + +enum Phase { idle, resizeStorm, zoomStorm, cullStorm, recreateStorm, comboStorm } + +final _zoomScales = _hard + ? const [2.0, 3.0, 4.0, 5.0, 6.0, 5.0, 4.0, 3.0] // up to 6 → huge surfaces + : const [2.0, 2.5, 3.0, 4.0, 5.0, 4.0, 3.0, 2.5]; + +void main() => runApp(const HarnessApp()); + +class HarnessApp extends StatefulWidget { + const HarnessApp({super.key}); + @override + State createState() => _HarnessAppState(); +} + +class _Tile { + CefWebController controller; + int gen = 0; + double scale = 2.0; + int scaleIdx = 0; + bool visible = true; + double sizeT = 0; // 0..1 animation param for the resize storm + _Tile(this.controller); +} + +class _HarnessAppState extends State { + late List<_Tile> _tiles; + Phase _phase = Phase.idle; + Timer? _driver; + int _frame = 0; + bool _auto = true; + + @override + void initState() { + super.initState(); + _tiles = List.generate(_tileCount, (i) => _Tile(_mkController(i))); + // 60Hz driver for the storms. + _driver = Timer.periodic(const Duration(milliseconds: 16), (_) => _tick()); + // Auto-cycle phases every 6s so an unattended run exercises everything. 
+ Future.delayed(const Duration(seconds: 5), _cyclePhases); + } + + CefWebController _mkController(int i) { + final c = CefWebController(profile: 'conf-harness'); + c.onPageStarted = (url) { + if (url == 'about:blank') c.loadHtmlString(_html(i)); + }; + return c; + } + + void _cyclePhases() { + if (!mounted || !_auto) return; + const order = [ + Phase.idle, Phase.resizeStorm, Phase.zoomStorm, Phase.cullStorm, + Phase.recreateStorm, Phase.comboStorm, Phase.idle, + ]; + final next = order[(order.indexOf(_phase) + 1) % order.length]; + _setPhase(next); + Future.delayed(const Duration(seconds: 6), _cyclePhases); + } + + void _setPhase(Phase p) { + setState(() => _phase = p); + // ignore: avoid_print + print('[HARNESS] === PHASE ${p.name} ==='); + } + + void _tick() { + if (!mounted) return; + _frame++; + switch (_phase) { + case Phase.resizeStorm: + // Animate every tile's logical box continuously — the resize storm that wedged Campus. + setState(() { + for (final t in _tiles) { + t.sizeT = (t.sizeT + 0.02) % 1.0; + } + }); + break; + case Phase.zoomStorm: + // Step renderScale across quantization thresholds every ~300ms. + if (_frame % 18 == 0) { + setState(() { + for (final t in _tiles) { + t.scaleIdx = (t.scaleIdx + 1) % _zoomScales.length; + t.scale = _zoomScales[t.scaleIdx]; + _logGeom(t); + } + }); + } + break; + case Phase.cullStorm: + // Hide/show every ~500ms. + if (_frame % 30 == 0) { + setState(() { + for (final t in _tiles) { + t.visible = !t.visible; + t.controller.setVisible(t.visible); + } + }); + } + break; + case Phase.recreateStorm: + // Recreate ~2 tiles per second (staggered) — the recover() storm. + if (_frame % 30 == 0) { + final i = (_frame ~/ 30) % _tiles.length; + _recreate(i); + } + break; + case Phase.comboStorm: + // WORST CASE: animate the box (reallocates the surface) AND step renderScale (huge, + // re-rasters the whole page) together — a big surface realloc + full re-raster every + // few frames. 
This is the YouTube-zoomed-and-resized scenario that wedged Campus. + setState(() { + for (final t in _tiles) { + t.sizeT = (t.sizeT + 0.02) % 1.0; + } + if (_frame % 12 == 0) { + for (final t in _tiles) { + t.scaleIdx = (t.scaleIdx + 1) % _zoomScales.length; + t.scale = _zoomScales[t.scaleIdx]; + } + } + }); + break; + case Phase.idle: + break; + } + } + + void _recreate(int i) { + final t = _tiles[i]; + final old = t.controller; + setState(() { + t.controller = _mkController(i); + t.gen++; + t.visible = true; + }); + // ignore: avoid_print + print('[HARNESS] tile=$i RECREATE gen=${t.gen}'); + // ignore: discarded_futures + old.dispose(); + } + + void _logGeom(_Tile t) { + final i = _tiles.indexOf(t); + // ignore: avoid_print + print('[HARNESS] tile=$i phase=${_phase.name} scale=${t.scale} ' + 'center=${_hex(_colorOf(i))}'); + } + + @override + void dispose() { + _driver?.cancel(); + for (final t in _tiles) { + t.controller.dispose(); + } + super.dispose(); + } + + @override + Widget build(BuildContext context) { + final cols = (_tileCount <= 4) ? 2 : (_tileCount <= 9 ? 3 : 4); + return MaterialApp( + debugShowCheckedModeBanner: false, + home: Scaffold( + backgroundColor: const Color(0xFF0B1220), + body: Column(children: [ + Container( + color: const Color(0xFF111827), + padding: const EdgeInsets.symmetric(horizontal: 12, vertical: 8), + child: Row(children: [ + Expanded( + child: Text('CONFORMANCE phase=${_phase.name} frame=$_frame ' + 'tiles=$_tileCount', + style: const TextStyle(color: Colors.white, fontSize: 13)), + ), + for (final p in Phase.values) + Padding( + padding: const EdgeInsets.only(left: 6), + child: TextButton( + onPressed: () { + _auto = false; + _setPhase(p); + }, + child: Text(p.name, + style: TextStyle( + color: _phase == p ? 
Colors.amber : Colors.white70, + fontSize: 11)), + ), + ), + ]), + ), + Expanded( + child: GridView.count( + crossAxisCount: cols, + padding: const EdgeInsets.all(10), + mainAxisSpacing: 10, + crossAxisSpacing: 10, + children: [for (var i = 0; i < _tileCount; i++) _cell(i)], + ), + ), + ]), + ), + ); + } + + Widget _cell(int i) { + final t = _tiles[i]; + // Resize storm animates the inner box between 55% and 100% of the cell. + final animating = _phase == Phase.resizeStorm || _phase == Phase.comboStorm; + final f = animating ? (0.55 + 0.45 * (0.5 - (t.sizeT - 0.5).abs()) * 2) : 1.0; + return Container( + color: const Color(0xFF1F2937), + alignment: Alignment.center, + child: FractionallySizedBox( + widthFactor: f.clamp(0.4, 1.0), + heightFactor: f.clamp(0.4, 1.0), + child: Stack(children: [ + Positioned.fill( + child: CefWebView( + key: ValueKey('tile$i-gen${t.gen}'), + url: 'about:blank', + controller: t.controller, + renderScale: t.scale, + ), + ), + Positioned( + top: 0, + left: 0, + child: Container( + color: Colors.black54, + padding: const EdgeInsets.symmetric(horizontal: 3), + child: Text('$i ${t.visible ? "" : "HID"}', + style: const TextStyle(color: Colors.white, fontSize: 9)), + ), + ), + ]), + ), + ); + } +} diff --git a/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift b/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift index 2e2dcde..c8837ac 100644 --- a/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift +++ b/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift @@ -237,10 +237,29 @@ final class CefWebSession: NSObject, FlutterTexture { pendingRequestedW = w pendingRequestedH = h pendingRequestedDpr = d - let blocked = resizeInFlight + var blocked = resizeInFlight // A dpr change (canvas-zoom crispness) needs a reallocation just like a size change. 
let same = (w == width && h == height && d == dpr) + // SUPERSEDE A WEDGED RESIZE: the resizeWatchdog no longer force-promotes a wrong-scale + // surface, so if a resize's size-matched present never lands (a GPU/establishment wedge), + // resizeInFlight would stay true FOREVER — and the `blocked` guard below would then drop + // EVERY later resize. The surface freezes at the old size while the tile keeps growing, so + // the old (small) surface is scaled up into the bigger tile → wrong-scale + clipped (the + // "4x" symptom). If the in-flight resize has been stuck past a grace window, abandon its + // pending surface and let this newer size go out instead of blocking on it forever. + let wedged = resizeInFlight && (nowNs() &- resizeSentAtNs) > 450_000_000 // 450ms grace + if wedged { + pendingBuffer = nil + pendingSurfaceId = 0 + resizeInFlight = false + blocked = false + } + let curW = width, curH = height, curD = dpr bufferLock.unlock() + if ProcessInfo.processInfo.environment["FLUTTER_CEF_DEBUG"] != nil { + NSLog("[cefdiag-rsz] bid=\(browserId) req=\(w)x\(h)@\(d) cur=\(curW)x\(curH)@\(curD) " + + "blocked=\(blocked) same=\(same) wedged=\(wedged)") + } // While a resize is still painting, just record the latest size (above). Its present sends // the next one (maybeSendNextResize); if cef_host drops that paint, the resizeWatchdog // re-kicks it. This one-in-flight pacing keeps the page reflowing at cef_host's actual rate diff --git a/packages/flutter_cef_macos/native/cef_host/main.mm b/packages/flutter_cef_macos/native/cef_host/main.mm index ef7301f..8570cd2 100644 --- a/packages/flutter_cef_macos/native/cef_host/main.mm +++ b/packages/flutter_cef_macos/native/cef_host/main.mm @@ -725,7 +725,12 @@ void CompositeMetalLocked(IOSurfaceRef view_src) { // classify a 9-point grid. 
content>0 means real pixels landed; white==9 means only the // opaque background (page didn't paint content); clear==9 means a zero-filled / never- // committed frame (the shared-GPU multiplex failure). Under FLUTTER_CEF_DEBUG only. - if (view_src && (slot_->diag_paint_count % 60) == 2 && + static const int kDiagEvery = []() { + const char* e = std::getenv("FLUTTER_CEF_DIAGPX_EVERY"); + int n = e ? atoi(e) : 60; + return n > 0 ? n : 60; // sample 1-in-N accelerated paints (default 60 ≈ 1/s; set 6 ≈ 10/s) + }(); + if (view_src && (slot_->diag_paint_count % kDiagEvery) == 2 && std::getenv("FLUTTER_CEF_DEBUG") && IOSurfaceLock(view_src, kIOSurfaceLockReadOnly, nullptr) == kIOReturnSuccess) { const auto* base = static_cast(IOSurfaceGetBaseAddress(view_src)); @@ -747,10 +752,16 @@ void CompositeMetalLocked(IOSurfaceRef view_src) { else content++; } IOSurfaceUnlock(view_src, kIOSurfaceLockReadOnly, nullptr); - char buf[160]; + // want = the requested OSR surface dims (logical × dpr). Comparing painted (srcWxsrcH) + // against want is the WRONG-SIZE oracle: painted << want past a grace = the small-surface- + // scaled-up "4x" bug. content==0 with want>0 = BLANK. Both are screen-independent. 
+ const int wantW = static_cast(slot_->width * slot_->dpr + 0.5); + const int wantH = static_cast(slot_->height * slot_->dpr + 0.5); + char buf[200]; snprintf(buf, sizeof(buf), - "diagpx wire=%u %dx%d content=%d white=%d clear=%d center=0x%08x", - slot_->browser_id, srcW, srcH, content, white, clear, center); + "diagpx wire=%u painted=%dx%d want=%dx%d content=%d white=%d clear=%d " + "center=0x%08x", + slot_->browser_id, srcW, srcH, wantW, wantH, content, white, clear, center); SendLog(slot_->browser_id, buf); } if (view_src && EnsureMetal()) { From 327bc466cbc81303e18fe6f1c17674529c4e6d12 Mon Sep 17 00:00:00 2001 From: wenkaifan0720 Date: Sat, 27 Jun 2026 08:10:47 -0700 Subject: [PATCH 08/13] =?UTF-8?q?fix(osr):=20always-latest=20promotion=20?= =?UTF-8?q?=E2=80=94=20drop=20the=20size-gate=20that=20froze=20static=20ti?= =?UTF-8?q?les=20stretched?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The size-gated promotion (promote the resized surface only when painted dims == round(logical× dpr) ±1) wedged STATIC pages: a static tile (e.g. a counter) paints exactly one frame per resize, and if that frame's dims were off by rounding — or the exact-match frame simply never re-arrived — the gate kept serving the OLD (small) surface forever. Result: stretch the tile wide → it shows the original-size image scaled up and FROZEN, while the page underneath is live (input/cursor still work). Reported from Campus on the Shared Counter tile. Fix = the unified model's core (prior art unanimous: webview_cef / cefclient / Ultralight / video swapchains): promote the pending surface as soon as cef_host paints INTO it (sid match), WITHOUT gating on exact composited dims. The surface is already the correct PHYSICAL size (we allocated it at logical×dpr); Flutter's Texture scales the content to the logical tile box, so a frame whose page-raster momentarily lags the new dpr shows briefly SOFT — never wrong-size, never frozen. 
Newest sid wins; convergence to crisp is driven by the begin-frame pump + the watchdog's opInvalidate re-kick (which now reliably promotes on the resulting paint), not a gate. Verified in the conformance harness (HARD: static pages, zoom→6, combo resize+zoom, recreate): all 18 live wires converge, 0 STUCK, 0 BLANK; worst transient = one dpr step (0.67), recovers. This is migration Step 2 (always-latest consumer transport). The resize supersede + per-resize watchdog remain as backstops; a later step can fold them into one convergence watchdog. --- .../macos/Classes/CefWebSession.swift | 68 +++++++++---------- 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift b/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift index c8837ac..8fe8338 100644 --- a/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift +++ b/packages/flutter_cef_macos/macos/Classes/CefWebSession.swift @@ -312,27 +312,23 @@ final class CefWebSession: NSObject, FlutterTexture { } /// Re-kick a wedged resize. Bails immediately if a newer resize has gone out (gen advanced) - /// or this one already promoted (not in flight). Otherwise the post-resize present never - /// matched — nudge cef_host to repaint the pending surface (opInvalidate), retrying every - /// ~80ms. After ~0.3s of failed re-kicks, FORCE-promote the pending surface: cef_host's - /// begin-frame pump has been painting into it the whole time, so it holds the correct new-size - /// content — a single dropped/mis-tagged present (the failure mode on a STATIC page like - /// flutter.dev, which produces exactly one frame per resize) can't leave the tile wedged. - /// Main-thread only, so sendFrame / textureFrameAvailable stay serialized. + /// or this one already promoted (not in flight). Otherwise cef_host hasn't yet painted into + /// the new (pending) surface — nudge it to repaint (opInvalidate), retrying every ~80ms. 
+ /// Under ALWAYS-LATEST promotion the resulting paint into the pending surface promotes it on + /// sid match (handleFrame), so this re-kick is what rescues a STATIC page (one frame per + /// resize) whose single post-resize frame was dropped. No force-promote needed: we never have + /// to guess: a real paint into the new surface always promotes it. Main-thread only. private func resizeWatchdog(_ gen: UInt64) { bufferLock.lock() let isHidden = hidden let active = ResizeWatchdogPolicy.shouldKeepWaiting( inFlight: resizeInFlight, gen: gen, currentGen: resizeGen) bufferLock.unlock() - // The SIZE-GATED promotion in handleFrame is now the ONLY promoter: it refuses a present - // whose composited dims don't match the new surface, so the watchdog must NOT force-promote - // — that would show the renderer's pre-re-raster WRONG-SCALE frame (too big/small) or a - // blank surface (the old behavior + F-4 hidden-guard are superseded by the size gate). If a - // correct frame already landed, handleFrame cleared resizeInFlight and `active` is false → - // stop. Otherwise re-kick a possibly-dropped frame (the 16ms begin-frame pump also drives - // the re-raster); F-6 liveness recovers a genuinely wedged tile. The texture meanwhile keeps - // the last correct-scale buffer (geometrically right, momentarily softer) — never wrong. + // If a paint already landed in the pending surface, handleFrame promoted it + cleared + // resizeInFlight → `active` is false → stop. Otherwise re-kick (opInvalidate forces cef_host + // to repaint the pending surface; the 16ms begin-frame pump also drives it), and the next + // present promotes via sid match. The texture meanwhile keeps serving the last good surface + // scaled to the tile (momentarily soft if the box grew) — never blank, never frozen-wrong. 
guard active else { return } // While hidden the pump is gated off, so opInvalidate can't paint — skip the nudge but keep // the watchdog alive; the native un-hide repaint (F-1) drives a real present that promotes. @@ -613,37 +609,35 @@ final class CefWebSession: NSObject, FlutterTexture { // blank new one. A present for the old/current surface just advances the frame. var promotedSid: UInt32 = 0 var promotedW = 0, promotedH = 0 - if payload.count >= 12 { + if payload.count >= 4 { let psid = (UInt32(payload[0]) << 24) | (UInt32(payload[1]) << 16) | (UInt32(payload[2]) << 8) | UInt32(payload[3]) - let srcW = Int((UInt32(payload[4]) << 24) | (UInt32(payload[5]) << 16) - | (UInt32(payload[6]) << 8) | UInt32(payload[7])) - let srcH = Int((UInt32(payload[8]) << 24) | (UInt32(payload[9]) << 16) - | (UInt32(payload[10]) << 8) | UInt32(payload[11])) - // SIZE-GATED PROMOTION: only promote the pending (resized) surface when the present's - // COMPOSITED frame dims match the new surface (round(logical*dpr)). On a device-scale - // (zoom) resize the host swaps to the new surface synchronously while the renderer - // re-rasters async, so the FIRST present after the resize carries the renderer's - // OLD-scale frame in the new surface — promoting it renders too big/small (and can - // freeze there). Gating on dims keeps Flutter sampling the last correct-scale buffer - // (geometrically right, momentarily softer) until the re-rastered frame lands. - let expW = Int((Double(width) * dpr).rounded()) - let expH = Int((Double(height) * dpr).rounded()) - let scaleOk = abs(srcW - expW) <= 1 && abs(srcH - expH) <= 1 - // DIAG: while a resize is pending, log every present's actual composited dims vs the - // expected new-surface dims, so a soak test can see whether the size-match ever - // succeeds (if `view_src` is pool-sized, src never equals exp and the resize sticks). + let srcW = payload.count >= 12 ? 
Int((UInt32(payload[4]) << 24) | (UInt32(payload[5]) << 16) + | (UInt32(payload[6]) << 8) | UInt32(payload[7])) : 0 + let srcH = payload.count >= 12 ? Int((UInt32(payload[8]) << 24) | (UInt32(payload[9]) << 16) + | (UInt32(payload[10]) << 8) | UInt32(payload[11])) : 0 + // ALWAYS-LATEST PROMOTION (the unified model — see prior art: webview_cef / cefclient / + // Ultralight / video swapchains all do this): promote the pending (resized) surface as + // soon as cef_host paints INTO it (sid match), WITHOUT gating on exact composited dims. + // The surface is already the correct PHYSICAL size (we allocated it at logical×dpr); the + // content within it is scaled to the logical tile box by Flutter's Texture, so a frame + // whose page-raster momentarily lags the new dpr shows briefly SOFT — never wrong-size, + // never stuck. The old SIZE-GATED rule refused promotion until painted dims == round( + // logical×dpr) exactly, which for a STATIC page (one frame per resize, dims off by + // rounding, or the exact-match frame never re-arriving) kept serving the OLD small + // surface forever → stretched-and-frozen (live input, dead pixels). Newest sid wins; + // convergence to crisp is driven by the begin-frame pump, not a gate. 
if pendingBuffer != nil, ProcessInfo.processInfo.environment["FLUTTER_CEF_DEBUG"] != nil { - NSLog("[cefdiag-resize] bid=\(browserId) src=\(srcW)x\(srcH) exp=\(expW)x\(expH) " - + "match=\(scaleOk) logical=\(width)x\(height) dpr=\(dpr) " + NSLog("[cefdiag-resize] bid=\(browserId) src=\(srcW)x\(srcH) " + + "logical=\(width)x\(height) dpr=\(dpr) " + "psid=\(psid) pendSid=\(pendingSurfaceId) sidMatch=\(psid == pendingSurfaceId)") } - if let pending = pendingBuffer, psid != 0, psid == pendingSurfaceId, scaleOk { + if let pending = pendingBuffer, psid != 0, psid == pendingSurfaceId { pixelBuffer = pending pendingBuffer = nil pendingSurfaceId = 0 - resizeInFlight = false // its CORRECT-SCALE paint landed; free to send the next size + resizeInFlight = false // its paint landed; free to send the next size promotedSid = psid promotedW = width promotedH = height From 580a8529844d8b057230623ba7cac95be785c656 Mon Sep 17 00:00:00 2001 From: wenkaifan0720 Date: Sat, 27 Jun 2026 09:20:05 -0700 Subject: [PATCH 09/13] =?UTF-8?q?fix(osr):=20don't=20recreate-loop=20stati?= =?UTF-8?q?c=20idle=20tiles=20=E2=80=94=20steady-state=20liveness=20is=20n?= =?UTF-8?q?udge-only?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The steady-state liveness watchdog (F-6) escalated a painted-but-idle browser to onPaintStalled → consumer recreate. A STATIC page (counter, status panel, finished form) legitimately stops producing frames once idle, and the nudge (opInvalidate) can't extract a new frame from a page with nothing to repaint — so every static tile got flagged "painted then wedged" and recreated on a ~10s loop (observed in Campus: 36 stalls / 33 browsers in one session → constant flicker on idle tiles). A converged, idle tile is HEALTHY — it is showing correct content. 
Fix: the sweep still NUDGES an idle established tile once (which repairs a genuinely evicted/blank VISIBLE surface — real damage produces a present), but NO LONGER escalates to onPaintStalled when the nudge yields no frame. No-frames-after-nudge on an established tile = static-idle = accept as healthy, keep serving its last good frame. Never-painted tiles are still owned by the separate first-paint watchdog (firstPresentPending); genuine renderer death by OnRenderProcessTerminated; eviction-while-hidden by the F-1 un-hide repaint. This is the research's "liveness keys on displayed==desired, not frame-flow" applied minimally. Verified in the conformance harness (HARD static pages + 16s idle holds): "painted then wedged" = 0, "accepting as healthy-static" = 90, no liveness-driven recreate (the browser-id climb in the run is the harness's own recreateStorm phase, not the watchdog). --- example/lib/conformance_harness.dart | 6 ++++- .../macos/Classes/CefProfileHost.swift | 24 ++++++++++++++----- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/example/lib/conformance_harness.dart b/example/lib/conformance_harness.dart index c713492..7d0e024 100644 --- a/example/lib/conformance_harness.dart +++ b/example/lib/conformance_harness.dart @@ -116,7 +116,11 @@ class _HarnessAppState extends State { ]; final next = order[(order.indexOf(_phase) + 1) % order.length]; _setPhase(next); - Future.delayed(const Duration(seconds: 6), _cyclePhases); + // Hold IDLE long enough (>13s = liveness staleness 10s + nudge grace 3s) to exercise the + // steady-state liveness: a static idle tile must NOT be flagged "painted then wedged" / + // recreated. Storms get 6s. + final hold = next == Phase.idle ? 
16 : 6; + Future.delayed(Duration(seconds: hold), _cyclePhases); } void _setPhase(Phase p) { diff --git a/packages/flutter_cef_macos/macos/Classes/CefProfileHost.swift b/packages/flutter_cef_macos/macos/Classes/CefProfileHost.swift index c8e9228..9114ef3 100644 --- a/packages/flutter_cef_macos/macos/Classes/CefProfileHost.swift +++ b/packages/flutter_cef_macos/macos/Classes/CefProfileHost.swift @@ -809,15 +809,27 @@ final class CefProfileHost { case .healthy: break case .nudge: - // Discriminate: a healthy idle page repaints (clearing the nudge on the present); - // a wedged one stays blank. + // A browser that has PAINTED but produced no frame for a while: nudge it once with a + // full-view repaint. A genuinely wedged/evicted VISIBLE surface has real damage to + // repair, so this produces a present (recovered → healthy next cycle). A STATIC idle + // page (a counter, a finished form) has nothing new to paint, so it produces NO present + // — and that is HEALTHY, not wedged (it is showing correct content; the begin-frame + // pump simply has nothing to draw). See .declareStalled. send(c.bid, Self.opInvalidate, []) browsersLock.lock(); browsers[c.bid]?.livenessNudgedAt = now; browsersLock.unlock() case .declareStalled: - NSLog("[cef] profile '\(profileId)': browser \(c.bid) painted then wedged — reporting paintStalled (consumer may recreate)") - onPaintStalled?(c.bid) - // Re-discriminate next cycle; the consumer's recover() is bounded (kMaxCefRecreate). - browsersLock.lock(); browsers[c.bid]?.livenessNudgedAt = 0; browsersLock.unlock() + // The nudge above did NOT extract a frame. For an ESTABLISHED (already-painted) tile + // this means STATIC-IDLE, not wedged — escalating to onPaintStalled here recreate- + // looped every static tile (counter / status / checklist): paintStalled → recover → + // paint → idle → paintStalled, ~every 10s, forever (observed: 36 stalls / 33 browsers + // in one session). 
A converged idle tile is healthy by definition; we keep serving its + // last good frame. Do NOT recreate. (Never-painted tiles are owned by the separate + // first-paint watchdog via firstPresentPending; genuine renderer death is caught by + // OnRenderProcessTerminated; eviction-while-hidden by the F-1 un-hide repaint.) Leave + // nudgedAt set so we don't re-nudge every cycle; a real future repaint clears it. + if ProcessInfo.processInfo.environment["FLUTTER_CEF_DEBUG"] != nil { + NSLog("[cef] profile '\(profileId)': browser \(c.bid) idle (no frames) — accepting as healthy-static (not recreating)") + } } } } From dfefee0597517005c83742eb6b35aba7a56dee2c Mon Sep 17 00:00:00 2001 From: wenkaifan0720 Date: Sat, 27 Jun 2026 10:05:47 -0700 Subject: [PATCH 10/13] =?UTF-8?q?refactor(osr):=20producer-allocates=20sur?= =?UTF-8?q?face=20model=20=E2=80=94=20kill=20the=20crop/stretch/stale=20cl?= =?UTF-8?q?ass=20at=20the=20root?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit THE durable fix for the crop/stretch oscillation (the user chose this over the band-aid). ROOT CAUSE: the CONSUMER allocated the IOSurface and cef_host blitted CEF's painted view_src into it with a min(src,dst) top-left copy. When the consumer-allocated dst and cef_host's painted src disagreed in size (rounding / timing / dpr-zoom), the blit CROPPED (src>dst) or left STALE margins (src 450_000_000 // 450ms grace if wedged { - pendingBuffer = nil - pendingSurfaceId = 0 resizeInFlight = false blocked = false } @@ -270,26 +262,15 @@ final class CefWebSession: NSObject, FlutterTexture { sendResize(w, h, d) } - /// Allocate the new surface, point cef_host at it, and send the resize — marking it - /// in-flight so the next size waits for this one's present (see resize()/maybeSendNextResize). - /// Only ever called on the main thread (resize / maybeSendNextResize), so sendFrame stays - /// serialized. 
+ /// PRODUCER-ALLOCATES: send the DESIRED geometry only — cef_host re-rasters, mints a new + /// surface sized to its own paint, and presents the new id, which handleFrame(opPresent) + /// adopts. No consumer-side IOSurface allocation, no pending-buffer handoff (the producer + /// presents a sid only once it's painted, so adopt-on-change is always non-blank). We keep + /// serving the current pixelBuffer (the old surface, held alive by its CVPixelBuffer) until + /// the new present adopts — same no-flash guarantee, sourced from the producer. + /// Main-thread only, so sendFrame stays serialized. private func sendResize(_ w: Int, _ h: Int, _ d: CGFloat) { - // Create the new surface OUTSIDE the lock (expensive) at the requested density. H4: - // publish surface id + new dims ATOMICALLY in one bufferLock section so a concurrent - // host read (createSnapshot on the reader thread) can't see new dims with the old id. - guard let (surf, buffer) = makeBuffers(w, h, d) else { return } - let sid = IOSurfaceGetID(surf) - guard sid != 0 else { return } - // Resize-flash fix: point the host at the NEW surface (ioSurface drives surfaceId / - // createSnapshot → cef_host paints into it) and adopt the new dims, but DON'T swap - // the live `pixelBuffer` — keep serving the OLD surface to Flutter (the old - // CVPixelBuffer retains its IOSurface, so it stays valid) until cef_host paints the - // new one. The new buffer is promoted in handleFrame(opPresent) on the matching present. bufferLock.lock() - ioSurface = surf - pendingBuffer = buffer - pendingSurfaceId = sid width = w height = h dpr = d @@ -301,11 +282,9 @@ final class CefWebSession: NSObject, FlutterTexture { var payload = [UInt8]() appendU32(&payload, UInt32(w)) appendU32(&payload, UInt32(h)) - appendU32(&payload, sid) appendF64(&payload, Double(d)) // cef_host updates slot->dpr → re-renders at new density sendFrame(Self.opResize, payload) - // Re-kick this resize if its present never lands (see resizeWatchdog). 
During a smoothly - // advancing drag gen keeps moving and this no-ops; it only bites a genuine wedge. + // Re-kick if cef_host hasn't produced a new-size paint (and thus a new present to adopt). DispatchQueue.main.asyncAfter(deadline: .now() + 0.08) { [weak self] in self?.resizeWatchdog(gen) } @@ -484,10 +463,7 @@ final class CefWebSession: NSObject, FlutterTexture { bufferLock.lock() let tid = textureId textureId = 0 - pixelBuffer = nil - ioSurface = nil - pendingBuffer = nil // drop any un-promoted resized surface - pendingSurfaceId = 0 + pixelBuffer = nil // drops the consumer's ref on the producer-owned surface (CVPixelBuffer deinit) resizeInFlight = false pendingRequestedW = 0 pendingRequestedH = 0 @@ -497,35 +473,16 @@ final class CefWebSession: NSObject, FlutterTexture { // MARK: Buffers - /// H4: CREATE an IOSurface + CVPixelBuffer for (w,h) but do NOT publish them — the - /// caller publishes surface + geometry atomically via publishBuffers so a concurrent - /// createSnapshot()/copyPixelBuffer never sees a surface and dims out of sync. - private func makeBuffers(_ w: Int, _ h: Int, _ scale: CGFloat) -> (IOSurfaceRef, CVPixelBuffer)? { - // Allocate at PHYSICAL (Retina) resolution = logical * dpr, so the texture - // is crisp on HiDPI displays; cef_host renders the OSR buffer at the same - // scale (via GetScreenInfo.device_scale_factor). 64-byte-aligned stride keeps - // the IOSurface Metal/CVPixelBuffer-compatible. `scale` is passed (not read from - // self.dpr) so a resize that changes dpr allocates at the NEW density. Clamp to the - // same ceiling cef_host enforces (dpr<=8): the shipped widget already clamps, but the - // public CefWebController.resize(dpr:) does not, and an unclamped dpr is an O(dpr^2) - // allocation AND would desync the host scale (host caps at 8, surface wouldn't). 
- let s = min(max(Double(scale), 0.5), 8.0) - let pw = max(1, Int((Double(w) * s).rounded())) - let ph = max(1, Int((Double(h) * s).rounded())) - let bytesPerRow = ((pw * 4) + 63) & ~63 - let props: [CFString: Any] = [ - kIOSurfaceWidth: pw, - kIOSurfaceHeight: ph, - kIOSurfaceBytesPerElement: 4, - kIOSurfaceBytesPerRow: bytesPerRow, - kIOSurfaceAllocSize: bytesPerRow * ph, - kIOSurfacePixelFormat: kCVPixelFormatType_32BGRA, - "IOSurfaceIsGlobal" as CFString: true, // resolvable cross-process by id - ] - guard let surf = IOSurfaceCreate(props as CFDictionary) else { - NSLog("[cef] IOSurfaceCreate failed \(w)x\(h)") - return nil - } + /// PRODUCER-ALLOCATES: wrap a producer-owned IOSurface (looked up by the id cef_host sent in + /// a present) in a CVPixelBuffer for Flutter. cef_host created the surface IOSurfaceIsGlobal, + /// so IOSurfaceLookup resolves it cross-process. IOSurfaceLookup is CF_RETURNS_RETAINED, so + /// Swift manages that +1 and releases it when `surf` leaves scope; the CVPixelBuffer takes its + /// OWN retain on the surface for as long as Flutter may sample it — so the surface lives until + /// this CVPixelBuffer is overwritten/niled (consumer ref) AND cef_host has released its ref. + /// Returns nil if the id no longer resolves (producer freed it racing a close) — the caller + /// keeps the current buffer and retries on the next present. Called under bufferLock. + private func adoptSurfaceLocked(_ sid: UInt32) -> CVPixelBuffer? { + guard let surf = IOSurfaceLookup(sid) else { return nil } var pbOut: Unmanaged? 
let attrs: [CFString: Any] = [ kCVPixelBufferMetalCompatibilityKey: true, @@ -533,31 +490,11 @@ final class CefWebSession: NSObject, FlutterTexture { ] let rc = CVPixelBufferCreateWithIOSurface( kCFAllocatorDefault, surf, attrs as CFDictionary, &pbOut) - guard rc == kCVReturnSuccess, let buffer = pbOut?.takeRetainedValue() else { - NSLog("[cef] CVPixelBufferCreateWithIOSurface failed rc=\(rc)") + guard rc == kCVReturnSuccess, let pb = pbOut?.takeRetainedValue() else { + NSLog("[cef] adoptSurface: CVPixelBufferCreateWithIOSurface failed rc=\(rc) sid=\(sid)") return nil } - // NOTE: no success log here — makeBuffers runs once PER resize step (~60/s during a drag), - // and a synchronous NSLog on that hot path measurably hurts resize smoothness. - return (surf, buffer) - } - - /// H4: publish a new (surface, buffer, width, height) as ONE atomic update, so a - /// concurrent createSnapshot()/copyPixelBuffer never observes the new surface with - /// the old dims (or vice-versa). Returns the new surface id. The old IOSurface/ - /// CVPixelBuffer are released by the overwrite. - @discardableResult - private func publishBuffers(_ surf: IOSurfaceRef, _ buffer: CVPixelBuffer, - _ w: Int, _ h: Int) -> UInt32 { - bufferLock.lock() - ioSurface = surf - pixelBuffer = buffer - width = w - height = h - let sid = IOSurfaceGetID(surf) - bufferLock.unlock() - notifySurface(sid, w, h) - return sid + return pb } /// WebRTC frame export: notify any consumer that the live surface (re)allocated, so it @@ -573,23 +510,23 @@ final class CefWebSession: NSObject, FlutterTexture { Int((Double(logicalH) * s).rounded())) } - /// Re-emit the current live surface to a just-attached onSurface consumer. The init - /// publish fires before the plugin wires onSurface, so the plugin calls this right - /// after assigning the callback to deliver the initial surface. + /// Re-emit the current live surface to a just-attached onSurface (WebRTC) consumer. 
With + /// producer-allocates the live surface is whatever backs the current pixelBuffer (nil until + /// the first present adopts one), so read it from there. func emitCurrentSurface() { bufferLock.lock() - let surf = ioSurface + let sid = pixelBuffer.flatMap { CVPixelBufferGetIOSurface($0) } + .map { IOSurfaceGetID($0.takeUnretainedValue()) } ?? 0 let w = width, h = height bufferLock.unlock() - if let surf = surf { notifySurface(IOSurfaceGetID(surf), w, h) } + if sid != 0 { notifySurface(sid, w, h) } } - /// H4: read (w, h, dpr, surfaceId) as ONE consistent tuple under a single bufferLock - /// acquisition — the host builds opCreateBrowser from this so its payload can't - /// capture a torn mix of stale dims + a freshly-reallocated surface id. - func createSnapshot() -> (w: Int, h: Int, dpr: CGFloat, sid: UInt32) { + /// Read (w, h, dpr) as ONE consistent tuple under a single bufferLock acquisition — the host + /// builds opCreateBrowser from this. Producer-allocates: no surface id (cef_host mints its own). + func createSnapshot() -> (w: Int, h: Int, dpr: CGFloat) { bufferLock.lock(); defer { bufferLock.unlock() } - return (width, height, dpr, ioSurface.map { IOSurfaceGetID($0) } ?? 0) + return (width, height, dpr) } // MARK: Inbound frames @@ -603,10 +540,15 @@ final class CefWebSession: NSObject, FlutterTexture { // Read textureId under bufferLock — dispose() writes it under the same // lock on the main thread, so this avoids a data race on the Int64. bufferLock.lock() - // Resize-flash fix: the present is tagged with the surface id cef_host painted - // (BE u32). If it's our pending (resized) surface, promote it to live now — we - // kept serving the old surface until this exact frame so Flutter never sampled the - // blank new one. A present for the old/current surface just advances the frame. + // PRODUCER-ALLOCATES ADOPT: the present names the PRODUCER-OWNED surface id cef_host just + // painted (+ its physical dims). 
The producer mints a new surface (new id) whenever it + // re-rasters at a new size/dpr, so a present whose id differs from the one currently + // backing our pixelBuffer means "adopt the new surface." We IOSurfaceLookup it (cross- + // process, resolvable because cef_host created it IOSurfaceIsGlobal) and wrap it in a + // CVPixelBuffer. src==dst by construction (cef_host sized the surface to its own paint), + // so there is no crop/stretch/stale class anymore. We keep serving the current pixelBuffer + // until the new one is wrapped (no flash), and the producer only ever presents a sid after + // it has painted that surface (never blank). Newest id wins. var promotedSid: UInt32 = 0 var promotedW = 0, promotedH = 0 if payload.count >= 4 { @@ -616,31 +558,23 @@ final class CefWebSession: NSObject, FlutterTexture { | (UInt32(payload[6]) << 8) | UInt32(payload[7])) : 0 let srcH = payload.count >= 12 ? Int((UInt32(payload[8]) << 24) | (UInt32(payload[9]) << 16) | (UInt32(payload[10]) << 8) | UInt32(payload[11])) : 0 - // ALWAYS-LATEST PROMOTION (the unified model — see prior art: webview_cef / cefclient / - // Ultralight / video swapchains all do this): promote the pending (resized) surface as - // soon as cef_host paints INTO it (sid match), WITHOUT gating on exact composited dims. - // The surface is already the correct PHYSICAL size (we allocated it at logical×dpr); the - // content within it is scaled to the logical tile box by Flutter's Texture, so a frame - // whose page-raster momentarily lags the new dpr shows briefly SOFT — never wrong-size, - // never stuck. The old SIZE-GATED rule refused promotion until painted dims == round( - // logical×dpr) exactly, which for a STATIC page (one frame per resize, dims off by - // rounding, or the exact-match frame never re-arriving) kept serving the OLD small - // surface forever → stretched-and-frozen (live input, dead pixels). Newest sid wins; - // convergence to crisp is driven by the begin-frame pump, not a gate. 
- if pendingBuffer != nil, - ProcessInfo.processInfo.environment["FLUTTER_CEF_DEBUG"] != nil { - NSLog("[cefdiag-resize] bid=\(browserId) src=\(srcW)x\(srcH) " - + "logical=\(width)x\(height) dpr=\(dpr) " - + "psid=\(psid) pendSid=\(pendingSurfaceId) sidMatch=\(psid == pendingSurfaceId)") - } - if let pending = pendingBuffer, psid != 0, psid == pendingSurfaceId { - pixelBuffer = pending - pendingBuffer = nil - pendingSurfaceId = 0 - resizeInFlight = false // its paint landed; free to send the next size - promotedSid = psid - promotedW = width - promotedH = height + // Dedupe against the surface ACTUALLY backing the live pixelBuffer (not a cached var), + // so a stable producer sid Lookups exactly ZERO times after the first adopt. + let curSid = pixelBuffer.flatMap { CVPixelBufferGetIOSurface($0) } + .map { IOSurfaceGetID($0.takeUnretainedValue()) } ?? 0 + if psid != 0, psid != curSid { + if let pb = adoptSurfaceLocked(psid) { + pixelBuffer = pb + resizeInFlight = false // a new-size paint landed and we adopted it + promotedSid = psid + promotedW = srcW > 0 ? srcW : Int((Double(width) * dpr).rounded()) + promotedH = srcH > 0 ? srcH : Int((Double(height) * dpr).rounded()) + if ProcessInfo.processInfo.environment["FLUTTER_CEF_DEBUG"] != nil { + NSLog("[cefdiag-resize] bid=\(browserId) ADOPT psid=\(psid) src=\(srcW)x\(srcH) " + + "logical=\(width)x\(height) dpr=\(dpr)") + } + } + // adopt failed (Lookup nil — producer raced a free): keep current buffer, next present retries. 
} } let tid = textureId diff --git a/packages/flutter_cef_macos/native/cef_host/main.mm b/packages/flutter_cef_macos/native/cef_host/main.mm index 8570cd2..3eb0e57 100644 --- a/packages/flutter_cef_macos/native/cef_host/main.mm +++ b/packages/flutter_cef_macos/native/cef_host/main.mm @@ -121,9 +121,9 @@ constexpr uint8_t kOpCreated = 0x1c; // {} H3: OnAfterCreated — browser is up; host's pacer sends the next create constexpr uint8_t kOpCreateFailed = 0x1d; // {} H7: async CreateBrowser dispatch failed; host drops the session constexpr uint8_t kOpPointer = 0x10; -constexpr uint8_t kOpResize = 0x11; +constexpr uint8_t kOpResize = 0x11; // {u32 w}{u32 h}{f64 dpr} — producer-allocates: no sid constexpr uint8_t kOpKey = 0x12; -constexpr uint8_t kOpCreateBrowser = 0x13; // {u32 w}{u32 h}{f64 dpr}{u32 iosurfaceId}{utf8 url}; frame browserId = NEW id +constexpr uint8_t kOpCreateBrowser = 0x13; // {u32 w}{u32 h}{f64 dpr}{utf8 url}; producer-allocates (no sid); frame browserId = NEW id constexpr uint8_t kOpShutdown = 0x14; // {} tear down the whole PROCESS (all browsers); frame browserId 0 constexpr uint8_t kOpDisposeBrowser = 0x15; // {} close ONE browser (target = frame browserId); process survives constexpr uint8_t kOpNavigate = 0x20; @@ -181,7 +181,12 @@ // Guards surface / width / height / dpr / popup_* for THIS browser. Per-slot // (not a single global) so paints on independent browsers don't contend. std::mutex surface_mutex; - IOSurfaceRef surface = nullptr; // host-shared IOSurface we paint into + IOSurfaceRef surface = nullptr; // host-OWNED IOSurface we paint into (producer-allocates; + // minted lazily in EnsureSurfaceForPaint on the first paint) + // Set under surface_mutex in OnBeforeClose BEFORE nulling `surface`, so a paint racing teardown + // (EnsureSurfaceForPaint) doesn't re-mint a surface for a closing browser (which would leak — + // OnBeforeClose already released the last one). Producer-allocates lifetime guard. 
+ bool closing = false; // Cached Metal wrap of `surface` for the GPU-blit DEST. Wrapping it fresh every // frame is pure churn (the surface is stable except on resize), so cache it and // recreate only when the wrapped IOSurface id changes. Released wherever `surface` @@ -581,6 +586,10 @@ void SendPresentLocked(int srcW, int srcH) { void OnPaint(CefRefPtr, PaintElementType type, const RectList&, const void* buffer, int width, int height) override { std::lock_guard lock(slot_->surface_mutex); + // PRODUCER-ALLOCATES (software path): mint/resize the surface to the painted VIEW dims + // before the guard, mirroring OnAcceleratedPaint — else the surface is never created and + // nothing paints. A POPUP paint composites onto the existing view surface (don't resize). + if (type == PET_VIEW) EnsureSurfaceForPaint(width, height); if (!slot_->surface) return; if (IOSurfaceLock(slot_->surface, 0, nullptr) != kIOReturnSuccess) return; uint8_t* dst = static_cast(IOSurfaceGetBaseAddress(slot_->surface)); @@ -656,11 +665,55 @@ void CopyAccelToPopupBuf(IOSurfaceRef src) { IOSurfaceUnlock(src, kIOSurfaceLockReadOnly, nullptr); } + // PRODUCER-ALLOCATES: ensure slot_->surface is EXACTLY sw x sh — the dims CEF actually + // painted (view_src). cef_host owns this surface (it mints it, the consumer adopts it by id + // from the present). Because the blit dst is then the same size as the src, the copy is 1:1 + // and can NEVER crop (src>dst) or leave stale margins (src<dst). Caller must hold slot_->surface_mutex. + // Releases cef_host's ref on the OLD surface immediately; the consumer's CVPixelBuffer keeps + // the old one alive (independent refcount) until it adopts the new id, so no UAF.
+ void EnsureSurfaceForPaint(int sw, int sh) { + if (sw < 1 || sh < 1) return; // popup-only repaint (view_src null) keeps the current surface + if (slot_->closing) return; // a paint racing teardown must not re-mint a surface (leak) + IOSurfaceRef cur = slot_->surface; + if (cur && static_cast(IOSurfaceGetWidth(cur)) == sw && + static_cast(IOSurfaceGetHeight(cur)) == sh) { + return; // already the right size — the common steady-state path, zero allocation + } + const size_t bpr = ((static_cast(sw) * 4) + 63) & ~static_cast(63); + NSDictionary* props = @{ + (id)kIOSurfaceWidth : @(sw), + (id)kIOSurfaceHeight : @(sh), + (id)kIOSurfaceBytesPerElement : @(4), + (id)kIOSurfaceBytesPerRow : @(static_cast(bpr)), + (id)kIOSurfaceAllocSize : @(static_cast(bpr * sh)), + (id)kIOSurfacePixelFormat : @(0x42475241), // 'BGRA' = kCVPixelFormatType_32BGRA + @"IOSurfaceIsGlobal" : @YES, // resolvable cross-process by id (consumer Lookups it) + }; + IOSurfaceRef fresh = IOSurfaceCreate((__bridge CFDictionaryRef)props); + if (!fresh) { + SendLog(slot_->browser_id, "EnsureSurfaceForPaint: IOSurfaceCreate failed"); + return; // keep the old surface; next paint retries + } + slot_->surface = fresh; // cef_host's +1 (mint, not Lookup) + [slot_->dst_mtl release]; + slot_->dst_mtl = nil; + slot_->dst_mtl_sid = 0; // the cached Metal wrap pointed at `cur`; rebuilt this same paint + if (cur) CFRelease(cur); // drop cef_host's +1 on the old; consumer's CVPixelBuffer still holds it + } + // Software-composite the view (optional GPU surface, stride-aware) and the open // popup into the host-allocated slot_->surface and present it. Used only while // a