diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 31b43ce..4184d02 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -576,17 +576,17 @@ jobs: run: python scripts/repro/const_cse_differential.py frame-slot-dce-242-oracle: - name: frame-slot DCE flag-on execution oracle - # VCR-RA frame-slot DCE (#242): EXECUTE flat_flight compiled with - # SYNTH_STACK_FWD=1 (stack-reload forwarding + dead-frame-store elimination) + name: frame-slot DCE default+optout execution oracle + # VCR-RA frame-slot DCE (#242): EXECUTE flat_flight (stack-reload forwarding + + # dead-frame-store elimination, DEFAULT-ON since the #242 feature-loop flip) # under unicorn (UC_ARCH_ARM / Thumb) and diff flight_algo's return value vs - # wasmtime across several sensor inputs, with linear memory seeded exactly as - # wasmtime's. The two paired passes turn frame reloads into register moves and - # remove the now-dead stores (flight_algo sp-traffic 20→7, 139→135 insns). - # The flag ships DEFAULT-OFF (off ⇒ byte-identical; the job also asserts the - # flag-off .text is byte-identical across builds), so nothing else exercises - # this optimized-path transform. Isolated job: emulation deps pip-installed - # here ONLY. + # wasmtime across several sensor inputs, in BOTH the shipped default AND the + # SYNTH_NO_STACK_FWD=1 opt-out (a default flip is only safe if the shipped path + # AND its rollback both match), with linear memory seeded exactly as wasmtime's. + # Also asserts the two configs emit DIFFERENT bytes (the flip is engaged). The + # paired passes turn frame reloads into register moves and remove the now-dead + # stores (flight_algo sp-traffic 20→7, 139→135 insns). Isolated job: emulation + # deps pip-installed here ONLY. runs-on: ubuntu-latest steps: - uses: actions/checkout@v7 diff --git a/artifacts/verified-codegen-roadmap.yaml b/artifacts/verified-codegen-roadmap.yaml index 943fd86..4d35383 100644 --- a/artifacts/verified-codegen-roadmap.yaml +++ b/artifacts/verified-codegen-roadmap.yaml @@ -214,6 +214,31 @@ artifacts: differential (resolve the internal BL, run flag-on vs wasmtime) as the gate, (c) re-freeze + G474RE silicon. This pass landed the frozen-safe fixture + finding; flag-off bit-identical, no codegen change. + + STACK-FWD + FRAME-SLOT-DCE FLIP TO DEFAULT-ON (#242 feature loop): the two + paired lowering passes — `forward_stack_reloads` (a `local.set; local.get` + reload `ldr rY,[sp,#N]` whose value still lives in `rX` becomes `mov rY,rX`) + and `eliminate_dead_frame_stores` (the now-dead `str rX,[sp,#N]` whose slot is + overwritten-before-read is removed) — shipped flag-off in PR #514/#515 and are + now DEFAULT-ON. Escape hatch `SYNTH_NO_STACK_FWD=1` restores the frame-resident + bytes. Gated exactly like the cmp→select (v0.13.0) and local-promotion + (v0.14.0) flips: RESULTS re-verified bit-identical on every frozen anchor + (control_step 0x00210A55 13/13; flat AND inlined flight_algo 0x07FDF307 — + flight_seam_differential.py MATCH both) while .text shrank on the SHIPPED + --relocatable path (flight_seam 774→738, flight_seam_flat 910→878; control_step + unchanged — no spurious slot reuse; signed_div_const + all RV32 unchanged, + ARM-only). Frozen goldens re-frozen + an escape-hatch gate asserts the opt-out + restores the pre-flip bytes. SOUNDNESS: overwrite-only DCE (dead only when a + later store to the same immediate slot overwrites it with no intervening read; + reaching the function end does NOT count), sub-word sp accesses (ldrb/ldrh/…) + as blockers (advisor-caught #483-class hole, test verified failing pre-fix); 8 + unit tests pin the boundaries. Instruction/memory proxy: flight_algo sp-traffic + 20→7, 139→135 insns — the measured CYCLE number is gale's G474RE (post-ship + confirmation, the cmp→select silicon-gate-waiver precedent). Broad oracle: + `cargo test --workspace` green under the new default (wast/spec suite is + compile-only, so gale silicon is the broad-execution check). NOT flipped: + const-CSE (SYNTH_CONST_CSE) stays flag-off — its alias-eviction prerequisite is + open and it is inert on flat_flight. status: implemented tags: [codegen, register-allocation, ssa, regalloc2, track-a, release-v0.11.40] links: diff --git a/crates/synth-backend/src/arm_backend.rs b/crates/synth-backend/src/arm_backend.rs index 2334660..904fb19 100644 --- a/crates/synth-backend/src/arm_backend.rs +++ b/crates/synth-backend/src/arm_backend.rs @@ -567,11 +567,15 @@ fn compile_wasm_to_arm( // synth lowers every wasm local to a frame slot, so `local.set; local.get` emits // `str rX,[sp,#N]; … ; ldr rY,[sp,#N]`; when rX still holds the value the reload // (a ~2-cycle M4 load) becomes `mov rY,rX`. Removal-of-a-load + rename only ⇒ no - // new instruction form and no label/offset change. BEHIND `SYNTH_STACK_FWD=1` - // (opt-in, off by default ⇒ bit-identical) while it is validated against the - // execution differential + gale's G474RE bench — the same gated path the - // cmp→select flip took before shipping default-on in v0.13.0. - let arm_instrs = if std::env::var("SYNTH_STACK_FWD").is_ok() { + // new instruction form and no label/offset change. DEFAULT-ON (#242 feature + // loop): validated bit-identical RESULTS on every frozen anchor (control_step + // 0x00210A55 13/13, flat+inlined flight_algo 0x07FDF307) with .text reduced on + // the shipped --relocatable path, plus 8 unit tests + the frame_slot_dce + // execution differential — the same gated path cmp→select took to default-on in + // v0.13.0 (G474RE silicon confirms perf post-ship). Escape hatch: + // `SYNTH_NO_STACK_FWD=1` restores the frame-resident bytes (frozen-old goldens). + let stack_fwd = std::env::var("SYNTH_NO_STACK_FWD").is_err(); + let arm_instrs = if stack_fwd { let (out, fwd) = synth_synthesis::liveness::forward_stack_reloads(&arm_instrs); if std::env::var("SYNTH_FUSE_STATS").is_ok() { eprintln!("[stack-fwd] {fwd} stack reload(s) forwarded to register moves"); @@ -584,8 +588,8 @@ fn compile_wasm_to_arm( // VCR-RA frame-slot DCE (#242): once `forward_stack_reloads` has turned the // reloads of a spill slot into register moves, the `str rX,[sp,#N]` that fed // them is a dead store — its slot is never loaded again. Remove it. Pairs - // with (and only pays after) SYNTH_STACK_FWD, so it shares that flag. - let arm_instrs = if std::env::var("SYNTH_STACK_FWD").is_ok() { + // with (and only pays after) stack-reload forwarding, so it shares the flag. + let arm_instrs = if stack_fwd { let (out, n) = synth_synthesis::liveness::eliminate_dead_frame_stores(&arm_instrs); if std::env::var("SYNTH_FUSE_STATS").is_ok() { eprintln!("[frame-slot-dce] {n} dead frame store(s) removed"); diff --git a/crates/synth-cli/tests/frozen_codegen_bytes.rs b/crates/synth-cli/tests/frozen_codegen_bytes.rs index b2fd3cb..11fcefc 100644 --- a/crates/synth-cli/tests/frozen_codegen_bytes.rs +++ b/crates/synth-cli/tests/frozen_codegen_bytes.rs @@ -69,6 +69,7 @@ fn text_sha256(wasm: &str, backend: &str, target: &str) -> (String, usize) { .env_remove("SYNTH_NO_CMP_SELECT_FUSE") .env_remove("SYNTH_NO_LOCAL_PROMOTE") .env_remove("SYNTH_NO_IMM_SHIFT_FOLD") + .env_remove("SYNTH_NO_STACK_FWD") .env_remove("SYNTH_CONST_CSE") .args([ "compile", @@ -130,16 +131,20 @@ fn assert_frozen(cases: &[(&str, &str, usize)], backend: &str, target: &str) { /// the `.py` differentials cover): control_step ↔ 0x00210A55, flight_seam_flat ↔ /// flat+inlined flight_algo 0x07FDF307, plus flight_seam and the div seam. /// -/// Goldens RE-FROZEN for v0.15.0 (#390): immediate-shift folding is now default-on -/// (on top of v0.13.0 cmp→select + v0.14.0 local promotion), so these lock the -/// folded+promoted+fused .text. The execution RESULTS are preserved — re-verified -/// on this commit: control_step still 0x00210A55 (control_step_differential.py -/// 13/13), flat+inlined flight_algo still 0x07FDF307 (flight_seam_differential.py -/// MATCH). .text shrank again (constant shift-amount `movw`s removed): control_step -/// 316→304, flight_seam 866→774, flight_seam_flat 1006→910 (−200 B total); -/// signed_div_const (no register-shift folds) unchanged. Measured −2 cyc/call on the -/// dissolved hot path (.text 100→90 B on gust_mix). Prior promotion goldens were on -/// main @ 6b46f09 (v0.14.0), 2026-06-24. +/// Goldens RE-FROZEN for the SYNTH_STACK_FWD flip (#242 feature loop): stack-reload +/// forwarding + frame-slot dead-store elimination are now default-on (on top of +/// v0.13.0 cmp→select + v0.14.0 local promotion + v0.15.0 immediate-shift folding), +/// so these lock the forwarded+DCE'd .text. The execution RESULTS are preserved — +/// re-verified on this commit: control_step still 0x00210A55 +/// (control_step_differential.py 13/13), flat+inlined flight_algo still 0x07FDF307 +/// (flight_seam_differential.py MATCH on BOTH flat and inlined). .text shrank again +/// (dead frame stores removed, reloads → register moves): flight_seam 774→738, +/// flight_seam_flat 910→878 (−68 B); control_step (no spurious slot reuse) and +/// signed_div_const unchanged. Instruction/memory-op proxy: flight_algo sp-traffic +/// 20→7, 139→135 insns — the measured CYCLE number is gale's G474RE (post-ship). +/// Escape hatch `SYNTH_NO_STACK_FWD=1` restores the prior goldens (asserted by +/// `frozen_fixtures_stack_fwd_escape_hatch_restores_old_bytes`). Prior (flag-off) +/// goldens were flight_seam 9e73eea3…/774, flight_seam_flat 887ea546…/910. #[test] fn frozen_fixtures_text_is_bit_identical_oracle_001() { let cases = [ @@ -150,13 +155,13 @@ fn frozen_fixtures_text_is_bit_identical_oracle_001() { ), ( "flight_seam.wasm", - "9e73eea3867ba085820329951e84a7d650c38a7fc78d9d03a6a83d02963f9670", - 774, + "dce728b48e14aa9187774799c0b268c6ad60be6dc0fa3e7cc9458c17dc65403b", + 738, ), ( "flight_seam_flat.wasm", - "887ea546429a4569112147fdc94b0ba90f02a6ccd2b511aa2ca48dab017dbc2c", - 910, + "0665e623f33457479940046d57a4b7ec02416a61bcc6ec71ea3556ed5b3cb568", + 878, ), ( "signed_div_const.wasm", @@ -167,6 +172,73 @@ fn frozen_fixtures_text_is_bit_identical_oracle_001() { assert_frozen(&cases, "arm", "cortex-m4"); } +/// ESCAPE-HATCH GATE for the SYNTH_STACK_FWD flip (#242). Proves the opt-out +/// actually rolls back: with `SYNTH_NO_STACK_FWD=1` the two fixtures the flip moved +/// restore their PRE-flip goldens byte-for-byte. This is both the rollback proof and +/// a tripwire — if forwarding/DCE ever leaks into the opt-out path, the old hashes +/// stop matching. (control_step / signed_div_const are unaffected by the flip, so +/// they are not re-checked here; the default gate above already locks them.) +#[test] +fn frozen_fixtures_stack_fwd_escape_hatch_restores_old_bytes() { + // (fixture, PRE-flip golden sha256, PRE-flip len) — the v0.15.0 goldens. + let old = [ + ( + "flight_seam.wasm", + "9e73eea3867ba085820329951e84a7d650c38a7fc78d9d03a6a83d02963f9670", + 774usize, + ), + ( + "flight_seam_flat.wasm", + "887ea546429a4569112147fdc94b0ba90f02a6ccd2b511aa2ca48dab017dbc2c", + 910, + ), + ]; + for &(wasm, golden, golden_len) in &old { + let elf = format!("/tmp/frozen_nofwd_{wasm}.elf"); + let out = Command::new(synth()) + .env("SYNTH_NO_STACK_FWD", "1") + .env_remove("SYNTH_NO_CMP_SELECT_FUSE") + .env_remove("SYNTH_NO_LOCAL_PROMOTE") + .env_remove("SYNTH_NO_IMM_SHIFT_FOLD") + .env_remove("SYNTH_CONST_CSE") + .args([ + "compile", + fixture(wasm).to_str().unwrap(), + "-o", + &elf, + "-b", + "arm", + "--target", + "cortex-m4", + "--all-exports", + "--relocatable", + ]) + .output() + .expect("run synth"); + assert!(out.status.success(), "compile failed for {wasm}"); + let bytes = std::fs::read(&elf).expect("read elf"); + let obj = object::File::parse(&*bytes).expect("parse elf"); + let data = obj + .section_by_name(".text") + .expect(".text") + .data() + .expect("read .text"); + assert_eq!( + data.len(), + golden_len, + "{wasm}: SYNTH_NO_STACK_FWD must restore the pre-flip length" + ); + let hex: String = Sha256::digest(data) + .iter() + .map(|b| format!("{b:02x}")) + .collect(); + assert_eq!( + hex, golden, + "{wasm}: SYNTH_NO_STACK_FWD=1 must restore the pre-flip bytes (rollback broken)" + ); + } +} + /// THE RV32 GATE. The RISC-V backend (`synth-backend-riscv`) is an INDEPENDENT /// codegen path with its own miscompile history (#220/#223/#226) — the ARM gate /// gives it zero protection, and its frozen RV32 result-identity invariants diff --git a/scripts/repro/frame_slot_dce_differential.py b/scripts/repro/frame_slot_dce_differential.py index 6fd6c59..b140332 100755 --- a/scripts/repro/frame_slot_dce_differential.py +++ b/scripts/repro/frame_slot_dce_differential.py @@ -14,14 +14,14 @@ On flat_flight's `flight_algo` this cuts sp-relative memory traffic 20→7 (9 loads forwarded, 4 dead stores removed) and 139→135 instructions. -This is byte-CHANGING codegen, so the flag ships DEFAULT-OFF (off ⇒ byte-identical; -the frozen differential hashes enforce it on the shipped --relocatable path). This -harness is the correctness oracle for the flag-ON path: it compiles flat_flight -with `SYNTH_STACK_FWD=1`, runs each export under unicorn (UC_ARCH_ARM / Thumb) with -linear memory seeded exactly as wasmtime's, and asserts the returned value is -bit-identical to wasmtime ground truth across several sensor inputs. It also -asserts the flag-OFF `.text` is byte-identical to a flag-never-read build (the -frozen-safety half) and reports the instruction-count reduction. +As of the #242 feature loop both passes are DEFAULT-ON (the win lands on the +shipped --relocatable path); `SYNTH_NO_STACK_FWD=1` is the opt-out. This harness is +the correctness oracle for the flip: it compiles flat_flight in BOTH configs, runs +flight_algo under unicorn (UC_ARCH_ARM / Thumb) with linear memory seeded exactly +as wasmtime's, and asserts the returned value is bit-identical to wasmtime ground +truth in BOTH the default and the opt-out path across several sensor inputs (a +default flip is only safe if the shipped path AND its rollback both match). It also +asserts the two configs emit DIFFERENT bytes (proof the flip is engaged). Run (needs wasmtime + unicorn + pyelftools): SYNTH=./target/debug/synth python scripts/repro/frame_slot_dce_differential.py @@ -78,9 +78,11 @@ def s_init(vals=(100, -50, 30, 300, -200)): def compile_flat(out, fwd): + # `fwd` True = the shipped DEFAULT (stack-fwd + frame-slot DCE on); + # False = the SYNTH_NO_STACK_FWD opt-out (frame-resident, pre-flip bytes). env = {"PATH": "/usr/bin:/bin"} - if fwd: - env["SYNTH_STACK_FWD"] = "1" + if not fwd: + env["SYNTH_NO_STACK_FWD"] = "1" r = subprocess.run( [SYNTH, "compile", str(WAT), "-o", out, "-b", "arm", "--target", "cortex-m4", "--all-exports"], @@ -151,30 +153,34 @@ def main(): off_code, off_base, off_syms = load(off) on_code, on_base, on_syms = load(on) - # Frozen-safety half: a second flag-off build must be byte-identical. - compile_flat("/tmp/fsdce_off2.elf", fwd=False) - off2_code, _, _ = load("/tmp/fsdce_off2.elf") - if off_code != off2_code: - print("FAIL: flag-off .text is not deterministic/byte-identical") + # The DEFAULT (fwd-on) and the opt-out (fwd-off) must produce DIFFERENT bytes + # — proof the flip is actually engaged in the shipped default — yet identical + # RESULTS. (If these match, the default isn't doing anything.) + if off_code == on_code: + print("FAIL: default and SYNTH_NO_STACK_FWD opt-out emit identical bytes " + "— the flip is not engaged") sys.exit(1) - print("OK flag-off .text byte-identical across builds") + print("OK default vs SYNTH_NO_STACK_FWD opt-out: bytes differ (flip engaged)") fails = 0 # flight_algo is the optimization target — the only export with frame-slot # traffic the passes touch (DCE fires 4× there, 0× on the others) and the one - # with a clean (i32 state_ptr, i32 sensor_ptr) -> i32 ABI to drive here. The - # flag-off byte-identity check above covers the whole module's bytes. + # with a clean (i32 state_ptr, i32 sensor_ptr) -> i32 ABI to drive here. + # Correctness must hold in BOTH configs (default and opt-out) — a default flip + # is only safe if the shipped path AND its rollback both match wasmtime. exports = [e for e in ("flight_algo",) if e in on_syms] - for fn in exports: - for st, s in CASES: - gt = wasmtime_run(fn, st, s) - got = unicorn_run(on_code, on_base, on_syms[fn], st, s) - ok = isinstance(got, int) and got == gt - if not ok: - fails += 1 - tag = f"0x{got:08X}" if isinstance(got, int) else got - print(f"{'OK ' if ok else 'FAIL'} {fn:16} fwd-on = {tag} " - f"(wasmtime 0x{gt:08X})") + for label, (code, base, syms) in (("default", (on_code, on_base, on_syms)), + ("opt-out", (off_code, off_base, off_syms))): + for fn in exports: + for st, s in CASES: + gt = wasmtime_run(fn, st, s) + got = unicorn_run(code, base, syms[fn], st, s) + ok = isinstance(got, int) and got == gt + if not ok: + fails += 1 + tag = f"0x{got:08X}" if isinstance(got, int) else got + print(f"{'OK ' if ok else 'FAIL'} {fn:14} {label:8} = {tag} " + f"(wasmtime 0x{gt:08X})") print(f"\n.text instructions: off={insn_count(off_code)} " f"on={insn_count(on_code)} "