From f16c30f7fb40f976e99746d87076720caea24665 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Thu, 25 Jun 2026 07:02:23 +0200 Subject: [PATCH] feat(vcr-ra): dead-frame elimination for promoted-local leaves, flag-off (#390, #242) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `compute_local_layout` reserves a frame slot (`sub sp,#N` / `add sp,#N`) for every non-param wasm local it sees. Local promotion (v0.14.0) then homes the eligible i32 locals in registers, so for a function whose locals all promote — and which neither spills, calls, nor touches i64 / stack-passed params — those frame bytes are never accessed. The `sub`/`add sp` pair is then pure overhead (~2-3 cyc on a small leaf), AND because it writes SP it makes `shrink_callee_saved_saves` decline (that pass bails on any SP def/use). `elide_dead_frame` removes the pair when the body provably never touches SP, saving the two instructions and restoring the SP-untouched precondition the shrink pass needs — so the two passes compose. It runs BEFORE shrink in the relocatable pipeline. Safe-by-construction: fires only when NO instruction reads/writes SP except the matched frame sub/add and the prologue Push / epilogue Pop. For wasm locals that guard is exact deadness — locals are not addressable, so every other SP consumer (spills, #204 param-backing, the i64 pair-spill area, the #359 outgoing-arg region, incoming stack params) manifests as an `[sp,#off]` access the guard sees. Any such access, or any unmodeled op whose SP effect can't be confirmed absent, declines and leaves the bytes unchanged. Removal-only: no instruction is added, rewritten, or reordered. Flag-off (opt-in `SYNTH_DEAD_FRAME_ELIM=1`); default path byte-identical — the frozen byte gate stays green. Default-on flip held for on-silicon validation, like the realloc/shrink levers. Validation: - 6 unit tests (removes / declines on sp-relative / unbalanced add-sp / unmodeled sp-effect / no-frame noop / multiple epilogues). - leaf_dead_frame_differential.py: leaf3 under unicorn, flag-off==flag-on== wasmtime over 10 vectors (signed + i32-wrap edges); 36 B -> 28 B (-8 B). Both builds return cleanly via popped LR, confirming SP balance. NOTE: the push stays {r4-r8,lr} here — a,b,c land in callee-saved r4,r5,r6 + scratch r7 = 4 saved regs, which shrink pads back to the even-count {r4-r8,lr}. Trimming the push needs the locals OUT of callee-saved (caller-saved leaf homing), tracked separately as #390. Co-Authored-By: Claude Opus 4.8 --- crates/synth-backend/src/arm_backend.rs | 12 + crates/synth-synthesis/src/liveness.rs | 285 ++++++++++++++++++ scripts/repro/leaf_caller_saved.wat | 30 +- scripts/repro/leaf_dead_frame_differential.py | 136 +++++++++ 4 files changed, 452 insertions(+), 11 deletions(-) create mode 100644 scripts/repro/leaf_dead_frame_differential.py diff --git a/crates/synth-backend/src/arm_backend.rs b/crates/synth-backend/src/arm_backend.rs index 710d1f5..af24d89 100644 --- a/crates/synth-backend/src/arm_backend.rs +++ b/crates/synth-backend/src/arm_backend.rs @@ -429,6 +429,18 @@ fn compile_wasm_to_arm( stats.needs_spill ); } + // VCR-RA-002 (#390, epic #242): eliminate a provably-dead stack frame + // (`sub sp,#N`/`add sp,#N` reserved by `compute_local_layout` for locals + // that promotion homed in registers, never accessed). Removing it saves + // the two instructions AND restores the SP-untouched precondition that + // `shrink_callee_saved_saves` requires — so it must run FIRST. Flag-off + // (opt-in `SYNTH_DEAD_FRAME_ELIM=1`); off ⇒ byte-identical. Default-on + // flip held for on-silicon validation, like the realloc/shrink levers. + let out = if std::env::var("SYNTH_DEAD_FRAME_ELIM").is_ok() { + synth_synthesis::liveness::elide_dead_frame(&out).unwrap_or(out) + } else { + out + }; synth_synthesis::liveness::shrink_callee_saved_saves(&out).unwrap_or(out) } else { arm_instrs diff --git a/crates/synth-synthesis/src/liveness.rs b/crates/synth-synthesis/src/liveness.rs index 82de9bf..dcb9499 100644 --- a/crates/synth-synthesis/src/liveness.rs +++ b/crates/synth-synthesis/src/liveness.rs @@ -2390,6 +2390,141 @@ pub fn shrink_callee_saved_saves(instrs: &[ArmInstruction]) -> Option Option> { + use ArmOp::*; + + // Pass 1: locate the single frame allocation and its matching deallocs, and + // verify nothing else touches SP. + let mut frame_sub: Option<(usize, i32)> = None; + let mut frame_adds: Vec = Vec::new(); + for (i, ins) in instrs.iter().enumerate() { + match &ins.op { + Sub { + rd: Reg::SP, + rn: Reg::SP, + op2: Operand2::Imm(n), + } => { + if frame_sub.is_some() { + return None; // more than one frame allocation — unmodeled + } + frame_sub = Some((i, *n)); + } + Add { + rd: Reg::SP, + rn: Reg::SP, + op2: Operand2::Imm(_), + } => frame_adds.push(i), + _ => {} + } + } + let (sub_idx, frame_n) = frame_sub?; + if frame_adds.is_empty() { + return None; + } + // Every frame dealloc must restore exactly `frame_n` (balanced); a different + // immediate means this `add sp` is not the frame's counterpart → decline. + for &ai in &frame_adds { + if let Add { + op2: Operand2::Imm(n), + .. + } = &instrs[ai].op + && *n != frame_n + { + return None; + } + } + + // Pass 2: prove the frame is dead — no SP access outside the frame sub/adds + // and the push/pop. + for (i, ins) in instrs.iter().enumerate() { + if i == sub_idx || frame_adds.contains(&i) { + continue; + } + match &ins.op { + // Push/Pop move SP but never index the frame region — allowed. + // Pure label control flow has no register operands. + Push { .. } + | Pop { .. } + | Label { .. } + | B { .. } + | BOffset { .. } + | BCondOffset { .. } + | Bhs { .. } + | Blo { .. } + | Bcc { .. } => {} + Bx { rm } if *rm == Reg::LR => {} + op => { + // Unmodeled op (call/BrTable/i64-pair/...): cannot prove SP-free. + let e = reg_effect(op)?; + if e.defs.iter().chain(e.uses.iter()).any(|r| *r == Reg::SP) { + return None; + } + // SP hides inside addressing modes too. + if let Ldr { addr, .. } + | Ldrb { addr, .. } + | Ldrsb { addr, .. } + | Ldrh { addr, .. } + | Ldrsh { addr, .. } + | Str { addr, .. } + | Strb { addr, .. } + | Strh { addr, .. } = op + && (addr.base == Reg::SP || addr.offset_reg == Some(Reg::SP)) + { + return None; + } + } + } + } + + // Dead frame confirmed: drop the `sub sp` and every matching `add sp`. + let drop: BTreeSet = std::iter::once(sub_idx) + .chain(frame_adds.iter().copied()) + .collect(); + Some( + instrs + .iter() + .enumerate() + .filter(|(i, _)| !drop.contains(i)) + .map(|(_, ins)| ins.clone()) + .collect(), + ) +} + /// Defense-in-depth: before accepting a segment's rewrite, every interference /// edge is re-checked against the final assignment (independent of the /// colourer), mirroring `verify_allocation`. @@ -5462,6 +5597,156 @@ mod tests { assert_eq!(out[4], seq[4]); } + // ---- elide_dead_frame (VCR-RA-002, #390) ---- + + fn frame_sub(n: i32) -> ArmInstruction { + ins(ArmOp::Sub { + rd: Reg::SP, + rn: Reg::SP, + op2: Operand2::Imm(n), + }) + } + fn frame_add(n: i32) -> ArmInstruction { + ins(ArmOp::Add { + rd: Reg::SP, + rn: Reg::SP, + op2: Operand2::Imm(n), + }) + } + + #[test] + fn elide_dead_frame_removes_unreferenced_frame() { + // Promoted-leaf shape: a `sub sp,#16` reserves slots for locals that all + // live in registers; the body never touches SP. The frame is dead. + let body_a = ins(ArmOp::Add { + rd: Reg::R4, + rn: Reg::R0, + op2: Operand2::Imm(1), + }); + let body_b = ins(ArmOp::Mov { + rd: Reg::R0, + op2: Operand2::Reg(Reg::R4), + }); + let seq = vec![ + prologue(), + frame_sub(16), + body_a.clone(), + body_b.clone(), + frame_add(16), + epilogue(), + ]; + let out = elide_dead_frame(&seq).expect("dead frame elided"); + // sub/add sp gone; everything else verbatim and in order. + assert_eq!( + out, + vec![prologue(), body_a, body_b, epilogue()], + "only the sub/add sp pair is removed" + ); + } + + #[test] + fn elide_dead_frame_declines_on_sp_relative_access() { + use crate::rules::MemAddr; + // A spill/local lives in the frame → `[sp,#off]` access → frame is LIVE. + let seq = vec![ + prologue(), + frame_sub(16), + ins(ArmOp::Str { + rd: Reg::R0, + addr: MemAddr { + base: Reg::SP, + offset: 4, + offset_reg: None, + }, + }), + frame_add(16), + epilogue(), + ]; + assert_eq!(elide_dead_frame(&seq), None); + } + + #[test] + fn elide_dead_frame_declines_on_unbalanced_add_sp() { + // An `add sp,#8` that does not match the `sub sp,#16` is some other SP + // arithmetic, not the frame's counterpart → decline conservatively. + let seq = vec![ + prologue(), + frame_sub(16), + ins(ArmOp::Mov { + rd: Reg::R4, + op2: Operand2::Imm(0), + }), + frame_add(8), + epilogue(), + ]; + assert_eq!(elide_dead_frame(&seq), None); + } + + #[test] + fn elide_dead_frame_declines_on_unmodeled_sp_effect() { + // A call sits inside the frame: `reg_effect` declines `Bl` (its SP/clobber + // effect is not modeled), so we cannot prove the frame dead → decline. + let seq = vec![ + prologue(), + frame_sub(16), + ins(ArmOp::Bl { + label: "func_1".to_string(), + }), + frame_add(16), + epilogue(), + ]; + assert_eq!(elide_dead_frame(&seq), None); + } + + #[test] + fn elide_dead_frame_noop_when_no_frame() { + // No `sub sp` at all (frame_size == 0) → nothing to elide. + let seq = vec![ + prologue(), + ins(ArmOp::Add { + rd: Reg::R4, + rn: Reg::R0, + op2: Operand2::Imm(1), + }), + epilogue(), + ]; + assert_eq!(elide_dead_frame(&seq), None); + } + + #[test] + fn elide_dead_frame_removes_across_multiple_epilogues() { + // Two return paths each restore the frame; both `add sp` are removed. + let seq = vec![ + prologue(), + frame_sub(8), + ins(ArmOp::B { + label: "L_end".to_string(), + }), + frame_add(8), + epilogue(), + ins(ArmOp::Label { + name: "L_end".to_string(), + }), + frame_add(8), + epilogue(), + ]; + let out = elide_dead_frame(&seq).expect("dead frame elided"); + assert_eq!( + out, + vec![ + prologue(), + ins(ArmOp::B { + label: "L_end".to_string(), + }), + epilogue(), + ins(ArmOp::Label { + name: "L_end".to_string(), + }), + epilogue(), + ], + ); + } + #[test] fn precolored_node_keeps_its_colour_and_constrains_neighbours() { // Triangle R0—R1—R2; pin R0 to colour 2. With k=3, R1/R2 must avoid 2 diff --git a/scripts/repro/leaf_caller_saved.wat b/scripts/repro/leaf_caller_saved.wat index 1f7f7f2..7db3ce4 100644 --- a/scripts/repro/leaf_caller_saved.wat +++ b/scripts/repro/leaf_caller_saved.wat @@ -1,17 +1,25 @@ -;; perf repro (VCR-RA-002, #428, epic #242): leaf-function prologue shrink. +;; perf repro (VCR-RA-002, #390, epic #242): leaf-function prologue shrink — +;; dead-frame elimination. ;; -;; Local promotion (v0.14.0) homes eligible i32 locals in callee-saved r4..r8. -;; For a LEAF function that is the wrong pool: callee-saved regs must be -;; saved/restored (`push {r4-r8,lr}` / `pop {…,pc}` ~12 cyc of pure overhead), -;; whereas a leaf never calls, so caller-saved r1..r3 (minus params, minus r0 -;; for the return value) are free homes that need NO prologue save. Promoting -;; into caller-saved first lets `shrink_callee_saved_saves` drop the callee-saved -;; push entirely. +;; `compute_local_layout` reserves a frame slot (`sub sp,#N` / `add sp,#N`) for +;; every non-param wasm local it sees. Local promotion (v0.14.0) then homes the +;; eligible i32 locals in registers, so those frame bytes are NEVER accessed: a +;; dead `sub`/`add sp` pair (~2-3 cyc on a small leaf) that also touches SP and +;; thereby blocks `shrink_callee_saved_saves` (which declines on any SP def/use). +;; `elide_dead_frame` (SYNTH_DEAD_FRAME_ELIM=1) removes it when the body provably +;; never touches SP — saving the two instructions and restoring the SP-untouched +;; precondition the shrink pass needs. ;; ;; This fixture: 1 param + 3 promotable i32 locals (each written-before-read, -;; depth-0, >=2 reads), minimal operand-stack temp pressure. Flag-off homes -;; a,b,c -> r4,r5,r6 (push {r4-r6,lr}); flag-on (SYNTH_LEAF_CALLER_SAVED=1) homes -;; them -> r1,r2,r3 (no callee-saved push). Same result either way. +;; depth-0, >=2 reads), minimal operand-stack temp pressure. Promotion homes +;; a,b,c -> r4,r5,r6 and the layout reserves a dead 16-byte frame. Flag-off keeps +;; it (`sub sp,#16` ... `add sp,#16`, 36 B); flag-on elides it (28 B, -8 B = the +;; two 4-byte wide insns), byte-identical otherwise. leaf3(p) = 4*p + 10. +;; +;; NOTE: the push stays `{r4-r8,lr}` either way here — a,b,c are in callee-saved +;; r4,r5,r6 + scratch r7 = 4 saved regs, which `shrink_callee_saved_saves` pads +;; back up to the even-count `{r4-r8,lr}`. Trimming the push needs the locals OUT +;; of callee-saved (caller-saved leaf homing), tracked separately as #390. ;; ;; Generic — neutral values, tied to nothing real. (module diff --git a/scripts/repro/leaf_dead_frame_differential.py b/scripts/repro/leaf_dead_frame_differential.py new file mode 100644 index 0000000..01a67b9 --- /dev/null +++ b/scripts/repro/leaf_dead_frame_differential.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +"""VCR-RA-002 / VCR-ORACLE-001 (#390, #242) — EXECUTION-validate dead-frame elision. + +`compute_local_layout` reserves a frame slot (`sub sp,#N` / `add sp,#N`) for every +non-param wasm local. Local promotion (v0.14.0) then homes the eligible i32 locals +in registers, leaving those frame bytes never accessed — a dead `sub`/`add sp` pair. +`elide_dead_frame` (SYNTH_DEAD_FRAME_ELIM=1) removes it when the body provably never +touches SP. + +This harness runs `leaf3` under unicorn in BOTH the flag-off build (frame present) +and the flag-on build (frame elided) and asserts both equal wasmtime ground truth +across an input sweep. Because the transform deletes SP arithmetic, the danger it +guards against is precisely a SILENT stack imbalance — so executing both builds and +comparing the returned value (and that the function returns cleanly via the popped +LR) is the oracle that matters, not a byte diff alone. + +NON-VACUITY: the run aborts unless the flag-on .text is strictly smaller than +flag-off (the `sub`/`add sp` pair was actually removed) — otherwise nothing was +transformed and the "validation" is theater. + +leaf3(p) = a=p+1; b=a+2; c=b+3; (a+b)+(c+p) = 4*p + 10 (mod 2^32) + +Run (needs wasmtime + unicorn + pyelftools, e.g. a venv): + python scripts/repro/leaf_dead_frame_differential.py + +Exits nonzero on any mismatch or vacuity failure. +""" +import subprocess +import sys + +import wasmtime +from elftools.elf.elffile import ELFFile +from unicorn import UC_ARCH_ARM, UC_MODE_THUMB, Uc, UcError +from unicorn.arm_const import ( + UC_ARM_REG_LR, + UC_ARM_REG_R0, + UC_ARM_REG_R11, + UC_ARM_REG_SP, +) + +WASM = "scripts/repro/leaf_caller_saved.wat" +SYNTH = "./target/debug/synth" +ELF_OFF, ELF_ON = "/tmp/leaf_frame_off.elf", "/tmp/leaf_frame_on.elf" +CODE, LIN, STK, RET = 0x200000, 0x40000, 0x90000, 0x300000 + +# p values: signed edges, i32-wrap (4p+10 overflow), zero. +VECTORS = [0, 1, 7, 100, -1, -3, 2147483647, -2147483648, 1073741823, 12345] + + +def to_s32(x): + x &= 0xFFFFFFFF + return x - 0x100000000 if x & 0x80000000 else x + + +def compile_elf(out, elide): + env = {"PATH": "/usr/bin:/bin"} + if elide: + env["SYNTH_DEAD_FRAME_ELIM"] = "1" + r = subprocess.run( + [SYNTH, "compile", WASM, "-o", out, "--target", "cortex-m4", + "--all-exports", "--relocatable", "--native-pointer-abi"], + capture_output=True, text=True, env={**env}, + ) + if r.returncode != 0: + sys.exit(f"compile failed (elide={elide}): {r.stderr}") + + +def load(elf): + # leaf3 is the only function → .text offset 0; entry == CODE. + text = ELFFile(open(elf, "rb")).get_section_by_name(".text") + code, base = text.data(), text["sh_addr"] + if not code: + sys.exit(f"{elf}: .text is empty") + return code, base, base + + +def run_arm(code, base, fa, p): + mu = Uc(UC_ARCH_ARM, UC_MODE_THUMB) + mu.mem_map(CODE, 0x10000) + mu.mem_map(LIN, 0x20000) + mu.mem_map(STK - 0x8000, 0x10000) + mu.mem_map(RET, 0x1000) + mu.mem_write(CODE, code) + mu.reg_write(UC_ARM_REG_SP, STK) + mu.reg_write(UC_ARM_REG_R11, LIN) + mu.reg_write(UC_ARM_REG_R0, p & 0xFFFFFFFF) + mu.reg_write(UC_ARM_REG_LR, RET | 1) + try: + mu.emu_start((CODE + fa - base) | 1, RET, count=10000) + except UcError as e: + return f"ERR:{e}" + return mu.reg_read(UC_ARM_REG_R0) & 0xFFFFFFFF + + +def main(): + compile_elf(ELF_OFF, elide=False) + compile_elf(ELF_ON, elide=True) + + off = load(ELF_OFF) + on = load(ELF_ON) + + # Non-vacuity: the frame `sub`/`add sp` pair (two 4-byte wide insns) must + # actually be gone — flag-on .text strictly smaller than flag-off. + off_len, on_len = len(off[0]), len(on[0]) + if not on_len < off_len: + sys.exit(f"VACUOUS: flag-on .text ({on_len}B) not smaller than " + f"flag-off ({off_len}B) — no frame was elided") + + engine = wasmtime.Engine() + module = wasmtime.Module(engine, open(WASM, "rb").read()) + + def wt(p): + store = wasmtime.Store(engine) + inst = wasmtime.Instance(store, module, []) + return inst.exports(store)["leaf3"](store, p) & 0xFFFFFFFF + + fails = 0 + for p in VECTORS: + gt = wt(p) + r_off = run_arm(*off, p) + r_on = run_arm(*on, p) + ok = isinstance(r_off, int) and isinstance(r_on, int) and r_off == gt and r_on == gt + fails += 0 if ok else 1 + so = f"0x{r_off:08X}" if isinstance(r_off, int) else r_off + sn = f"0x{r_on:08X}" if isinstance(r_on, int) else r_on + print(f"leaf3({p}) off={so} on={sn} wasmtime=0x{gt:08X}" + f"{'' if ok else ' <-- MISMATCH'}") + + print(f"\n{len(VECTORS) - fails}/{len(VECTORS)} match (off==on==wasmtime); " + f"frame elided: {off_len}B -> {on_len}B (-{off_len - on_len}B)") + print("ORACLE: PASS" if fails == 0 else f"ORACLE: FAIL ({fails})") + sys.exit(1 if fails else 0) + + +if __name__ == "__main__": + main()