diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index aa0a949..9097feb 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -303,3 +303,37 @@ jobs:
         env:
           SYNTH: ./target/debug/synth
         run: python scripts/repro/shift_fold_riscv_differential.py
+
+  rv32-const-addr-fold-oracle:
+    name: rv32 const-address-fold execution oracle
+    # VCR-ORACLE-001 (#242, #472 step 2): EXECUTE the RV32 const-address-fold lever
+    # under unicorn (UC_ARCH_RISCV) in BOTH flag states and diff the resulting
+    # linear MEMORY vs wasmtime. Ships flag-off (SYNTH_RV_ADDR_FOLD) awaiting the
+    # on-silicon flip, so this continuously validates the flag-on path (folding a
+    # constant address into the access immediate off s11) against regression.
+    # Isolated job: emulation deps pip-installed here ONLY.
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v7
+      - uses: dtolnay/rust-toolchain@stable
+      - name: Cache Cargo dependencies
+        uses: actions/cache@v5
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            target/
+          key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-
+      - name: Build synth
+        run: cargo build -p synth-cli
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.x"
+      - name: Install emulation deps
+        run: pip install wasmtime unicorn pyelftools
+      - name: Run RV32 const-address-fold execution oracle
+        env:
+          SYNTH: ./target/debug/synth
+        run: python scripts/repro/const_addr_fold_riscv_differential.py
diff --git a/crates/synth-backend-riscv/src/selector.rs b/crates/synth-backend-riscv/src/selector.rs
index 390f5fa..58e079d 100644
--- a/crates/synth-backend-riscv/src/selector.rs
+++ b/crates/synth-backend-riscv/src/selector.rs
@@ -215,6 +215,13 @@ pub fn select_with_result_types(
     if std::env::var_os("SYNTH_RV_SHIFT_FOLD").is_some() {
         fold_const_shift(&mut ctx.out);
     }
+    // VCR-RA RV32 lever (#472 step 2): fold a constant memory address into the
+    // access immediate off s11, dropping the `addi addr,zero,ADDR; add tmp,s11,addr`
+    // pair. Flag-off by default (frozen-safe, like the shift fold); the on-target
+    // cycle win is validated under the RV32 differential before the default-on flip.
+    if std::env::var_os("SYNTH_RV_ADDR_FOLD").is_some() {
+        fold_const_addr(&mut ctx.out);
+    }
     let local_frame_bytes = ctx.local_frame_bytes;
     // #220: the temp pool includes callee-saved s-registers (s1..s6), but the
     // RV psABI requires a function to preserve s0..s11. Wrap the body in a
@@ -459,6 +466,153 @@ fn fold_const_shift(out: &mut Vec<RiscVOp>) -> usize {
     folds
 }
 
+/// VCR-RA RV32 lever (#472 step 2, epic #242): fold a CONSTANT memory address into
+/// the access immediate off the linear-memory base `s11`, the RISC-V analogue of
+/// the ARM base-CSE address half (#468). A `i32.load/store (i32.const ADDR) …`
+/// lowers as `addi a,zero,ADDR; add tmp,s11,a; lw/sw _,off(tmp)`; when `ADDR+off`
+/// fits the signed-12-bit `lw/sw` immediate, that collapses to a single
+/// `lw/sw _,(ADDR+off)(s11)`, dropping BOTH the `add tmp,s11,a` and the
+/// `addi a,zero,ADDR` (2 instructions per constant-address access).
+///
+/// Soundness:
+///   * `ADDR+off` is range-checked as a SUM against [-2048, 2047] (each term is
+///     already ≤12 bits, so two in-range values can sum out of range);
+///   * the base of the `add` must be `s11` and its address operand `a` must be a
+///     `addi a,zero,ADDR` (a single-`addi` small constant; a `lui+addi` large
+///     address is out of v1 scope and stays the `add` form);
+///   * a store's VALUE operand must not be `tmp`: `sw tmp,off(tmp)` consumes the
+///     add result as data, so dropping the `add` would store a stale register;
+///   * the fold is a 3→1 rewrite, so BOTH dropped temps must be dead: `tmp` (the
+///     add result) read only by the access, and `a` (the address constant) read
+///     only by the `add` — verified with [`rv_reg_dead_after`] plus a check that
+///     `a` is untouched between its def and the `add`.
+///
+/// Only the contiguous `add tmp,s11,a` immediately followed by its access is
+/// matched (the no-bounds-check lowering shape); a bounds check between them reads
+/// `a`, which disqualifies the fold. Returns the fold count.
+fn fold_const_addr(out: &mut Vec<RiscVOp>) -> usize {
+    use RiscVOp::*;
+    let n = out.len();
+    let mut drop = vec![false; n];
+    let mut folds = 0usize;
+
+    for i in 0..n {
+        // [i] must be `add tmp, s11, a`.
+        let (tmp, a) = match &out[i] {
+            Add { rd, rs1, rs2 } if *rs1 == Reg::S11 => (*rd, *rs2),
+            _ => continue,
+        };
+        if i + 1 >= n {
+            continue;
+        }
+        // The access at i+1 must use `tmp` as its base register; capture its imm.
+        let off = match &out[i + 1] {
+            Lw { rs1, imm, .. }
+            | Lh { rs1, imm, .. }
+            | Lhu { rs1, imm, .. }
+            | Lb { rs1, imm, .. }
+            | Lbu { rs1, imm, .. }
+                if *rs1 == tmp =>
+            {
+                *imm
+            }
+            // A store that also uses `tmp` as its VALUE operand still needs the
+            // `add` result, so the `add` cannot be dropped: decline.
+            Sw { rs1, rs2, imm } | Sh { rs1, rs2, imm } | Sb { rs1, rs2, imm }
+                if *rs1 == tmp && *rs2 != tmp =>
+            {
+                *imm
+            }
+            _ => continue,
+        };
+        // `a` must be defined by the nearest prior `addi a, zero, ADDR`.
+        let addr_def = (0..i).rev().find(|&j| op_dest(&out[j]) == Some(a));
+        let (def_idx, addr_const) = match addr_def {
+            Some(j) => match &out[j] {
+                Addi { rs1, imm, .. } if *rs1 == Reg::ZERO => (j, *imm),
+                _ => continue,
+            },
+            None => continue,
+        };
+        // The folded immediate is the SUM; it must fit the signed 12-bit window.
+        let total = addr_const.wrapping_add(off);
+        if !(-2048..=2047).contains(&total) {
+            continue;
+        }
+        // `tmp` must be dead after the access (its only consumer).
+        if !rv_reg_dead_after(tmp, &out[i + 2..]) {
+            continue;
+        }
+        // `a` must be read ONLY by the `add` at i: untouched between its def and
+        // the add, and dead after.
+        let a_read_between = (def_idx + 1..i).any(|k| match op_reads(&out[k]) {
+            Some(rs) => rs.contains(&a),
+            None => true, // unmodeled op between ⇒ cannot prove `a` unread
+        });
+        if a_read_between || !rv_reg_dead_after(a, &out[i + 1..]) {
+            continue;
+        }
+        // Rewrite the access to address off s11 with the folded immediate.
+        out[i + 1] = match out[i + 1].clone() {
+            Lw { rd, .. } => Lw {
+                rd,
+                rs1: Reg::S11,
+                imm: total,
+            },
+            Lh { rd, .. } => Lh {
+                rd,
+                rs1: Reg::S11,
+                imm: total,
+            },
+            Lhu { rd, .. } => Lhu {
+                rd,
+                rs1: Reg::S11,
+                imm: total,
+            },
+            Lb { rd, .. } => Lb {
+                rd,
+                rs1: Reg::S11,
+                imm: total,
+            },
+            Lbu { rd, .. } => Lbu {
+                rd,
+                rs1: Reg::S11,
+                imm: total,
+            },
+            Sw { rs2, .. } => Sw {
+                rs1: Reg::S11,
+                rs2,
+                imm: total,
+            },
+            Sh { rs2, .. } => Sh {
+                rs1: Reg::S11,
+                rs2,
+                imm: total,
+            },
+            Sb { rs2, .. } => Sb {
+                rs1: Reg::S11,
+                rs2,
+                imm: total,
+            },
+            other => other, // unreachable (matched above)
+        };
+        drop[i] = true;
+        drop[def_idx] = true;
+        folds += 1;
+    }
+
+    if folds == 0 {
+        return 0;
+    }
+    let mut idx = 0usize;
+    out.retain(|_| {
+        let keep = !drop[idx];
+        idx += 1;
+        keep
+    });
+    folds
+}
+
 /// True for callee-saved registers the allocator may hand out as temps:
 /// `s1` (x9) and `s2..s10` (x18..x26). Excludes the runtime-reserved `s0`
 /// (x8, frame pointer) and `s11` (x27, `__linear_memory_base`) — which the
@@ -6428,6 +6582,222 @@ mod tests {
         );
     }
 
+    // ---- #472 step 2: const-address fold ----
+
+    /// Folds `addi a,zero,ADDR; add tmp,s11,a; sw v,off(tmp)` to `sw v,(ADDR)(s11)`,
+    /// dropping the address `addi` and the `add`. Driven on the baseline fixture
+    /// output (what the CLI flag wires in).
+    #[test]
+    fn fold_const_addr_folds_store_off_s11_472() {
+        // (i32.store (i32.const 8) (i32.const 33))
+        let mut out = s(
+            &[
+                WasmOp::I32Const(8),
+                WasmOp::I32Const(33),
+                WasmOp::I32Store {
+                    offset: 0,
+                    align: 2,
+                },
+                WasmOp::End,
+            ],
+            0,
+        );
+        let before_adds = count(&out, |op| matches!(op, RiscVOp::Add { rs1: Reg::S11, .. }));
+        assert_eq!(before_adds, 1, "baseline computes base+addr: {out:?}");
+        let folds = fold_const_addr(&mut out);
+        assert_eq!(folds, 1, "the const-addr store folds: {out:?}");
+        assert_eq!(
+            count(&out, |op| matches!(
+                op,
+                RiscVOp::Sw {
+                    rs1: Reg::S11,
+                    imm: 8,
+                    ..
+                }
+            )),
+            1,
+            "store folded to `sw v, 8(s11)`: {out:?}"
+        );
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Add { rs1: Reg::S11, .. })),
+            0,
+            "the `add _,s11,_` is gone: {out:?}"
+        );
+    }
+
+    /// The folded immediate is `ADDR + access-offset`; both terms must sum within
+    /// the signed 12-bit window. `i32.store offset=4 (i32.const 8)` → `sw v,12(s11)`.
+    #[test]
+    fn fold_const_addr_adds_access_offset_472() {
+        let mut out = s(
+            &[
+                WasmOp::I32Const(8),
+                WasmOp::I32Const(33),
+                WasmOp::I32Store {
+                    offset: 4,
+                    align: 2,
+                },
+                WasmOp::End,
+            ],
+            0,
+        );
+        assert_eq!(fold_const_addr(&mut out), 1, "{out:?}");
+        assert_eq!(
+            count(&out, |op| matches!(
+                op,
+                RiscVOp::Sw {
+                    rs1: Reg::S11,
+                    imm: 12,
+                    ..
+                }
+            )),
+            1,
+            "ADDR(8)+off(4) = 12: {out:?}"
+        );
+    }
+
+    /// A load folds the same way: `lw dst, (ADDR)(s11)`.
+    #[test]
+    fn fold_const_addr_folds_load_472() {
+        let mut out = s(
+            &[
+                WasmOp::I32Const(16),
+                WasmOp::I32Load {
+                    offset: 0,
+                    align: 2,
+                },
+                WasmOp::Drop,
+                WasmOp::End,
+            ],
+            0,
+        );
+        assert_eq!(fold_const_addr(&mut out), 1, "{out:?}");
+        assert_eq!(
+            count(&out, |op| matches!(
+                op,
+                RiscVOp::Lw {
+                    rs1: Reg::S11,
+                    imm: 16,
+                    ..
+                }
+            )),
+            1,
+            "load folded to `lw dst, 16(s11)`: {out:?}"
+        );
+    }
+
+    /// Range guard: when `ADDR + off` exceeds the signed 12-bit window the fold
+    /// must decline (the access stays `lw/sw _,off(tmp)` off the computed base).
+    #[test]
+    fn fold_const_addr_declines_when_sum_out_of_12bit_472() {
+        // ADDR = 2044, off = 8 → 2052 > 2047 → out of range.
+        let mut out = vec![
+            RiscVOp::Addi {
+                rd: Reg::T0,
+                rs1: Reg::ZERO,
+                imm: 2044,
+            },
+            RiscVOp::Add {
+                rd: Reg::T1,
+                rs1: Reg::S11,
+                rs2: Reg::T0,
+            },
+            RiscVOp::Sw {
+                rs1: Reg::T1,
+                rs2: Reg::T2,
+                imm: 8,
+            },
+            RiscVOp::Jalr {
+                rd: Reg::ZERO,
+                rs1: Reg::RA,
+                imm: 0,
+            },
+        ];
+        assert_eq!(
+            fold_const_addr(&mut out),
+            0,
+            "out-of-range sum: no fold: {out:?}"
+        );
+        assert_eq!(
+            count(&out, |op| matches!(op, RiscVOp::Add { rs1: Reg::S11, .. })),
+            1
+        );
+    }
+
+    /// Soundness: when the address temp `a` is read by another op (not single-use),
+    /// the fold must NOT drop its `addi` — decline.
+    #[test]
+    fn fold_const_addr_declines_when_addr_reused_472() {
+        let mut out = vec![
+            RiscVOp::Addi {
+                rd: Reg::T0,
+                rs1: Reg::ZERO,
+                imm: 16,
+            },
+            RiscVOp::Add {
+                rd: Reg::T1,
+                rs1: Reg::S11,
+                rs2: Reg::T0,
+            },
+            RiscVOp::Sw {
+                rs1: Reg::T1,
+                rs2: Reg::T2,
+                imm: 0,
+            },
+            // a second use of the address constant t0 ⇒ not single-use
+            RiscVOp::Add {
+                rd: Reg::T3,
+                rs1: Reg::A0,
+                rs2: Reg::T0,
+            },
+            RiscVOp::Jalr {
+                rd: Reg::ZERO,
+                rs1: Reg::RA,
+                imm: 0,
+            },
+        ];
+        assert_eq!(
+            fold_const_addr(&mut out),
+            0,
+            "addr reused ⇒ no fold: {out:?}"
+        );
+    }
+
+    /// Soundness: a store whose VALUE operand is the add result `tmp`
+    /// (`sw tmp,off(tmp)`) still consumes the `add`; dropping it would store a
+    /// stale register — decline.
+    #[test]
+    fn fold_const_addr_declines_when_store_value_is_tmp_472() {
+        let mut out = vec![
+            RiscVOp::Addi {
+                rd: Reg::T0,
+                rs1: Reg::ZERO,
+                imm: 16,
+            },
+            RiscVOp::Add {
+                rd: Reg::T1,
+                rs1: Reg::S11,
+                rs2: Reg::T0,
+            },
+            // `sw t1, 0(t1)`: t1 is both the base and the stored value
+            RiscVOp::Sw {
+                rs1: Reg::T1,
+                rs2: Reg::T1,
+                imm: 0,
+            },
+            RiscVOp::Jalr {
+                rd: Reg::ZERO,
+                rs1: Reg::RA,
+                imm: 0,
+            },
+        ];
+        assert_eq!(
+            fold_const_addr(&mut out),
+            0,
+            "store value is the add result ⇒ no fold: {out:?}"
+        );
+    }
+
     /// #472 local-promotion baseline: a non-param i32 local is frame-spilled
     /// (`sw _,off(sp)` / `lw`), not register-homed. The lever will keep eligible
     /// leaf locals in s-registers (the #390 analogue, carrying the #474 fallback).
diff --git a/scripts/repro/const_addr_fold_riscv_differential.py b/scripts/repro/const_addr_fold_riscv_differential.py
new file mode 100644
index 0000000..8957bc9
--- /dev/null
+++ b/scripts/repro/const_addr_fold_riscv_differential.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+"""VCR-RA RV32 lever (#472 step 2, epic #242) — EXECUTION-validate const-addr-fold.
+
+`SYNTH_RV_ADDR_FOLD=1` folds a constant memory address into the access immediate
+off s11: `addi a,zero,ADDR; add tmp,s11,a; sw v,off(tmp)` -> `sw v,(ADDR+off)(s11)`,
+dropping two instructions per constant-address access. The RV32 path has no cargo
+byte-gate, so EXECUTION is the oracle: this harness compiles the field-initializer
+fixture (7 constant-address stores) in BOTH flag states, runs `init_fields` under
+unicorn (UC_ARCH_RISCV) with s11 = the linear-memory base, and asserts the
+resulting MEMORY is bit-identical to wasmtime ground truth (the fixture writes
+memory and returns nothing, so memory is the observable).
+
+NON-VACUITY: the flag-on `.text` must be strictly smaller (each fold drops the
+`addi`+`add` pair), else the fold did not fire.
+
+Run (needs wasmtime + unicorn + pyelftools):
+  SYNTH=./target/debug/synth python scripts/repro/const_addr_fold_riscv_differential.py
+Exits nonzero on any mismatch or vacuity failure.
+"""
+import os
+import subprocess
+import sys
+
+import wasmtime
+from elftools.elf.elffile import ELFFile
+from unicorn import UC_ARCH_RISCV, UC_MODE_RISCV32, Uc, UcError
+from unicorn.riscv_const import UC_RISCV_REG_RA, UC_RISCV_REG_S11, UC_RISCV_REG_SP
+
+WAT = "scripts/repro/redundant_base_materialization.wat"
+SYNTH = os.environ.get("SYNTH", "./target/release/synth")
+CODE, LIN, STK, RET = 0x100000, 0x40000, 0x90000, 0x200000
+# Fields written by init_fields: (wasm_addr, width_bytes).
+FIELDS = [(0, 4), (4, 4), (8, 4), (12, 2), (14, 2), (16, 1), (17, 1)]
+
+
+def compile_elf(out, fold):
+    """Compile WAT to a relocatable RV32 object at `out`; `fold` sets SYNTH_RV_ADDR_FOLD."""
+    # Fresh minimal environment: the flag is never inherited from the caller.
+    env = {"PATH": "/usr/bin:/bin"}
+    if fold:
+        env["SYNTH_RV_ADDR_FOLD"] = "1"
+    r = subprocess.run(
+        [SYNTH, "compile", WAT, "-o", out, "-b", "riscv", "-t", "rv32imac",
+         "--all-exports", "--relocatable"],
+        capture_output=True, text=True, env=env,
+    )
+    if r.returncode != 0:
+        sys.exit(f"compile failed (fold={fold}): {r.stderr}")
+
+
+def load(elf):
+    """Return (.text bytes, .text sh_addr, {symbol name: st_value}) for `elf`."""
+    # pyelftools reads from the stream on demand, so parse while the file is open.
+    with open(elf, "rb") as fh:
+        f = ELFFile(fh)
+        text = f.get_section_by_name(".text")
+        code, base = text.data(), text["sh_addr"]
+        syms = {}
+        for s in f.iter_sections():
+            if s.header.sh_type == "SHT_SYMTAB":
+                for sym in s.iter_symbols():
+                    if sym.name:
+                        # Raw symbol address; run_rv() rebases it against `base`.
+                        syms[sym.name] = sym["st_value"]
+    return code, base, syms
+
+
+def run_rv(code, base, fa):
+    """Emulate the function at symbol address `fa`; return {wasm_addr: value} per FIELDS.
+
+    `code` is mapped at CODE and stands for ELF address `base`. Returns the string
+    "ERR:<exception>" instead if unicorn raises.
+    """
+    mu = Uc(UC_ARCH_RISCV, UC_MODE_RISCV32)
+    mu.mem_map(CODE, 0x20000)
+    mu.mem_map(LIN, 0x20000)
+    mu.mem_map(STK - 0x8000, 0x10000)
+    mu.mem_map(RET, 0x1000)
+    mu.mem_write(CODE, code)
+    mu.reg_write(UC_RISCV_REG_SP, STK)
+    mu.reg_write(UC_RISCV_REG_S11, LIN)  # s11 = __linear_memory_base
+    mu.reg_write(UC_RISCV_REG_RA, RET)
+    try:
+        mu.emu_start(CODE + (fa & ~1) - base, RET, count=4000)
+    except UcError as e:
+        return f"ERR:{e}"
+    out = {}
+    for (off, w) in FIELDS:
+        out[off] = int.from_bytes(mu.mem_read(LIN + off, w), "little")
+    return out
+
+
+def wasm_mem():
+    """Run `init_fields` under wasmtime; return {wasm_addr: value} for each FIELDS entry."""
+    engine = wasmtime.Engine()
+    with open(WAT, "rb") as fh:
+        module = wasmtime.Module(engine, fh.read())
+    store = wasmtime.Store(engine)
+    inst = wasmtime.Instance(store, module, [])
+    inst.exports(store)["init_fields"](store)
+    data = inst.exports(store)["memory"].read(store, 0, 64)
+    return {off: int.from_bytes(data[off:off + w], "little") for (off, w) in FIELDS}
+
+
+def main():
+    """Compile both flag states, compare emulated memory to wasmtime, check non-vacuity."""
+    if not os.path.exists(SYNTH):
+        sys.exit(f"{SYNTH} not found — build synth first")
+    off_elf, on_elf = "/tmp/caf_off.o", "/tmp/caf_on.o"
+    compile_elf(off_elf, False)
+    compile_elf(on_elf, True)
+    off_code, off_base, off_syms = load(off_elf)
+    on_code, on_base, on_syms = load(on_elf)
+
+    gt = wasm_mem()
+    r_off = run_rv(off_code, off_base, off_syms["init_fields"])
+    r_on = run_rv(on_code, on_base, on_syms["init_fields"])
+    fails = 0
+    ok = isinstance(r_off, dict) and isinstance(r_on, dict) and r_off == gt and r_on == gt
+    fails += 0 if ok else 1
+    print(f"init_fields: off={'ok' if isinstance(r_off, dict) else r_off} "
+          f"on={'ok' if isinstance(r_on, dict) else r_on} vs wasmtime "
+          f"{'MATCH' if ok else 'MISMATCH'}")
+    if not ok:
+        print(f"  off={r_off}\n  on ={r_on}\n  wt ={gt}")
+
+    off_len, on_len = len(off_code), len(on_code)
+    if not on_len < off_len:
+        print(f"VACUOUS: flag-on .text ({on_len}B) not < flag-off ({off_len}B) "
+              "— const-addr fold did not fire")
+        fails += 1
+    else:
+        print(f"\n.text {off_len}B -> {on_len}B (-{off_len - on_len}B): "
+              f"~{(off_len - on_len) // 4} instruction(s) folded across 7 const-addr stores")
+
+    print("ORACLE: PASS" if fails == 0 else f"ORACLE: FAIL ({fails})")
+    sys.exit(1 if fails else 0)
+
+
+if __name__ == "__main__":
+    main()