diff --git a/crates/synth-synthesis/src/optimizer_bridge.rs b/crates/synth-synthesis/src/optimizer_bridge.rs index 863ae39..4e59fe6 100644 --- a/crates/synth-synthesis/src/optimizer_bridge.rs +++ b/crates/synth-synthesis/src/optimizer_bridge.rs @@ -46,6 +46,147 @@ fn fold_mem_offset(base: u32, offset: u32) -> (u32, i32) { } } +/// The linear-memory base the optimized (absolute) path materializes. +const BASE_CSE_LINMEM_BASE: u32 = 0x2000_0100; +/// VCR-RA lever 3 base register: R11 is OUTSIDE the `reallocate_function` pool +/// (R0–R8), so the range-reallocator identity-preserves it — the hoisted base +/// survives across every straight-line segment untouched, letting us materialize +/// it ONCE at entry rather than per-segment. R11 is also outside local +/// promotion's pool (R4–R8) and is not the encoder scratch (R12). The optimized +/// path's only other uses of R11 — synthetic-local-255 (`Select` nesting) and the +/// `(R10,R11)` i64 pair — are excluded by the planner's disqualification set. +const BASE_CSE_REG: crate::rules::Reg = crate::rules::Reg::R11; + +/// VCR-RA lever 3 (#468, epic #242): plan for hoisting the loop-invariant +/// linear-memory base out of a run of constant-address memory accesses. +#[derive(Debug, Default, PartialEq)] +struct BaseCsePlan { + /// Address vreg → folded immediate (`const_addr + access_offset`, ≤ imm12). + /// The access is rewritten to `[R11, #imm]`, dropping its per-access + /// `movw/movt` base + `add`. + fold: std::collections::HashMap, + /// Const vregs whose ONLY use is a folded address — their materialization is + /// dropped (this is what makes the reserved base a net register-pressure win + /// rather than a wash). 
+ skip_const: std::collections::HashSet, +} + +/// Source (read) vregs of an opcode, or `None` if the opcode is outside the +/// base-CSE-safe set — any global/memory-size/select/call/i64/unknown op needs a +/// high register (R9 globals / R10 memsize / R11 select-temp+i64-pair) or a +/// behaviour v1 does not model, so its presence declines base-CSE for the whole +/// function (returns `None` → planner bails → byte-identical per-access path). +/// `_ => None` is the safety backstop: an unenumerated opcode disqualifies rather +/// than risk a missed clobber of the reserved R11 on this un-byte-gated path. +fn base_cse_sources(op: &Opcode) -> Option> { + use Opcode::*; + let two = |a: &OptReg, b: &OptReg| Some(vec![a.0, b.0]); + let one = |a: &OptReg| Some(vec![a.0]); + match op { + // i32 binops: read src1, src2. + Add { src1, src2, .. } + | Sub { src1, src2, .. } + | Mul { src1, src2, .. } + | DivS { src1, src2, .. } + | DivU { src1, src2, .. } + | RemS { src1, src2, .. } + | RemU { src1, src2, .. } + | And { src1, src2, .. } + | Or { src1, src2, .. } + | Xor { src1, src2, .. } + | Shl { src1, src2, .. } + | ShrS { src1, src2, .. } + | ShrU { src1, src2, .. } + | Rotl { src1, src2, .. } + | Rotr { src1, src2, .. } + | Eq { src1, src2, .. } + | Ne { src1, src2, .. } + | LtS { src1, src2, .. } + | LtU { src1, src2, .. } + | LeS { src1, src2, .. } + | LeU { src1, src2, .. } + | GtS { src1, src2, .. } + | GtU { src1, src2, .. } + | GeS { src1, src2, .. } + | GeU { src1, src2, .. } => two(src1, src2), + // i32 unops. + Clz { src, .. } + | Ctz { src, .. } + | Popcnt { src, .. } + | Extend8S { src, .. } + | Extend16S { src, .. } + | Eqz { src, .. } + | Copy { src, .. } + | Store { src, .. } + | TeeStore { src, .. } => one(src), + // Memory accesses: the address vreg is a read (this is the use that the + // planner pairs with a const def); stores additionally read `src`. + MemStore { src, addr, .. } | MemStoreSubword { src, addr, .. 
} => two(src, addr), + MemLoad { addr, .. } | MemLoadSubword { addr, .. } => one(addr), + Return { value } => Some(value.iter().map(|r| r.0).collect()), + // No register reads. `Label` is allowed: a function body carries a trailing + // structural end-label even with no real branching, and a label with + // nothing branching to it does not split control flow. `Branch` / + // `CondBranch` (below) are what indicate a genuine multi-block function. + Const { .. } | Load { .. } | Nop | Label { .. } => Some(vec![]), + // Everything else → disqualify the function. This INCLUDES `Branch` / + // `CondBranch`: v1 confines base-CSE to functions with no control-flow + // divergence — #468's straight-line field-initializer target — keeping it + // clear of the optimized path's (separately-tracked) multi-block lowering. + // R11 is realloc-immune (out of the R0–R8 pool) so a hoisted base WOULD + // survive branches; restricting to single-block is the conservative choice. + // Also covers Select / Global* / MemorySize-Grow / Call / all i64 (high-reg + // users) and any unenumerated opcode (safety backstop). + _ => None, + } +} + +/// Decide whether base-CSE activates for this function and, if so, which const +/// addresses fold. Returns `None` (decline → unchanged per-access codegen) unless +/// ≥2 constant-address accesses fold and every opcode is base-CSE-safe. +fn plan_base_cse(instructions: &[Instruction]) -> Option { + use std::collections::HashMap; + let mut const_val: HashMap = HashMap::new(); + let mut uses: HashMap = HashMap::new(); + // (addr vreg, static access offset) for every linear-memory access. + let mut accesses: Vec<(u32, u32)> = Vec::new(); + for inst in instructions { + match &inst.opcode { + Opcode::Const { dest, value } => { + const_val.insert(dest.0, *value); + } + Opcode::MemStore { addr, offset, .. } + | Opcode::MemLoad { addr, offset, .. } + | Opcode::MemStoreSubword { addr, offset, .. } + | Opcode::MemLoadSubword { addr, offset, .. 
} => { + accesses.push((addr.0, *offset)); + } + _ => {} + } + // A single unenumerated/disqualifying opcode declines the whole function. + let srcs = base_cse_sources(&inst.opcode)?; + for v in srcs { + *uses.entry(v).or_insert(0) += 1; + } + } + let mut plan = BaseCsePlan::default(); + for (addr_vreg, off) in accesses { + // Foldable iff the address is a compile-time constant whose ONLY use is + // this access, and addr+offset (the base itself lives in R11) fits imm12 so the + // access immediate `[R11, #imm]` encodes directly. + if let Some(&aval) = const_val.get(&addr_vreg) + && uses.get(&addr_vreg) == Some(&1) + { + let folded = (aval as i64) + (off as i64); + if (0..=0xFFF).contains(&folded) { + plan.fold.insert(addr_vreg, folded as i32); + plan.skip_const.insert(addr_vreg); + } + } + } + (plan.fold.len() >= 2).then_some(plan) +} + /// Optimization configuration #[derive(Debug, Clone)] pub struct OptimizationConfig { @@ -2315,7 +2456,28 @@ impl OptimizerBridge { // AAPCS arguments that must NOT be clobbered by i64 op handlers — at least // until the user's WASM has done a `local.get` of each. Using Vec because // `Reg` does not derive Hash (matches `instruction_selector::alloc_consecutive_pair`). - let param_reserved_regs: Vec = param_regs[..num_params.min(4)].to_vec(); + let mut param_reserved_regs: Vec = param_regs[..num_params.min(4)].to_vec(); + + // VCR-RA lever 3 base-CSE (#468, epic #242): if the function is a run of + // constant-address memory accesses, reserve R11 as a persistent base + // register (excluded from every allocator via `param_reserved_regs`), + // materialize the linear-memory base into it ONCE at entry, and fold each + // const address into the access immediate (`str V,[R11,#ADDR]`) — dropping + // the per-access `movw/movt` base re-materialization (#468's complaint) + // AND the now-dead address materialization (the pressure relief that keeps + // the reserved base a net win). 
R11 is realloc-immune (outside the R0–R8 + // pool), so the single entry materialization survives every segment. + // Opt-in via `SYNTH_BASE_CSE` (any Unicode value, even `0`: the gate is `is_ok()`); + // unset ⇒ byte-identical. The optimized path is the ONLY caller of `ir_to_arm`, + // so this never reaches the relocatable lowering (which already pins the base in `fp`). + let base_cse: Option = if std::env::var("SYNTH_BASE_CSE").is_ok() { + plan_base_cse(instructions) + } else { + None + }; + if base_cse.is_some() { + param_reserved_regs.push(BASE_CSE_REG); + } // Track which ARM register currently holds each local variable // This avoids stack spills for simple cases @@ -2617,6 +2779,22 @@ impl OptimizerBridge { } } + // VCR-RA lever 3 base-CSE: materialize the linear-memory base into the + // reserved R11 ONCE, before any access. Placed before the second pass so + // it precedes every folded `[R11,#ADDR]` and so `ir_to_arm_idx` (recorded + // during the loop) accounts for these two leading instructions. R11 is + // realloc-immune, so this single def reaches every later use unremapped. + if base_cse.is_some() { + arm_instrs.push(ArmOp::Movw { + rd: BASE_CSE_REG, + imm16: (BASE_CSE_LINMEM_BASE & 0xFFFF) as u16, + }); + arm_instrs.push(ArmOp::Movt { + rd: BASE_CSE_REG, + imm16: ((BASE_CSE_LINMEM_BASE >> 16) & 0xFFFF) as u16, + }); + } + // Second pass: generate ARM instructions for inst in instructions { match &inst.opcode { @@ -2698,17 +2876,29 @@ impl OptimizerBridge { // Constant: mov immediate to register Opcode::Const { dest, value } => { + // VCR-RA lever 3 base-CSE: this const is a folded address — its + // ONLY use is a `[R11,#ADDR]` access (planner-verified single + // use), so do not materialize it at all. Dropping it is the + // register-pressure relief that makes the reserved base a win. 
+ if let Some(plan) = &base_cse + && plan.skip_const.contains(&dest.0) + { + continue; + } // Allocate a register for this constant let rd = if let Some(&r) = vreg_to_arm.get(&dest.0) { r } else { // Find next available temp register - // Exclude live vregs (not dead) and local_to_reg to avoid clobbering + // Exclude live vregs (not dead) and local_to_reg to avoid clobbering. + // Base-CSE reserves R11 (in `param_reserved_regs`); fold it in + // so the const pool never hands out the live base register. let used: Vec<_> = vreg_to_arm .iter() .filter(|(k, _)| !dead_vregs.contains(k)) .map(|(_, v)| *v) .chain(local_to_reg.values().copied()) + .chain(param_reserved_regs.iter().copied()) .collect(); // Expanded temp register pool: R4-R11 (callee-saved) plus R3 // Note: R0-R2 are reserved for params/return, R12 is IP, R13 is SP, R14 is LR, R15 is PC @@ -4636,78 +4826,111 @@ impl OptimizerBridge { // argument on every `i32.load`. Use the scratch helper so // the destination is picked from the callee-saved bank. Opcode::MemLoad { dest, addr, offset } => { - let r_addr = get_arm_reg(addr, &vreg_to_arm, &spilled_vregs)?; - let rd = alloc_i32_scratch( - &vreg_to_arm, - &local_to_reg, - ¶m_reserved_regs, - &[r_addr], - ); - vreg_to_arm.insert(dest.0, rd); + // VCR-RA lever 3 base-CSE: const address → load directly off the + // once-materialized base in R11 (no per-access base / add / addr). 
+ if let Some(plan) = &base_cse + && let Some(&folded) = plan.fold.get(&addr.0) + { + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[], + ); + vreg_to_arm.insert(dest.0, rd); + arm_instrs.push(ArmOp::Ldr { + rd, + addr: crate::rules::MemAddr::imm(BASE_CSE_REG, folded), + }); + last_result_vreg = Some(dest.0); + } else { + let r_addr = get_arm_reg(addr, &vreg_to_arm, &spilled_vregs)?; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[r_addr], + ); + vreg_to_arm.insert(dest.0, rd); - // Linear memory base 0x20000100 (SRAM, above stack area). - // #382: fold a large static offset (> imm12) into the - // compile-time-constant base so the access immediate is 0. - let (base, mem_off) = fold_mem_offset(0x20000100, *offset); - let base_lo = (base & 0xFFFF) as u16; - let base_hi = ((base >> 16) & 0xFFFF) as u16; + // Linear memory base 0x20000100 (SRAM, above stack area). + // #382: fold a large static offset (> imm12) into the + // compile-time-constant base so the access immediate is 0. 
+ let (base, mem_off) = fold_mem_offset(0x20000100, *offset); + let base_lo = (base & 0xFFFF) as u16; + let base_hi = ((base >> 16) & 0xFFFF) as u16; - // Load base address into R12 (scratch register) - arm_instrs.push(ArmOp::Movw { - rd: Reg::R12, - imm16: base_lo, - }); - arm_instrs.push(ArmOp::Movt { - rd: Reg::R12, - imm16: base_hi, - }); - // Add WASM address offset - arm_instrs.push(ArmOp::Add { - rd: Reg::R12, - rn: Reg::R12, - op2: Operand2::Reg(r_addr), - }); - // Load from [base + wasm_addr + static_offset] - arm_instrs.push(ArmOp::Ldr { - rd, - addr: crate::rules::MemAddr::imm(Reg::R12, mem_off), - }); - last_result_vreg = Some(dest.0); + // Load base address into R12 (scratch register) + arm_instrs.push(ArmOp::Movw { + rd: Reg::R12, + imm16: base_lo, + }); + arm_instrs.push(ArmOp::Movt { + rd: Reg::R12, + imm16: base_hi, + }); + // Add WASM address offset + arm_instrs.push(ArmOp::Add { + rd: Reg::R12, + rn: Reg::R12, + op2: Operand2::Reg(r_addr), + }); + // Load from [base + wasm_addr + static_offset] + arm_instrs.push(ArmOp::Ldr { + rd, + addr: crate::rules::MemAddr::imm(Reg::R12, mem_off), + }); + last_result_vreg = Some(dest.0); + } } // MemStore: store 32-bit value to linear memory // Generates: MOVW R12, #base_lo; MOVT R12, #base_hi; ADD R12, R12, Raddr; STR Rsrc, [R12, #offset] Opcode::MemStore { src, addr, offset } => { - let r_addr = get_arm_reg(addr, &vreg_to_arm, &spilled_vregs)?; - let r_src = get_arm_reg(src, &vreg_to_arm, &spilled_vregs)?; + // VCR-RA lever 3 base-CSE: the address is a folded compile-time + // constant — store directly off the once-materialized base in + // R11, dropping the per-access `movw/movt` + `add` and the + // address materialization (skipped at its `Const`). 
+ if let Some(plan) = &base_cse + && let Some(&folded) = plan.fold.get(&addr.0) + { + let r_src = get_arm_reg(src, &vreg_to_arm, &spilled_vregs)?; + arm_instrs.push(ArmOp::Str { + rd: r_src, + addr: crate::rules::MemAddr::imm(BASE_CSE_REG, folded), + }); + } else { + let r_addr = get_arm_reg(addr, &vreg_to_arm, &spilled_vregs)?; + let r_src = get_arm_reg(src, &vreg_to_arm, &spilled_vregs)?; - // Linear memory base 0x20000100 (SRAM, above stack area). - // #382: fold a large static offset (> imm12) into the - // compile-time-constant base so the access immediate is 0. - let (base, mem_off) = fold_mem_offset(0x20000100, *offset); - let base_lo = (base & 0xFFFF) as u16; - let base_hi = ((base >> 16) & 0xFFFF) as u16; + // Linear memory base 0x20000100 (SRAM, above stack area). + // #382: fold a large static offset (> imm12) into the + // compile-time-constant base so the access immediate is 0. + let (base, mem_off) = fold_mem_offset(0x20000100, *offset); + let base_lo = (base & 0xFFFF) as u16; + let base_hi = ((base >> 16) & 0xFFFF) as u16; - // Load base address into R12 (scratch register) - arm_instrs.push(ArmOp::Movw { - rd: Reg::R12, - imm16: base_lo, - }); - arm_instrs.push(ArmOp::Movt { - rd: Reg::R12, - imm16: base_hi, - }); - // Add WASM address offset - arm_instrs.push(ArmOp::Add { - rd: Reg::R12, - rn: Reg::R12, - op2: Operand2::Reg(r_addr), - }); - // Store to [base + wasm_addr + static_offset] - arm_instrs.push(ArmOp::Str { - rd: r_src, - addr: crate::rules::MemAddr::imm(Reg::R12, mem_off), - }); + // Load base address into R12 (scratch register) + arm_instrs.push(ArmOp::Movw { + rd: Reg::R12, + imm16: base_lo, + }); + arm_instrs.push(ArmOp::Movt { + rd: Reg::R12, + imm16: base_hi, + }); + // Add WASM address offset + arm_instrs.push(ArmOp::Add { + rd: Reg::R12, + rn: Reg::R12, + op2: Operand2::Reg(r_addr), + }); + // Store to [base + wasm_addr + static_offset] + arm_instrs.push(ArmOp::Str { + rd: r_src, + addr: crate::rules::MemAddr::imm(Reg::R12, 
mem_off), + }); + } // MemStore does not produce a value } @@ -4726,34 +4949,48 @@ impl OptimizerBridge { width, signed, } => { - let r_addr = get_arm_reg(addr, &vreg_to_arm, &spilled_vregs)?; - let rd = alloc_i32_scratch( - &vreg_to_arm, - &local_to_reg, - ¶m_reserved_regs, - &[r_addr], - ); - vreg_to_arm.insert(dest.0, rd); + // VCR-RA lever 3 base-CSE: const address → subword load off R11. + let (rd, addr_mem) = if let Some(plan) = &base_cse + && let Some(&folded) = plan.fold.get(&addr.0) + { + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[], + ); + vreg_to_arm.insert(dest.0, rd); + (rd, crate::rules::MemAddr::imm(BASE_CSE_REG, folded)) + } else { + let r_addr = get_arm_reg(addr, &vreg_to_arm, &spilled_vregs)?; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[r_addr], + ); + vreg_to_arm.insert(dest.0, rd); - // #382: fold a large static offset (> imm12) into the - // compile-time-constant base so the access immediate is 0. - let (base, mem_off) = fold_mem_offset(0x20000100, *offset); - let base_lo = (base & 0xFFFF) as u16; - let base_hi = ((base >> 16) & 0xFFFF) as u16; - arm_instrs.push(ArmOp::Movw { - rd: Reg::R12, - imm16: base_lo, - }); - arm_instrs.push(ArmOp::Movt { - rd: Reg::R12, - imm16: base_hi, - }); - arm_instrs.push(ArmOp::Add { - rd: Reg::R12, - rn: Reg::R12, - op2: Operand2::Reg(r_addr), - }); - let addr_mem = crate::rules::MemAddr::imm(Reg::R12, mem_off); + // #382: fold a large static offset (> imm12) into the + // compile-time-constant base so the access immediate is 0. 
+ let (base, mem_off) = fold_mem_offset(0x20000100, *offset); + let base_lo = (base & 0xFFFF) as u16; + let base_hi = ((base >> 16) & 0xFFFF) as u16; + arm_instrs.push(ArmOp::Movw { + rd: Reg::R12, + imm16: base_lo, + }); + arm_instrs.push(ArmOp::Movt { + rd: Reg::R12, + imm16: base_hi, + }); + arm_instrs.push(ArmOp::Add { + rd: Reg::R12, + rn: Reg::R12, + op2: Operand2::Reg(r_addr), + }); + (rd, crate::rules::MemAddr::imm(Reg::R12, mem_off)) + }; let sub_op = match (*width, *signed) { (1, false) => ArmOp::Ldrb { rd, addr: addr_mem }, (1, true) => ArmOp::Ldrsb { rd, addr: addr_mem }, @@ -4777,28 +5014,36 @@ impl OptimizerBridge { offset, width, } => { - let r_addr = get_arm_reg(addr, &vreg_to_arm, &spilled_vregs)?; - let r_src = get_arm_reg(src, &vreg_to_arm, &spilled_vregs)?; - - // #382: fold a large static offset (> imm12) into the - // compile-time-constant base so the access immediate is 0. - let (base, mem_off) = fold_mem_offset(0x20000100, *offset); - let base_lo = (base & 0xFFFF) as u16; - let base_hi = ((base >> 16) & 0xFFFF) as u16; - arm_instrs.push(ArmOp::Movw { - rd: Reg::R12, - imm16: base_lo, - }); - arm_instrs.push(ArmOp::Movt { - rd: Reg::R12, - imm16: base_hi, - }); - arm_instrs.push(ArmOp::Add { - rd: Reg::R12, - rn: Reg::R12, - op2: Operand2::Reg(r_addr), - }); - let addr_mem = crate::rules::MemAddr::imm(Reg::R12, mem_off); + // VCR-RA lever 3 base-CSE: const address → subword store off R11. + let (r_src, addr_mem) = if let Some(plan) = &base_cse + && let Some(&folded) = plan.fold.get(&addr.0) + { + let r_src = get_arm_reg(src, &vreg_to_arm, &spilled_vregs)?; + (r_src, crate::rules::MemAddr::imm(BASE_CSE_REG, folded)) + } else { + let r_addr = get_arm_reg(addr, &vreg_to_arm, &spilled_vregs)?; + let r_src = get_arm_reg(src, &vreg_to_arm, &spilled_vregs)?; + + // #382: fold a large static offset (> imm12) into the + // compile-time-constant base so the access immediate is 0. 
+ let (base, mem_off) = fold_mem_offset(0x20000100, *offset); + let base_lo = (base & 0xFFFF) as u16; + let base_hi = ((base >> 16) & 0xFFFF) as u16; + arm_instrs.push(ArmOp::Movw { + rd: Reg::R12, + imm16: base_lo, + }); + arm_instrs.push(ArmOp::Movt { + rd: Reg::R12, + imm16: base_hi, + }); + arm_instrs.push(ArmOp::Add { + rd: Reg::R12, + rn: Reg::R12, + op2: Operand2::Reg(r_addr), + }); + (r_src, crate::rules::MemAddr::imm(Reg::R12, mem_off)) + }; let sub_op = match *width { 1 => ArmOp::Strb { rd: r_src, @@ -5285,6 +5530,174 @@ impl Default for OptimizerBridge { mod tests { use super::*; + // ---- base-CSE planner (VCR-RA lever 3, #468) ---- + + fn inst(op: Opcode) -> Instruction { + Instruction { + id: 0, + opcode: op, + block_id: 0, + is_dead: false, + } + } + fn vr(n: u32) -> OptReg { + OptReg(n) + } + /// `[Const addr, Const val, MemStore]` triples for `(addr, val)` pairs, using + /// fresh vregs per pair (single-use addresses, the foldable shape). + fn const_addr_stores(pairs: &[(i32, i32)]) -> Vec { + let mut out = Vec::new(); + for (i, (addr, val)) in pairs.iter().enumerate() { + let av = (i as u32) * 2; + let vv = av + 1; + out.push(inst(Opcode::Const { + dest: vr(av), + value: *addr, + })); + out.push(inst(Opcode::Const { + dest: vr(vv), + value: *val, + })); + out.push(inst(Opcode::MemStore { + src: vr(vv), + addr: vr(av), + offset: 0, + })); + } + out + } + + #[test] + fn plan_base_cse_folds_two_or_more_const_addr_stores() { + let ir = const_addr_stores(&[(0, 11), (4, 22), (8, 33)]); + let plan = plan_base_cse(&ir).expect("activates with 3 foldable stores"); + assert_eq!(plan.fold.len(), 3); + assert_eq!(plan.skip_const.len(), 3); + // addr vreg 0 → folded immediate 0; vreg 2 → 4; vreg 4 → 8. 
+ assert_eq!(plan.fold.get(&0), Some(&0)); + assert_eq!(plan.fold.get(&2), Some(&4)); + assert_eq!(plan.fold.get(&4), Some(&8)); + } + + #[test] + fn plan_base_cse_declines_below_two_folds() { + let ir = const_addr_stores(&[(0, 11)]); + assert_eq!(plan_base_cse(&ir), None); + } + + #[test] + fn plan_base_cse_declines_on_disqualifying_op() { + // A Select anywhere needs R11 for the synthetic-select temp → decline. + let mut ir = const_addr_stores(&[(0, 11), (4, 22)]); + ir.push(inst(Opcode::Select { + dest: vr(100), + val_true: vr(101), + val_false: vr(102), + cond: vr(103), + })); + assert_eq!(plan_base_cse(&ir), None); + } + + #[test] + fn plan_base_cse_declines_on_control_flow() { + // A `CondBranch` (br_if) makes the function multi-block — outside v1 scope + // (and clear of the optimized path's separately-tracked multi-block bug). + let mut ir = const_addr_stores(&[(0, 11), (4, 22)]); + ir.push(inst(Opcode::CondBranch { + cond: vr(100), + target: 0, + })); + assert_eq!(plan_base_cse(&ir), None); + } + + #[test] + fn plan_base_cse_allows_trailing_structural_label() { + // A bare `Label` (the function-end marker with nothing branching to it) + // does NOT split control flow, so base-CSE still activates. + let mut ir = const_addr_stores(&[(0, 11), (4, 22)]); + ir.push(inst(Opcode::Label { id: 99 })); + let plan = plan_base_cse(&ir).expect("activates despite a structural label"); + assert_eq!(plan.fold.len(), 2); + } + + #[test] + fn plan_base_cse_declines_imm12_overflow_addr() { + // 0x1000 + 0 exceeds the imm12 window → that access does not fold; with + // only one other foldable store the function falls below threshold. + let ir = const_addr_stores(&[(0x1000, 11), (4, 22)]); + let plan = plan_base_cse(&ir); + // Only the (4,22) store folds → 1 fold → below the ≥2 threshold → None. + assert_eq!(plan, None); + } + + #[test] + fn plan_base_cse_declines_multi_use_addr() { + // An address vreg used by TWO stores is not single-use → not folded. 
+ // (Both stores reuse addr vreg 0; the second's value is vreg 2.) + let ir = vec![ + inst(Opcode::Const { + dest: vr(0), + value: 4, + }), + inst(Opcode::Const { + dest: vr(1), + value: 11, + }), + inst(Opcode::MemStore { + src: vr(1), + addr: vr(0), + offset: 0, + }), + inst(Opcode::Const { + dest: vr(2), + value: 22, + }), + inst(Opcode::MemStore { + src: vr(2), + addr: vr(0), + offset: 0, + }), + ]; + // addr vreg 0 has use_count 2 → neither store folds → None. + assert_eq!(plan_base_cse(&ir), None); + } + + #[test] + fn plan_base_cse_folds_static_offset_into_immediate() { + // A non-zero static access offset folds into the immediate (ADDR + off). + let ir = vec![ + inst(Opcode::Const { + dest: vr(0), + value: 0, + }), + inst(Opcode::Const { + dest: vr(1), + value: 11, + }), + inst(Opcode::MemStore { + src: vr(1), + addr: vr(0), + offset: 16, + }), + inst(Opcode::Const { + dest: vr(2), + value: 0, + }), + inst(Opcode::Const { + dest: vr(3), + value: 22, + }), + inst(Opcode::MemStore { + src: vr(3), + addr: vr(2), + offset: 32, + }), + ]; + let plan = plan_base_cse(&ir).expect("activates"); + assert_eq!(plan.fold.get(&0), Some(&16)); // 0 + 16 + assert_eq!(plan.fold.get(&2), Some(&32)); // 0 + 32 + } + #[test] fn test_optimizer_bridge_basic() { let bridge = OptimizerBridge::new(); diff --git a/scripts/repro/base_cse_branch.wat b/scripts/repro/base_cse_branch.wat new file mode 100644 index 0000000..dd871e8 --- /dev/null +++ b/scripts/repro/base_cse_branch.wat @@ -0,0 +1,22 @@ +;; VCR-RA lever 3 base-CSE (#468, epic #242) — CONTROL-FLOW non-vacuity fixture. +;; +;; The base-CSE hoist materializes the linear-memory base into R11 once at entry. +;; R11 is OUTSIDE the range-reallocator's R0–R8 pool, so it is identity-preserved +;; across every straight-line segment — the single entry materialization must stay +;; valid across a branch. 
This fixture splits the constant-address stores with a +;; `br_if`. NOTE: the v1 planner declines any function containing a branch, so the +;; differential does NOT execute this fixture with a hoisted base; it only asserts +;; that flag-on `.text` is byte-identical to flag-off (base-CSE correctly declined). +;; +;; init_branch(sel): always stores fields 0,4; if sel!=0 also stores fields 8,12. +;; Generic — neutral addresses/values, tied to nothing real. +(module + (memory 1) + (export "memory" (memory 0)) + (func (export "init_branch") (param $sel i32) + (i32.store (i32.const 0) (i32.const 11)) + (i32.store (i32.const 4) (i32.const 22)) + (block $skip + (br_if $skip (i32.eqz (local.get $sel))) + (i32.store (i32.const 8) (i32.const 33)) + (i32.store16 (i32.const 12) (i32.const 44))))) diff --git a/scripts/repro/base_cse_differential.py b/scripts/repro/base_cse_differential.py new file mode 100644 index 0000000..eead0bd --- /dev/null +++ b/scripts/repro/base_cse_differential.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python3 +"""VCR-RA lever 3 / VCR-ORACLE-001 (#468, #242) — EXECUTION-validate base-CSE. + +base-CSE (SYNTH_BASE_CSE=1) hoists the linear-memory base into R11 once at entry +and folds each constant store address into the access immediate (`str V,[R11,#ADDR]`), +dropping the per-access `movw/movt` base re-materialization and the address +materialization. The optimized (non-relocatable) path it changes has NO frozen +cargo byte-gate, so EXECUTION is the correctness oracle: this harness runs the +init_fields fixture under unicorn, flag-off and flag-on, and asserts the resulting +LINEAR MEMORY is bit-identical to wasmtime ground truth. (These fixtures write +memory and return nothing, so memory — not a return register — is the observable.) + +Two fixtures: + * init_fields — 7 consecutive const-address stores (the straight-line #468 case; + the big byte win, and the non-vacuity case for "fold fired"). + * init_branch — const-address stores split by a `br_if`. 
The v1 planner declines + any function with control flow, so this is the DECLINE case: + flag-on `.text` must be byte-identical to flag-off. The fixture + is compiled in both flag states but never executed. + +NON-VACUITY: aborts unless the flag-on init_fields build is strictly smaller than +flag-off (the base was actually hoisted). + +Run (needs wasmtime + unicorn + pyelftools): + python scripts/repro/base_cse_differential.py +Exits nonzero on any mismatch or vacuity failure. +""" +import subprocess +import sys + +import wasmtime +from elftools.elf.elffile import ELFFile +from unicorn import UC_ARCH_ARM, UC_MODE_THUMB, Uc, UcError +from unicorn.arm_const import UC_ARM_REG_LR, UC_ARM_REG_R0, UC_ARM_REG_SP + +SYNTH = "./target/release/synth" +# The optimized path materializes this absolute linear-memory base. +LINMEM = 0x20000100 +CODE, STK, RET = 0x200000, 0x90000, 0x300000 +# Fields written by each fixture: (wasm_addr, width_bytes). +FIELDS = [(0, 4), (4, 4), (8, 4), (12, 2), (14, 2), (16, 1), (17, 1)] +BRANCH_FIELDS = [(0, 4), (4, 4), (8, 4), (12, 2)] + + +def compile_elf(wat, out, base_cse): + env = {"PATH": "/usr/bin:/bin"} + if base_cse: + env["SYNTH_BASE_CSE"] = "1" + r = subprocess.run( + [SYNTH, "compile", wat, "-o", out, "-b", "arm", "--target", "cortex-m4", + "--all-exports"], + capture_output=True, text=True, env={**env}, + ) + if r.returncode != 0: + sys.exit(f"compile failed ({wat}, base_cse={base_cse}): {r.stderr}") + + +def load(elf, func): + """Return (code_bytes, text_sh_addr, func_symbol_st_value); caller subtracts sh_addr.""" + f = ELFFile(open(elf, "rb")) + text = f.get_section_by_name(".text") + code, base = text.data(), text["sh_addr"] + if not code: + sys.exit(f"{elf}: .text empty") + # Find the function's address via the symbol table. synth emits the symtab + # with an empty section NAME, so look it up by section TYPE, not by ".symtab". 
+ fa = None + for s in f.iter_sections(): + if s.header.sh_type == "SHT_SYMTAB": + for sym in s.iter_symbols(): + if sym.name == func: + fa = sym["st_value"] + break + if fa is None: + sys.exit(f"{elf}: symbol {func} not found") + return code, base, fa + + +def run_arm(elf, func, params): + code, base, fa = load(elf, func) + mu = Uc(UC_ARCH_ARM, UC_MODE_THUMB) + mu.mem_map(CODE, 0x10000) + mu.mem_map(LINMEM & ~0xFFFF, 0x20000) # covers LINMEM + field range + mu.mem_map(STK - 0x8000, 0x10000) + mu.mem_map(RET, 0x1000) + mu.mem_write(CODE, code) + mu.reg_write(UC_ARM_REG_SP, STK) + mu.reg_write(UC_ARM_REG_LR, RET | 1) + for i, p in enumerate(params): + mu.reg_write(UC_ARM_REG_R0 + i, p & 0xFFFFFFFF) + try: + mu.emu_start((CODE + fa - base) | 1, RET, count=100000) + except UcError as e: + return f"ERR:{e}" + out = {} + for (off, w) in (FIELDS if func == "init_fields" else BRANCH_FIELDS): + out[off] = int.from_bytes(mu.mem_read(LINMEM + off, w), "little") + return out + + +def wasm_mem(wat, func, params): + engine = wasmtime.Engine() + module = wasmtime.Module(engine, open(wat, "rb").read()) + store = wasmtime.Store(engine) + inst = wasmtime.Instance(store, module, []) + inst.exports(store)[func](store, *params) + mem = inst.exports(store)["memory"] + data = mem.read(store, 0, 64) + out = {} + for (off, w) in (FIELDS if func == "init_fields" else BRANCH_FIELDS): + out[off] = int.from_bytes(data[off:off + w], "little") + return out + + +def text_len(elf): + return len(ELFFile(open(elf, "rb")).get_section_by_name(".text").data()) + + +def check(label, wat, func, params, fails): + off_elf, on_elf = f"/tmp/bcse_{func}_off.elf", f"/tmp/bcse_{func}_on.elf" + gt = wasm_mem(wat, func, params) + r_off = run_arm(off_elf, func, params) + r_on = run_arm(on_elf, func, params) + ok = isinstance(r_off, dict) and isinstance(r_on, dict) and r_off == gt and r_on == gt + fails[0] += 0 if ok else 1 + flag = "" if ok else " <-- MISMATCH" + print(f"{label} {func}{tuple(params)}: off={'ERR' 
if not isinstance(r_off,dict) else 'ok'} " + f"on={'ERR' if not isinstance(r_on,dict) else 'ok'} vs wasmtime{flag}") + if not ok: + print(f" off={r_off}\n on ={r_on}\n wt ={gt}") + + +def main(): + # Compile both fixtures, both flag states. + compile_elf("scripts/repro/redundant_base_materialization.wat", + "/tmp/bcse_init_fields_off.elf", False) + compile_elf("scripts/repro/redundant_base_materialization.wat", + "/tmp/bcse_init_fields_on.elf", True) + compile_elf("scripts/repro/base_cse_branch.wat", + "/tmp/bcse_init_branch_off.elf", False) + compile_elf("scripts/repro/base_cse_branch.wat", + "/tmp/bcse_init_branch_on.elf", True) + + # Non-vacuity: the hoist must actually shrink the straight-line fixture. + off_len = text_len("/tmp/bcse_init_fields_off.elf") + on_len = text_len("/tmp/bcse_init_fields_on.elf") + if not on_len < off_len: + sys.exit(f"VACUOUS: flag-on init_fields .text ({on_len}B) not < flag-off " + f"({off_len}B) — base-CSE did not fire") + + fails = [0] + # ACTIVE case: a straight-line const-address-store function. base-CSE fires; + # assert flag-off == flag-on == wasmtime memory. + check("[straight]", "scripts/repro/redundant_base_materialization.wat", + "init_fields", [], fails) + + # DECLINE case: the optimized path's multi-block lowering is outside base-CSE's + # v1 scope, so the planner DISQUALIFIES any function with control flow. The + # correct, sufficient oracle is therefore that base-CSE is a NO-OP here: + # flag-on .text must be byte-identical to flag-off (base-CSE declined). This is + # what keeps the lever clear of the (separately-tracked) optimized-path + # multi-block bug — we don't execute it, we prove we never touched it. 
+ with open("/tmp/bcse_init_branch_off.elf", "rb") as a: + off_text = ELFFile(a).get_section_by_name(".text").data() + with open("/tmp/bcse_init_branch_on.elf", "rb") as b: + on_text = ELFFile(b).get_section_by_name(".text").data() + if off_text == on_text: + print("[branch decline] init_branch: flag-on .text byte-identical to flag-off " + "(base-CSE correctly declined on control flow)") + else: + print("[branch decline] init_branch: FLAG-ON .text DIFFERS from flag-off " + "<-- base-CSE must NOT activate on control-flow functions") + fails[0] += 1 + + print(f"\ninit_fields .text {off_len}B -> {on_len}B (-{off_len - on_len}B, " + f"{100*(off_len-on_len)//off_len}%); base hoisted + addresses folded") + print("ORACLE: PASS" if fails[0] == 0 else f"ORACLE: FAIL ({fails[0]})") + sys.exit(1 if fails[0] else 0) + + +if __name__ == "__main__": + main() diff --git a/scripts/repro/redundant_base_materialization.md b/scripts/repro/redundant_base_materialization.md index d07e36f..a416338 100644 --- a/scripts/repro/redundant_base_materialization.md +++ b/scripts/repro/redundant_base_materialization.md @@ -1,10 +1,9 @@ # #468 scoping spike — redundant linear-memory-base materialization **Issue:** synth#468 · **Epic:** #242 (VCR-*) · north-star #390 -**Status:** SCOPING SPIKE (no codegen change — frozen-safe by construction). -The byte-changing CSE is the explicitly-separate next gated step (flag-off → -on-target cycle gate → default-on flip), exactly like the cmp→select / -local-promotion / immediate-shift levers. +**Status:** IMPLEMENTED flag-off (`SYNTH_BASE_CSE`) — see "Implemented" below. +Default-on flip held for the on-target cycle gate, exactly like the cmp→select / +local-promotion / immediate-shift / dead-frame levers. ## The pattern @@ -90,14 +89,52 @@ only result checks are the out-of-CI unicorn differential + the on-target gate. relocatable lowering), not a free property — and the optimized-path result evidence is the differential, not a cargo test. 
-## Next gated step (separate PR) - -1. Flag-off CSE in `optimizer_bridge.rs` (`SYNTH_BASE_CSE`, default off ⇒ - bit-identical optimized path) — reserve a callee-saved base reg, hoist - `movw/movt` once per const-base store-run, rewrite stores to `[base,#off]`. - Soundness: only runs of stores whose base is the same compile-time constant - with no intervening base clobber; base reg dead-after / restored in epilogue. -2. Differential (unicorn): optimized-path ELF, flag-off == flag-on == wasmtime; - the 7-store fixture is the non-vacuity case. -3. On-target cycle gate (same protocol as the prior levers), then default-on flip - + re-freeze any optimized-path goldens it touches. +## Implemented — `SYNTH_BASE_CSE` (flag-off) + +The scoping spike above anticipated "reserve a callee-saved base reg, hoist +`movw/movt` once per straight-line run." The implementation found a cleaner, +stronger invariant and is **simpler** than the per-run plan: + +* **R11 is realloc-immune.** `reallocate_function`'s pool is `R0–R8`; it + identity-preserves everything outside it. So the base lives in **R11**, + materialized **once at function entry**, and survives every later segment + untouched — no per-run re-materialization, no cross-segment remap hazard. R11 + is also outside local promotion's `R4–R8` pool and is not the encoder scratch + (R12). It is reserved from every optimized-path allocator via + `param_reserved_regs` + the const pool. +* **Const addresses fold into the access immediate.** `i32.store (i32.const ADDR) + V` → `str V,[R11,#ADDR+off]` (planner-verified single-use const address, + `ADDR+off ≤ imm12`), dropping the per-access `movw/movt` base **and** the now- + dead address materialization — the latter is the register-pressure relief that + makes the reserved base a net win, not a wash. +* **A standalone planner** (`plan_base_cse`, unit-tested) decides activation: + ≥2 foldable const-address accesses AND every opcode in the base-CSE-safe set. 
+ Any `Branch`/`CondBranch` (multi-block), `Select`, `Global*`, `MemorySize/Grow`, + `Call`, i64, or unenumerated op declines the whole function (`None` → unchanged + per-access codegen). v1 is therefore confined to single-basic-block field + initializers — #468's exact target — keeping it clear of the optimized path's + separately-tracked multi-block lowering. + +Result on `init_fields`: **.text 336 B → 218 B (−118 B, −35 %)**, base +materialized once, all 7 addresses folded, matching the relocatable path's +`str [fp,#off]` shape. + +### Oracle (this path has NO cargo byte-gate — frozen gate compiles `--relocatable`) + +1. **Flag-off bit-identical** — verified by an explicit `.text` diff of a fixture + corpus against a pre-change baseline binary (4/4 identical), plus the full + optimized-path test suite (`wast_compile` et al.) green. base-CSE is `None` + when the flag is unset, so off ⇒ byte-identical by construction. +2. **Differential** (`base_cse_differential.py`, unicorn): `init_fields` + flag-off == flag-on == wasmtime by comparing **linear memory** (the fixture + returns nothing); `init_branch` asserts flag-on `.text` byte-identical to + flag-off (base-CSE correctly **declines** on control flow). +3. **On-target cycle gate** (same protocol as the prior levers), then default-on + flip via a `SYNTH_NO_BASE_CSE` opt-out — **held for silicon**. + +### Follow-ups +* Multi-block support (needs the optimized path's `block`/`br_if` lowering fixed + first — `init_branch` flag-off already miscompiles, independent of base-CSE; a + separate optimized-path control-flow bug worth its own issue). +* Dynamic (non-const) addresses in an active function could still source the base + from R11 (`add R12,R11,r_addr`) instead of re-materializing — deferred. 
diff --git a/scripts/repro/redundant_base_materialization.wat b/scripts/repro/redundant_base_materialization.wat index d09bd2e..0c579e5 100644 --- a/scripts/repro/redundant_base_materialization.wat +++ b/scripts/repro/redundant_base_materialization.wat @@ -12,6 +12,7 @@ ;; Generic addresses/values — exhibits the pattern, tied to nothing real. (module (memory 1) + (export "memory" (memory 0)) (func (export "init_fields") (i32.store (i32.const 0) (i32.const 11)) (i32.store (i32.const 4) (i32.const 22))